# Task 3

# Imports

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
plt.style.use('ggplot')
plt.rcParams.update({'font.size': 16, 'axes.labelweight': 'bold', 'figure.figsize': (8,6)})

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-zv8mqxbr because the default path (/home/jupyter-student85/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


## Part 1:

Recall the final goal of this project: we want to build and deploy ensemble machine learning models in the cloud, where the features are the outputs of different climate models and the target is the actual rainfall observation. In this milestone, you'll actually build these ensemble machine learning models in the cloud.  

**Your tasks:**

1. Read the data CSV from your s3 bucket. 
2. Drop rows with nans. 
3. Split the data into train (80%) and test (20%) portions with `random_state=123`. 
4. Carry out EDA of your choice on the train split. 
5. Train an ensemble machine learning model using `RandomForestRegressor` and evaluate it with a metric of your choice (e.g., `RMSE`), using `Observed` as the target column. 
6. Discuss your results. Are you getting better results with ensemble models compared to the individual climate models? 

> Recall that individual columns in the data are predictions of different climate models. 

In [2]:
# Step 1: Read the data CSV from our s3 bucket
# Credentials are taken from the standard AWS environment variables rather than
# being hardcoded in the notebook. A variable that is not set is passed as None.
# NOTE(review): with None values s3fs is expected to fall back to its default
# credential lookup (e.g. ~/.aws/credentials or an instance role) -- confirm.
aws_credentials = {
    "key": os.environ.get("AWS_ACCESS_KEY_ID"),
    "secret": os.environ.get("AWS_SECRET_ACCESS_KEY"),
    "token": os.environ.get("AWS_SESSION_TOKEN"),
}
df = pd.read_csv('s3://mds-s3-student85/output/ml_data_SYD.csv', storage_options=aws_credentials)

In [3]:
# Step 2: Drop rows with nans
# Spell out the defaults: remove a row (axis=0) if any of its values is missing.
df = df.dropna(axis=0, how='any')
df.head(5)

Unnamed: 0,time,ACCESS-CM2,ACCESS-ESM1-5,AWI-ESM-1-1-LR,BCC-CSM2-MR,BCC-ESM1,CMCC-CM2-HR4,CMCC-CM2-SR5,CMCC-ESM2,CanESM5,...,MPI-ESM-1-2-HAM,MPI-ESM1-2-HR,MPI-ESM1-2-LR,MRI-ESM2-0,NESM3,NorESM2-LM,NorESM2-MM,SAM0-UNICON,TaiESM1,observed_rainfall
0,1889-01-01,0.040427,1.814552,35.579336,4.268112,0.001107466,11.410537,3.322009e-08,2.6688,1.321215,...,4.244226e-13,1.390174e-13,6.537884e-05,3.445495e-06,15.76096,4.759651e-05,2.451075,0.221324,2.257933,0.006612
1,1889-01-02,0.073777,0.303965,4.59652,1.190141,0.0001015323,4.014984,1.3127,0.946211,2.788724,...,4.409552,0.1222283,1.049131e-13,4.791993e-09,0.367551,0.4350863,0.477231,3.757179,2.287381,0.090422
2,1889-01-03,0.232656,0.019976,5.927467,1.003845e-09,1.760345e-05,9.660565,9.10372,0.431999,0.003672,...,0.22693,0.3762301,9.758706e-14,0.6912302,0.1562869,9.561101,0.023083,0.253357,1.199909,1.401452
3,1889-01-04,0.911319,13.623777,8.029624,0.08225225,0.1808932,3.951528,13.1716,0.368693,0.013578,...,0.02344586,0.4214019,0.007060915,0.03835721,2.472226e-07,0.5301038,0.002699,2.185454,2.106737,14.869798
4,1889-01-05,0.698013,0.021048,2.132686,2.496841,4.708019e-09,2.766362,18.2294,0.339267,0.002468,...,4.270161e-13,0.1879692,4.504985,3.506923e-07,1.949792e-13,1.460928e-10,0.001026,2.766507,1.763335,0.467628


In [4]:
# Step 3: Split the data into train (80%) and test (20%) portions with random_state=123
# Target: the observed rainfall. Features: every remaining column except the timestamp.
y = df['observed_rainfall']
X = df.drop(columns=['observed_rainfall', 'time'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [5]:
# Step 4: Carry out EDA of our choice on the train split
# Summary statistics (count, mean, std, quartiles, extremes) for every feature column.
X_train_summary = X_train.describe()
X_train_summary

Unnamed: 0,ACCESS-CM2,ACCESS-ESM1-5,AWI-ESM-1-1-LR,BCC-CSM2-MR,BCC-ESM1,CMCC-CM2-HR4,CMCC-CM2-SR5,CMCC-ESM2,CanESM5,EC-Earth3-Veg-LR,...,MIROC6,MPI-ESM-1-2-HAM,MPI-ESM1-2-HR,MPI-ESM1-2-LR,MRI-ESM2-0,NESM3,NorESM2-LM,NorESM2-MM,SAM0-UNICON,TaiESM1
count,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,...,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0
mean,2.435598,2.911161,3.6851,2.19516,2.771609,3.116934,3.591418,3.490604,2.950611,2.559451,...,3.095043,3.175946,1.328797,2.048747,1.536491,1.752144,2.451512,2.909697,3.391212,3.403765
std,6.876014,6.951689,7.227256,6.502536,6.051221,6.466975,7.392305,7.076361,7.074549,5.739063,...,7.728605,6.883672,4.955151,5.375858,4.993425,4.937174,5.796878,7.173033,7.960724,7.525256
min,0.0,0.0,9.161142e-14,0.0,0.0,0.0,-3.479596e-18,-3.1861769999999997e-19,0.0,-9.934637e-19,...,1.044483e-31,3.315622e-13,1.089808e-13,9.155419e-14,9.479186000000001e-33,1.426891e-13,0.0,0.0,-3.6046730000000005e-17,-2.148475e-14
25%,0.053584,0.021379,0.0281984,0.000518,0.00237,0.138181,0.08941694,0.09016145,0.022656,0.01192093,...,0.01873953,0.0001005828,1.270362e-13,1.352331e-13,5.353678e-05,1.862711e-13,0.005547,0.010028,0.03754041,0.04883792
50%,0.191574,0.494985,0.585113,0.096505,0.295341,0.643671,0.8435672,0.8216741,0.348699,0.4261732,...,0.3512424,0.2054757,0.001752656,0.114682,0.03193842,0.05167065,0.16797,0.256126,0.6540263,0.6658721
75%,1.435693,2.398416,3.571731,1.323656,2.508854,3.219543,3.724556,3.630505,2.615149,2.294516,...,2.306289,2.685723,0.3616506,1.18362,0.6686751,0.7920023,1.819091,2.502725,3.271716,3.217312
max,149.967634,157.605713,89.46575,134.465223,87.134722,124.95239,140.1478,137.5916,135.569753,134.2262,...,167.1499,93.06766,109.5008,80.05998,101.69,80.45783,103.367212,163.164524,154.9718,167.3562


In [6]:
# Same summary statistics for the training target (observed rainfall).
y_train_summary = y_train.describe()
y_train_summary

count    36816.000000
mean         2.736204
std          8.108492
min          0.000000
25%          0.008082
50%          0.164671
75%          1.652147
max        192.933030
Name: observed_rainfall, dtype: float64

In [7]:
# Step 5: Train ensemble machine learning model using RandomForestRegressor
# We chose to evaluate on the RMSE metric
# The split criterion is left at its default (mean squared error). Passing
# criterion='mse' explicitly fails on newer scikit-learn releases, where the
# option was renamed to 'squared_error'; omitting it works on every version.
model = RandomForestRegressor(random_state=123, n_jobs=-1)
model.fit(X_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=123)

In [8]:
# Predict rainfall for the held-out test rows with the fitted forest.
pred = model.predict(X=X_test)
pred

array([4.41141222, 3.30595151, 3.55104322, ..., 1.08873845, 2.40564656,
       4.23071184])

In [9]:
# Test RMSE of the random-forest ensemble.
# The square root is taken explicitly because the `squared=False` shortcut of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
# A dedicated name is used because `rmse` is rebound to the per-model
# dictionary in the next cell.
ensemble_rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
ensemble_rmse

8.73168482300724

In [10]:
# Step 6: Discuss results
# RMSE for the individual climate models: each feature column is itself a
# rainfall prediction, so it is scored directly against the observed values.
# np.sqrt replaces the deprecated `squared=False` argument of mean_squared_error.
rmse = {
    col: float(np.sqrt(mean_squared_error(y_test, X_test[col])))
    for col in X_test.columns
}
rmse

{'ACCESS-CM2': 10.764463280409297,
 'ACCESS-ESM1-5': 10.846550490025752,
 'AWI-ESM-1-1-LR': 11.18712359703712,
 'BCC-CSM2-MR': 10.796497068583623,
 'BCC-ESM1': 10.432163671907844,
 'CMCC-CM2-HR4': 10.56494458356347,
 'CMCC-CM2-SR5': 11.285054457979179,
 'CMCC-ESM2': 11.129110641875158,
 'CanESM5': 10.6384880201457,
 'EC-Earth3-Veg-LR': 10.298655489422481,
 'FGOALS-g3': 9.565204988737806,
 'GFDL-CM4': 10.400779732820741,
 'INM-CM4-8': 11.691141311221779,
 'INM-CM5-0': 12.060185621540278,
 'KIOST-ESM': 9.409771409376939,
 'MIROC6': 11.498469722391727,
 'MPI-ESM-1-2-HAM': 11.042786788847769,
 'MPI-ESM1-2-HR': 9.770185835104554,
 'MPI-ESM1-2-LR': 10.053474074574387,
 'MRI-ESM2-0': 9.844214758267789,
 'NESM3': 9.694444444949722,
 'NorESM2-LM': 10.331408116735595,
 'NorESM2-MM': 10.659912320443588,
 'SAM0-UNICON': 11.527239676095723,
 'TaiESM1': 11.4725463436798}

Yes, we are getting better results with the ensemble model than with the individual climate models. The random forest reaches a test RMSE of about 8.73, whereas every individual climate model has a higher test RMSE, ranging from about 9.41 (KIOST-ESM) to about 12.06 (INM-CM5-0).

## Part 2:

### Preparation for deploying model next week

#### Complete task 4 of milestone 3 before coming here

We’ve found ```n_estimators=100, max_depth=5``` to be the best hyperparameter settings with MLlib (from task 4 of milestone 3). Here we use the same hyperparameters to train a scikit-learn model. 

In [14]:
# Refit with the hyperparameters selected via MLlib. The seed is fixed so the
# saved model and the RMSE values reported below are reproducible across runs
# (bootstrap sampling and feature sub-sampling are otherwise random).
model = RandomForestRegressor(n_estimators=100, max_depth=5, bootstrap=True, random_state=123)
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=5)

In [15]:
# Report RMSE on both splits. The root is taken explicitly with np.sqrt because
# the `squared=False` argument of mean_squared_error is deprecated and absent
# from newer scikit-learn releases.
train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
print(f"Train RMSE: {train_rmse:.2f}")
print(f" Test RMSE: {test_rmse:.2f}")

Train RMSE: 7.92
 Test RMSE: 8.51


In [16]:
# ready to deploy
# Serialize the fitted model to a local file; it is uploaded to s3 afterwards.
model_path = "model.joblib"
dump(model, model_path)

['model.joblib']

***Upload model.joblib to s3. You choose how you want to upload it.***