# Task 3

# Imports

In [1]:
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams.update({'font.size': 16, 'axes.labelweight': 'bold', 'figure.figsize': (8,6)})
## Add any additional packages that you need. You are free to use any packages for visualization.

## Part 1:

Recall that the final goal of this project is to build and deploy ensemble machine learning models in the cloud, where the features are the outputs of different climate models and the target is the actual rainfall observation. In this milestone, you'll actually build these ensemble machine learning models in the cloud.  

**Your tasks:**

1. Read the data CSV from your s3 bucket. 
2. Drop rows with nans. 
3. Split the data into train (80%) and test (20%) portions with `random_state=123`. 
4. Carry out EDA of your choice on the train split. 
5. Train an ensemble machine learning model using `RandomForestRegressor` and evaluate it with a metric of your choice (e.g., `RMSE`), treating `Observed` as the target column. 
6. Discuss your results. Are you getting better results with ensemble models compared to the individual climate models? 

> Recall that individual columns in the data are predictions of different climate models. 

In [2]:
# Load the ML-ready dataset straight from the S3 bucket. The first CSV column
# is used as the index and pandas is asked to parse it as dates.
#
# Depending on the permissions set on the bucket, AWS credentials may be needed
# to read from it. If so, fill in a dict such as
#   aws_credentials = {"key": "", "secret": "", "token": ""}
# and pass it to read_csv as storage_options=aws_credentials.
# Do not commit real credentials with the notebook.
df = pd.read_csv(
    "s3://mds-s3-9/output/ml_data_SYD.csv",
    index_col=0,
    parse_dates=True,
)

In [3]:
## Use your ML skills to get from step 1 to step 6

In [4]:
# Task step 2: discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [5]:
# Task step 3: shuffle the rows and hold out 20% of them for testing.
# The fixed seed keeps the split reproducible between runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=123,
)

In [6]:
# EDA on the training split only: index range, column dtypes and non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 36816 entries, 1904-07-25 to 1932-01-22
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ACCESS-CM2         36816 non-null  float64
 1   ACCESS-ESM1-5      36816 non-null  float64
 2   AWI-ESM-1-1-LR     36816 non-null  float64
 3   BCC-CSM2-MR        36816 non-null  float64
 4   BCC-ESM1           36816 non-null  float64
 5   CMCC-CM2-HR4       36816 non-null  float64
 6   CMCC-CM2-SR5       36816 non-null  float64
 7   CMCC-ESM2          36816 non-null  float64
 8   CanESM5            36816 non-null  float64
 9   EC-Earth3-Veg-LR   36816 non-null  float64
 10  FGOALS-g3          36816 non-null  float64
 11  GFDL-CM4           36816 non-null  float64
 12  INM-CM4-8          36816 non-null  float64
 13  INM-CM5-0          36816 non-null  float64
 14  KIOST-ESM          36816 non-null  float64
 15  MIROC6             36816 non-null  float64
 16  MPI-E

In [7]:
# EDA: per-column summary statistics (count, mean, std, min, quartiles, max) of the training split.
train.describe()

Unnamed: 0,ACCESS-CM2,ACCESS-ESM1-5,AWI-ESM-1-1-LR,BCC-CSM2-MR,BCC-ESM1,CMCC-CM2-HR4,CMCC-CM2-SR5,CMCC-ESM2,CanESM5,EC-Earth3-Veg-LR,...,MPI-ESM-1-2-HAM,MPI-ESM1-2-HR,MPI-ESM1-2-LR,MRI-ESM2-0,NESM3,NorESM2-LM,NorESM2-MM,SAM0-UNICON,TaiESM1,observed_rainfall
count,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,...,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0
mean,2.435598,2.911161,3.6851,2.19516,2.771609,3.116934,3.591418,3.490604,2.950611,2.559451,...,3.175946,1.328797,2.048747,1.536491,1.752144,2.451512,2.909697,3.391212,3.403765,2.736204
std,6.876014,6.951689,7.227256,6.502536,6.051221,6.466975,7.392305,7.076361,7.074549,5.739063,...,6.883672,4.955151,5.375858,4.993425,4.937174,5.796878,7.173033,7.960724,7.525256,8.108492
min,0.0,0.0,9.161142e-14,0.0,0.0,0.0,-3.479596e-18,-3.1861769999999997e-19,0.0,-9.934637e-19,...,3.315622e-13,1.089808e-13,9.155419e-14,9.479186000000001e-33,1.426891e-13,0.0,0.0,-3.6046730000000005e-17,-2.148475e-14,0.0
25%,0.053584,0.021379,0.0281984,0.000518,0.00237,0.138181,0.08941694,0.09016145,0.022656,0.01192093,...,0.0001005828,1.270362e-13,1.352331e-13,5.353678e-05,1.862711e-13,0.005547,0.010028,0.03754041,0.04883792,0.008082
50%,0.191574,0.494985,0.585113,0.096505,0.295341,0.643671,0.8435672,0.8216741,0.348699,0.4261732,...,0.2054757,0.001752656,0.114682,0.03193842,0.05167065,0.16797,0.256126,0.6540263,0.6658721,0.164671
75%,1.435693,2.398416,3.571731,1.323656,2.508854,3.219543,3.724556,3.630505,2.615149,2.294516,...,2.685723,0.3616506,1.18362,0.6686751,0.7920023,1.819091,2.502725,3.271716,3.217312,1.652147
max,149.967634,157.605713,89.46575,134.465223,87.134722,124.95239,140.1478,137.5916,135.569753,134.2262,...,93.06766,109.5008,80.05998,101.69,80.45783,103.367212,163.164524,154.9718,167.3562,192.93303


In [8]:
# Separate the climate-model predictions (features) from the observed rainfall (target).
# The target is selected by name rather than by position, so the split stays correct
# even if the column order of the CSV changes; a missing column raises a KeyError
# instead of silently training on the wrong target.
target_col = "observed_rainfall"
X_train, y_train = train.drop(columns=target_col), train[target_col].copy()
X_test, y_test = test.drop(columns=target_col), test[target_col].copy()

# Untuned baseline forest; the fixed seed makes the fit reproducible.
rf = RandomForestRegressor(random_state=123)
rf.fit(X_train, y_train)

RandomForestRegressor(random_state=123)

In [11]:
# RMSE is computed as sqrt(MSE). The `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly keeps this cell working on old and new versions.
rf_train_rmse = np.sqrt(mean_squared_error(y_train, rf.predict(X_train)))
rf_test_rmse = np.sqrt(mean_squared_error(y_test, rf.predict(X_test)))

print(f"The training RMSE for the random forest model is {rf_train_rmse:.2f}.")
print(f"The test RMSE for the random forest model is {rf_test_rmse:.2f}.")

The training RMSE for the random forest model is 3.14.
The test RMSE for the random forest model is 8.72.


In [13]:
# Sanity check: label of the first feature column.
X_test.columns[0]

'ACCESS-CM2'

In [18]:
# Compare the random forest with every individual climate model on the test split.
# Each feature column is itself a rainfall prediction, so it is scored directly
# against the observed values.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
test_results = {
    "model": ["RandomForestRegressor"],
    "test_RMSE": [np.sqrt(mean_squared_error(y_test, rf.predict(X_test)))],
}

# items() yields (column label, column values) pairs in column order.
for model_name, model_pred in X_test.items():
    test_results["model"].append(model_name)
    test_results["test_RMSE"].append(np.sqrt(mean_squared_error(y_test, model_pred)))

# Lowest RMSE (best model) first.
pd.DataFrame(test_results).sort_values("test_RMSE").reset_index(drop=True)

Unnamed: 0,model,test_RMSE
0,RandomForestRegressor,8.721994
1,KIOST-ESM,9.409771
2,FGOALS-g3,9.565205
3,NESM3,9.694444
4,MPI-ESM1-2-HR,9.770186
5,MRI-ESM2-0,9.844215
6,MPI-ESM1-2-LR,10.053474
7,EC-Earth3-Veg-LR,10.298655
8,NorESM2-LM,10.331408
9,GFDL-CM4,10.40078


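We can see that our untuned random forest model performs slightly better on the test set than the best-performing individual CMIP6 climate model. This result is not that surprising, as the strength of ensemble models is that mistakes made by individual models are averaged out by the results of the other models in the ensemble. Note, however, that the large gap between the training RMSE (3.14) and the test RMSE (8.72) suggests that the untuned model is overfitting the training data.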

## Part 2:

### Preparation for deploying model next week

***NOTE: Complete task 4 from the milestone3 before coming here***

We’ve found the best hyperparameter settings with MLlib (in task 4 of milestone 3); here we use the same hyperparameters to train a scikit-learn model. 

In [None]:
# TODO: replace both `___` placeholders with the best n_estimators and max_depth
# found with MLlib in task 4 before running this cell.
# NOTE(review): in IPython `___` refers to the third-most-recent cell output, so
# leaving the placeholders in may not fail with an obvious "undefined name" error.
# NOTE(review): no random_state is passed here, unlike the baseline `rf` — consider
# setting one so the deployed model is reproducible.
model = RandomForestRegressor(n_estimators=___, max_depth=___)
model.fit(X_train, y_train)

In [None]:
# RMSE is computed as sqrt(MSE). The `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly keeps this cell working on old and new versions.
tuned_train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
tuned_test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

print(f"Train RMSE: {tuned_train_rmse:.2f}")
print(f" Test RMSE: {tuned_test_rmse:.2f}")

In [28]:
# Ready to deploy: serialise the fitted model to a local file, which is then
# uploaded to the S3 output folder.
dump(value=model, filename="model.joblib")

['model.joblib']

***Upload model.joblib to s3 under output folder. You choose how you want to upload it (using CLI, SDK, or web console).***