# Task 3

# Imports

In [1]:
import os

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams.update({'font.size': 16, 'axes.labelweight': 'bold', 'figure.figsize': (8,6)})
## add any other additional packages that you need. You are free to use any packages for vizualization.

## Part 1:

Recall that the final goal of this project is to build and deploy ensemble machine learning models in the cloud, where the features are the outputs of different climate models and the target is the actual rainfall observation. In this milestone, you'll actually build these ensemble machine learning models in the cloud.  

**Your tasks:**

1. Read the data CSV from your s3 bucket. 
2. Drop rows with nans. 
3. Split the data into train (80%) and test (20%) portions with `random_state=123`. 
4. Carry out EDA of your choice on the train split. 
5. Train ensemble machine learning model using `RandomForestRegressor` and evaluate with metric of your choice (e.g., `RMSE`) by considering `Observed` as the target column. 
6. Discuss your results. Are you getting better results with ensemble models compared to the individual climate models? 

> Recall that individual columns in the data are predictions of different climate models. 

In [8]:
## Depending on the permissions that you provided to your bucket you might need to provide your aws credentials
## to read from the bucket, if so pass them as storage_options=aws_credentials.
##
## SECURITY: never paste access keys into the notebook. They are read from the standard AWS environment
## variables instead (export AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN before starting
## Jupyter). Any key that was previously committed in this cell must be treated as leaked and revoked.
aws_credentials = {
    "key": os.environ["AWS_ACCESS_KEY_ID"],          # KeyError here means the variable is not exported
    "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
    "token": os.environ.get("AWS_SESSION_TOKEN"),    # only set for temporary (STS) credentials; None otherwise
}
df = pd.read_csv("s3://mds-s3-30/output/wrangled.csv", parse_dates=True, storage_options=aws_credentials)

In [9]:
# Preview the first rows to confirm the CSV loaded with one column per climate model plus the observed rainfall.
df.head()

Unnamed: 0,ACCESS-CM2,ACCESS-ESM1-5,AWI-ESM-1-1-LR,BCC-CSM2-MR,BCC-ESM1,CMCC-CM2-HR4,CMCC-CM2-SR5,CMCC-ESM2,CanESM5,EC-Earth3-Veg-LR,...,MPI-ESM-1-2-HAM,MPI-ESM1-2-HR,MPI-ESM1-2-LR,MRI-ESM2-0,NESM3,NorESM2-LM,NorESM2-MM,SAM0-UNICON,TaiESM1,rain (mm/day)
0,0.040427,1.814552,35.579336,4.268112,0.001107466,11.410537,3.322009e-08,2.6688,1.321215,1.515293,...,4.244226e-13,1.390174e-13,6.537884e-05,3.445495e-06,15.76096,4.759651e-05,2.451075,0.221324,2.257933,0.006612
1,0.073777,0.303965,4.59652,1.190141,0.0001015323,4.014984,1.3127,0.946211,2.788724,4.771375,...,4.409552,0.1222283,1.049131e-13,4.791993e-09,0.367551,0.4350863,0.477231,3.757179,2.287381,0.090422
2,0.232656,0.019976,5.927467,1.003845e-09,1.760345e-05,9.660565,9.10372,0.431999,0.003672,4.23398,...,0.22693,0.3762301,9.758706e-14,0.6912302,0.1562869,9.561101,0.023083,0.253357,1.199909,1.401452
3,0.911319,13.623777,8.029624,0.08225225,0.1808932,3.951528,13.1716,0.368693,0.013578,15.252495,...,0.02344586,0.4214019,0.007060915,0.03835721,2.472226e-07,0.5301038,0.002699,2.185454,2.106737,14.869798
4,0.698013,0.021048,2.132686,2.496841,4.708019e-09,2.766362,18.2294,0.339267,0.002468,11.920356,...,4.270161e-13,0.1879692,4.504985,3.506923e-07,1.949792e-13,1.460928e-10,0.001026,2.766507,1.763335,0.467628


### Drop the rows with nans

In [10]:
# Remove every row that contains at least one NaN; `df` is rebound, so later cells only see complete rows.
df = df.dropna()

### Split into Train & Test

In [11]:
# 80% of the rows go to training and the rest to testing; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, train_size = 0.8, random_state=123)

### EDA

In [12]:
# Column names, dtypes and non-null counts of the training split (EDA is done on the train split only).
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36816 entries, 5683 to 15725
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ACCESS-CM2        36816 non-null  float64
 1   ACCESS-ESM1-5     36816 non-null  float64
 2   AWI-ESM-1-1-LR    36816 non-null  float64
 3   BCC-CSM2-MR       36816 non-null  float64
 4   BCC-ESM1          36816 non-null  float64
 5   CMCC-CM2-HR4      36816 non-null  float64
 6   CMCC-CM2-SR5      36816 non-null  float64
 7   CMCC-ESM2         36816 non-null  float64
 8   CanESM5           36816 non-null  float64
 9   EC-Earth3-Veg-LR  36816 non-null  float64
 10  FGOALS-g3         36816 non-null  float64
 11  GFDL-CM4          36816 non-null  float64
 12  INM-CM4-8         36816 non-null  float64
 13  INM-CM5-0         36816 non-null  float64
 14  KIOST-ESM         36816 non-null  float64
 15  MIROC6            36816 non-null  float64
 16  MPI-ESM-1-2-HAM   36816 non-null  flo

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of the training split.
df_train.describe()

Unnamed: 0,ACCESS-CM2,ACCESS-ESM1-5,AWI-ESM-1-1-LR,BCC-CSM2-MR,BCC-ESM1,CMCC-CM2-HR4,CMCC-CM2-SR5,CMCC-ESM2,CanESM5,EC-Earth3-Veg-LR,...,MPI-ESM-1-2-HAM,MPI-ESM1-2-HR,MPI-ESM1-2-LR,MRI-ESM2-0,NESM3,NorESM2-LM,NorESM2-MM,SAM0-UNICON,TaiESM1,rain (mm/day)
count,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,...,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0,36816.0
mean,2.435598,2.911161,3.6851,2.19516,2.771609,3.116934,3.591418,3.490604,2.950611,2.559451,...,3.175946,1.328797,2.048747,1.536491,1.752144,2.451512,2.909697,3.391212,3.403765,2.736204
std,6.876014,6.951689,7.227256,6.502536,6.051221,6.466975,7.392305,7.076361,7.074549,5.739063,...,6.883672,4.955151,5.375858,4.993425,4.937174,5.796878,7.173033,7.960724,7.525256,8.108492
min,0.0,0.0,9.161142e-14,0.0,0.0,0.0,-3.479596e-18,-3.1861769999999997e-19,0.0,-9.934637e-19,...,3.315622e-13,1.089808e-13,9.155419e-14,9.479186000000001e-33,1.426891e-13,0.0,0.0,-3.6046730000000005e-17,-2.148475e-14,0.0
25%,0.053584,0.021379,0.0281984,0.000518,0.00237,0.138181,0.08941694,0.09016145,0.022656,0.01192093,...,0.0001005828,1.270362e-13,1.352331e-13,5.353678e-05,1.862711e-13,0.005547,0.010028,0.03754041,0.04883792,0.008082
50%,0.191574,0.494985,0.585113,0.096505,0.295341,0.643671,0.8435672,0.8216741,0.348699,0.4261732,...,0.2054757,0.001752656,0.114682,0.03193842,0.05167065,0.16797,0.256126,0.6540263,0.6658721,0.164671
75%,1.435693,2.398416,3.571731,1.323656,2.508854,3.219543,3.724556,3.630505,2.615149,2.294516,...,2.685723,0.3616506,1.18362,0.6686751,0.7920023,1.819091,2.502725,3.271716,3.217312,1.652147
max,149.967634,157.605713,89.46575,134.465223,87.134722,124.95239,140.1478,137.5916,135.569753,134.2262,...,93.06766,109.5008,80.05998,101.69,80.45783,103.367212,163.164524,154.9718,167.3562,192.93303


### Train ensemble model

In [19]:
# Features are the individual climate-model predictions; the target is the observed rainfall column.
X_train = df_train.drop(columns=["rain (mm/day)"])
y_train = df_train["rain (mm/day)"]

# Seed the forest (same seed as the train/test split) so the fitted model and the reported
# errors are identical on every Restart & Run All.
rfr = RandomForestRegressor(random_state=123)
rfr.fit(X_train, y_train)

RandomForestRegressor()

In [29]:
# Hold-out features/target, built the same way as the training ones.
X_test = df_test.drop(columns=["rain (mm/day)"])
y_test = df_test["rain (mm/day)"]

# RMSE is computed as sqrt(MSE): the `squared=False` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
train_error = np.sqrt(mean_squared_error(y_train, rfr.predict(X_train)))
test_error = np.sqrt(mean_squared_error(y_test, rfr.predict(X_test)))

print("Train Error:", train_error)
print("Test Error:", test_error)


Train Error: 3.132908402261298
Test Error: 8.718622264086912


In [38]:
# RMSE of each individual climate model, using its raw prediction column against the observations.
# X_test no longer contains the target, so ALL of its columns are climate models; slicing with
# [:-1] here would silently leave the last model out of the comparison.
model_names = list(X_test.columns)
errors = [np.sqrt(mean_squared_error(y_test, X_test[column])) for column in model_names]

errors_by_model = pd.DataFrame({"RMSE": errors, "Model": model_names})
ensemble = pd.DataFrame({"RMSE": [test_error], "Model": ["RF Ensemble"]})

errors_by_model = pd.concat([errors_by_model, ensemble], ignore_index=True, axis=0)

# Highlight the ensemble row by looking up its label instead of hard-coding a row number,
# so the highlight stays correct if the number of climate models changes.
ensemble_row = errors_by_model.index[-1]
errors_by_model.style.apply(lambda x: ['background: yellow' if x.name == ensemble_row
                              else '' for i in x],
                   axis=1)

Unnamed: 0,RMSE,Model
0,10.764463,ACCESS-CM2
1,10.84655,ACCESS-ESM1-5
2,11.187124,AWI-ESM-1-1-LR
3,10.796497,BCC-CSM2-MR
4,10.432164,BCC-ESM1
5,10.564945,CMCC-CM2-HR4
6,11.285054,CMCC-CM2-SR5
7,11.129111,CMCC-ESM2
8,10.638488,CanESM5
9,10.298655,EC-Earth3-Veg-LR


As you can see from the results above, our Random Forest ensemble model achieves a lower RMSE than the individual climate models. On our test data, the ensemble model produced an RMSE of 8.72, which is better than the best individual model RMSE of 9.41. 

## Part 2:

### Preparation for deploying model next week

***NOTE: Complete task 4 from the milestone3 before coming here***

We’ve found the best hyperparameter settings with MLlib (task 4 of milestone3); here we use the same hyperparameters to train a scikit-learn model. 

In [41]:
# Re-train in scikit-learn with the hyperparameters selected by MLlib. The forest is seeded so that
# the model that gets serialised for deployment can be reproduced exactly from this notebook.
model = RandomForestRegressor(n_estimators=100, max_depth=5, bootstrap=True, random_state=123)
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=5)

In [42]:
# RMSE as sqrt(MSE): `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6.
train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
print(f"Train RMSE: {train_rmse:.2f}")
print(f" Test RMSE: {test_rmse:.2f}")

Train RMSE: 7.91
 Test RMSE: 8.50


In [43]:
# ready to deploy: serialise the fitted model to "model.joblib" (relative path, so it lands in the
# current working directory); joblib's dump returns the list of files written, shown as the cell output.
dump(model, "model.joblib")

['model.joblib']

***Upload model.joblib to s3 under output folder. You choose how you want to upload it (using CLI, SDK, or web console).***