In [0]:
import pickle
import sklearn
import numpy as np
import math
#from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
import gzip
import cPickle as pickle
from google.colab import files

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Get the x and y values in the pickle **imr_x_y_countries_pickle** to be trained in the GPU in Google Colab

In [0]:
dbfile = open('/content/gdrive/My Drive/Massive Data Final Project/imr_x_y_countries_pickle', 'rb')      
x_y_values_db = pickle.load(dbfile) 
#print x_y_values_db['x_val']
x_val_array = x_y_values_db['x_val']
y_val_array = x_y_values_db['y_val']
dbfile.close() 

Load the **RandomForestRegressor model** and split the train and test sets using train_test_split function of sklearn. <br/>
Here, we try the Random Forest. To find the model's quality, we use the **RMSE** (Root Mean Square Error) and the **R2** score. R2 is the proportion of variability that is explained by our model. It ranges from 0-1 for maximum proportion of variability explained

In [5]:
forest_model = RandomForestRegressor(random_state=21)
train_X, test_X, train_y, test_y = train_test_split(x_val_array, y_val_array, test_size=0.25, random_state=21)
rmse= np.sqrt(np.mean(-cross_val_score(forest_model, train_X, train_y,cv=5,  scoring='neg_mean_squared_error')))
print("RMSE : %f" % (rmse))
r2_score1= np.mean(cross_val_score(forest_model, train_X, train_y,cv=5,  scoring='r2'))
print("R2 score: %s" % '{:.2}'.format(r2_score1))



RMSE : 0.030921
R2 score: 0.9


We need the best parameters for our **RandomForestRegressor**. We try to find them using **RandomSearchCV** followed by **GridSearchCV**. RandomSearchCV helps to narrow down the number and range of parameters, which can then be tested using GridSearchCV. Here we print the best params for the model using RandomSearchCV

In [0]:
number_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': number_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'bootstrap': [True, False], 'min_samples_leaf': [1, 2, 4], 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'min_samples_split': [2, 5, 10], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]}


In [0]:
forest_model = RandomForestRegressor(random_state=21)
rf_random = RandomizedSearchCV(estimator = forest_model, param_distributions = random_grid, 
                          cv = 5, n_jobs = -1,n_iter = 20, verbose = 0)
rf_random.fit(train_X, train_y)
print(rf_random.best_params_)



{'bootstrap': True, 'min_samples_leaf': 1, 'n_estimators': 400, 'max_features': 'auto', 'min_samples_split': 2, 'max_depth': None}


Random search helps to narrow down the range for all the hyperparameters. This is then followed by **GridSearch CV** where we can test all the combinations of the parameters unlike Random search. Here we find best params for the model using GridSearchCV

In [0]:
param_grid = {'max_depth': [10,20,30,40,50],
              'max_features': ['sqrt'],
              'min_samples_leaf': [1, 3, 4, 5],
              'min_samples_split': [2, 4, 8],
              'n_estimators': [200, 400, 600, 800],
             'bootstrap': [False, True]}
# Create a basic model
rf = RandomForestRegressor(random_state=21)
# Instantiate the grid search model
rf_grid = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 0)
rf_grid.fit(train_X, train_y)
print(rf_grid.best_params_)

{'bootstrap': False, 'min_samples_leaf': 1, 'n_estimators': 600, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': 20}


Print the **RMSE** and **R2 score** of the Random Forest Regressor model to predict **IMR**

In [6]:
rf_cv_random=RandomForestRegressor(random_state=21,n_estimators= 600, min_samples_split= 2, min_samples_leaf= 1, max_features= 'sqrt', max_depth= 20, bootstrap= False)
rf_cv_random.fit(train_X,train_y )
predictions=rf_cv_random.predict(test_X)
rmse3 = np.sqrt(mean_squared_error(test_y, predictions))
print("RMSE : %f" % (rmse3))
r23= r2_score(test_y,predictions)
print("R2 score: %s" % '{:.2}'.format(r23))

RMSE : 0.024873
R2 score: 0.94


Print the error and the accuracy of the model

In [0]:
imr_errors = math.fabs(np.mean(predictions/test_y - 1))
imr_mape = 100 * (imr_errors)
imr_accuracy = 100 - (imr_mape)

In [8]:
imr_accuracy

99.66394063561928

Save the states and the model in the **imr_randomForestRegressor_pickle** pickle and upload in gdrive

In [0]:
imr_randomForestRegressor_db = {} 
imr_randomForestRegressor_db['forest_model'] = forest_model
imr_randomForestRegressor_db['rf_random'] = rf_random
imr_randomForestRegressor_db['rf_grid'] = rf_grid
imr_randomForestRegressor_db['rf_cv_random'] = rf_cv_random
imr_randomForestRegressor_db['predictions'] = predictions
imr_randomForestRegressor_db['rmse3'] = rmse3
imr_randomForestRegressor_db['r23'] = r23

In [0]:
imr_randomForestRegressor_dbfile = open('imr_randomForestRegressor_pickle', 'ab') 
# source, destination 
pickle.dump(imr_randomForestRegressor_db, imr_randomForestRegressor_dbfile)                      
imr_randomForestRegressor_dbfile.close() 

In [0]:
# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
gdrive = GoogleDrive(gauth)

# Create & upload a file.
uploaded = gdrive.CreateFile({'title': 'imr_randomForestRegressor_pickle'})
uploaded.SetContentFile('imr_randomForestRegressor_pickle')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))

Uploaded file with ID 1e6g-hIO1ZavNqCaMW7aqj-45duE4CYSc
