# Random Forest

In [1]:
import warnings
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV

import seaborn as sns
import matplotlib.pyplot as myplot
%matplotlib inline
warnings.filterwarnings('ignore')
df = pd.read_csv("Financial_Mexican_Firms.csv")

1. Train & Test Split

In [2]:
X  = df.iloc[:, 0:10]
y  = df.iloc[:, 10]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 0) 
df_train = pd.concat([X_train, y_train], axis = 1)
df_test  = pd.concat([X_test, y_test], axis = 1)

2. Handling missing values

In [3]:
from sklearn.impute import KNNImputer

knn       = KNNImputer(n_neighbors = 3)

df_train  = pd.DataFrame(knn.fit_transform(df_train), columns = ['ProposedIndex', 
                                                                 'IIHH',  
                                                                 'Shannon',
                                                                 'Size',
                                                                 'AssetTurnover',
                                                                 'Debt',
                                                                 'QuickRatio',
                                                                 'CashHoldings',
                                                                 'ROE',
                                                                 'ROI',
                                                                 'ROA'])
df_test  = pd.DataFrame(knn.fit_transform(df_test), columns = ['ProposedIndex', 
                                                                 'IIHH',  
                                                                 'Shannon',
                                                                 'Size',
                                                                 'AssetTurnover',
                                                                 'Debt',
                                                                 'QuickRatio',
                                                                 'CashHoldings',
                                                                 'ROE',
                                                                 'ROI', 
                                                                 'ROA'])

# Update the training set
X_train  = df_train.iloc[:,:-1]
y_train  = df_train.ROA

X_test  = df_test.iloc[:,:-1]
y_test  = df_test.ROA

5. Hyperparameter Tunning

In [4]:
# Specify different values for the tunning process
kfold             = KFold(n_splits = 5, random_state = None, shuffle = False)

n_estimators      = [int(x) for x in np.linspace(start = 50, stop = 200, num = 12)] 
max_features      = ['auto', 'sqrt'] 
max_depth         = [int(x) for x in np.linspace(5, 30, 6)] 
min_samples_split = [int(x) for x in np.linspace(2, 20, 6)] 
min_samples_leaf  = [int(x) for x in np.linspace(1, 20, 6)] 

#Create parameter grid
random_grid ={'n_estimators'     :n_estimators,
              'max_features'     :max_features,
              'max_depth'        :max_depth,
              'min_samples_split':min_samples_split,
              'min_samples_leaf' :min_samples_leaf}

#Create Random Forest object
rf = RandomForestRegressor()

#Randomized Search CV
rf_search = RandomizedSearchCV(rf, 
                               random_grid, 
                               scoring      = 'neg_mean_squared_error', 
                               n_iter       = 10, 
                               cv           = 10, 
                               n_jobs       = -1)

6. Fit the model and measure time to execute

In [5]:
from datetime import datetime

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
        
start_time = timer(None) # timing starts from this point for "start_time" variable
rf_search.fit(X_train,y_train)
timer(start_time) # timing ends here for "start_time" variable


 Time taken: 0 hours 0 minutes and 6.13 seconds.


7. Get best tunning paratemers 

In [6]:
rf_search.best_params_

{'n_estimators': 63,
 'min_samples_split': 16,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 30}

8. Specify the optimal model

In [10]:
optimal_model = RandomForestRegressor(n_estimators      = 63,
                                      min_samples_split = 16,
                                      min_samples_leaf  = 1,
                                      max_features      = 'auto',
                                      max_depth         = 30).fit(X_train, y_train)


9. Evaluate performance

In [11]:
import time
start_time = time.time()

y_pred     = optimal_model.predict(X_test)
    
print('R2  :', r2_score(y_test, y_pred))
print('MAE :', mean_absolute_error(y_test, y_pred))
print('MSE :', mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Processing time: %s seconds' % round((time.time() - start_time), 4))

R2  : 0.7831185810660819
MAE : 0.015633896146119295
MSE : 0.0009361321825694034
RMSE: 0.03059627726651403
Processing time: 0.0111 seconds


10. Deployment

In [13]:
import pickle
file = open("random_forest_diversification.pkl", 'wb')
pickle.dump(optimal_model, file)