# Random Forest Regression Model 

@Thiago Cunha Montenegro

This notebook will cover three aproches of a Random Forest Model.

1. Simple Random Forest Model

2. Random Forest Model using feature ranking with recursive feature elimination and cross-validated selection (RFECV)

3. Random Forest Model using Boruta Model Selection

In [0]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler 
from sklearn.ensemble import RandomForestRegressor

import math

# Simple  Random Forest Regression

In [0]:
df1 = pd.read_csv('/content/OUTPUT_WBI_exposer_cyclones_v10.csv')

In [45]:
df1.columns

Index(['Unnamed: 0', 'SID', 'NAME', 'ISO', 'YEAR', 'BASIN', 'SUB BASIN',
       'NATURE', 'ISO_TIME', 'COORDS', 'TOTAL_HRS', 'DAY_HRS', 'NIGHT_HRS',
       'USA_SSHS', 'WIND_CALC_MEAN', 'PRES_CALC_MEAN', 'STORM_SPD_MEAN',
       'STORM_DR_MEAN', 'V_LAND_KN', '34KN_POP', '34KN_ASSETS', '64KN_POP',
       '64KN_ASSETS', '96KN_POP', '96KN_ASSETS', 'CPI', 'TOTAL_DAMAGE_(000$)',
       'TOTAL_DEATHS', 'Air transport, freight (million ton-km)',
       'Arable land (hectares per person)', 'Cereal yield (kg per hectare)',
       'Food production index (2004-2006 = 100)', 'GDP growth (annual %)',
       'GDP per capita (constant 2010 US$)', 'Net flows from UN agencies US$',
       'Life expectancy at birth, total (years)',
       'Mobile cellular subscriptions (per 100 people)',
       'Population density (people per sq. km of land area)',
       'Adjusted savings: education expenditure (% of GNI)',
       'Rural population (% of total population)', 'Income_level_Final',
       'TOTAL_AFFECTED'

In [0]:
def process_month(df):
  """

  """
  # Adding month per Vamsi's code in SVM folder
  df['Month'] = df.ISO_TIME.apply(lambda x: x[7:9]).astype(int)
  return df

def process_emdat(df):
  df['CALC_assets_emdat'] = df['34KN_ASSETS'] + df['64KN_ASSETS'] + df['96KN_ASSETS']
  return df


def process_total_damage(df):
  """
  Process the Total Damage column into Total Damage Ajusted divided by CPI 
  Using Xavier logic
  Usage
  ------

  df = process_total_damaged(df)
  """
  df['TOTAL_DAMAGE_Adjusted'] = df['TOTAL_DAMAGE_(000$)'] / df['CPI']
  return df

def process_income_final(df):
  """
    Process the Income Final Level column into numerical

    Usage
    ------

    df = process_total_damaged(df)
  """
  #Arnab code
  unique_list = df['Income_level_Final'].unique()
  incomelevel_to_int = dict(zip(unique_list, [1,2,4,3])) # so we assign['Low' =1, 'Low_Middle' =2, 'High'=4, 'High_Middle'=3]
  df['Income_level_Final'] = df['Income_level_Final'].apply(lambda inc_level: incomelevel_to_int[inc_level])
  return df


def process_missing_data(df):
   """
    Process and handling missing data with accourd the year range and replace with mean.

    Usage
    -----
    df = process_missing_data(df)

   """
   df['SUB BASIN']= df['SUB BASIN'].replace('MM', np.nan) 
   df["BASIN"]= df["BASIN"].replace('MM', np.nan) 
   df["USA_SSHS"]= df["USA_SSHS"].replace(-5, np.nan)
   df["TOTAL_DAMAGE_Adjusted"]= df["TOTAL_DAMAGE_Adjusted"].replace(0, np.nan)
   return df

def create_dummies(df, column_name):
   """Create Dummy Columns (One Hot Encoding) from a single Column

    Usage
    ------

    df = create_dummies(train, categorical_column)
    """
   dummies = pd.get_dummies(df[column_name], prefix=column_name)
   df = pd.concat([df,dummies],axis=1)
   return df

def drop_columns(df):
  columns_to_drop = ['ISO_TIME','TOTAL_DAMAGE_(000$)','CPI']
  df.drop('ISO_TIME', axis=1, inplace=True)
  df = df.drop(["Unnamed: 0", 'SID', 'NAME','COORDS', 'YEAR',"TOTAL_DAMAGE_(000$)",'CPI','64KN_ASSETS', '34KN_ASSETS','96KN_ASSETS'], axis=1)
  df = df.dropna()
  return df

In [0]:
def preprocessing(df):
   df = process_month(df)
   df = process_emdat(df)
   df = process_total_damage(df)
   df = process_income_final(df)
   df = process_missing_data(df)
   df = drop_columns(df)
   
   categorical_columns = df.dtypes[df.dtypes == 'object'].index
   for col in categorical_columns:
     df = create_dummies(df,col)
     df = df.drop(col, axis =1)
  
   return df


In [0]:
df1 = preprocessing(df1)

In [49]:
df1.shape

(434, 92)

## Model

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [0]:
X = df1.drop(['TOTAL_AFFECTED'], axis = 1)
y = df1['TOTAL_AFFECTED']

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 101)

In [0]:
scaler = MinMaxScaler()
X_train =  scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


In [55]:
rfr = RandomForestRegressor(random_state= 101)

rfr.fit(X_train,y_train)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=101, verbose=0, warm_start=False)

In [0]:
predictions = rfr.predict(X_test)

## Model Evaluation

In [57]:
from sklearn.metrics import r2_score, explained_variance_score

score = rfr.score(X_train,y_train)
def get_results(predictions, y_test, score):
 
  mse = mean_squared_error(y_test, predictions)
  rmse = np.sqrt(mse)
  print(f'Root Mean Squared Error: {rmse:.2f}')
  r2 = r2_score(y_test, predictions)
  print(f'Variance Score: {r2:.3f}')
  explained_variance = explained_variance_score(y_test, predictions)
  print(f'Explained Variance Score: {explained_variance:.3f}')
  print(f'R^2 Prediction Score: {score:.4f}')

get_results(predictions, y_test, score)

Root Mean Squared Error: 3814570.80
Variance Score: -2.805
Explained Variance Score: -0.299
R^2 Prediction Score: 0.9021


# Random Forest Model using feature ranking with recursive feature elimination and cross-validated selection (RFECV)

In [0]:
from sklearn.feature_selection import RFECV

In [0]:
def select_features(df):
    # Remove non-numeric columns, columns that have null values
    df = df.select_dtypes([np.number]).dropna(axis=1)
    all_X = df.drop(['TOTAL_AFFECTED'],axis=1)
    all_y = df["TOTAL_AFFECTED"]
    
    clf = RandomForestRegressor(random_state=1)
    selector = RFECV(clf,cv=10)
    selector.fit(all_X,all_y)
    
    best_columns = list(all_X.columns[selector.support_])
    print("Best Columns \n"+"-"*12+"\n{}\n".format(best_columns))
    
    return best_columns

cols = select_features(df1)


Best Columns 
------------
['TOTAL_HRS', 'NIGHT_HRS', 'WIND_CALC_MEAN', 'PRES_CALC_MEAN', 'STORM_SPD_MEAN', 'STORM_DR_MEAN', 'V_LAND_KN', '34KN_POP', '96KN_POP', 'TOTAL_DEATHS', 'Arable land (hectares per person)', 'Cereal yield (kg per hectare)', 'Food production index (2004-2006 = 100)', 'GDP growth (annual %)', 'Net flows from UN agencies US$', 'Mobile cellular subscriptions (per 100 people)', 'Population density (people per sq. km of land area)', 'Adjusted savings: education expenditure (% of GNI)', 'Rural population (% of total population)', 'pop_max_34', 'pop_max_50', 'pop_max_64', 'pop_max_34_adj', 'pop_max_50_adj', 'pop_max_64_adj', 'Month', 'CALC_assets_emdat', 'TOTAL_DAMAGE_Adjusted', 'ISO_CHN', 'ISO_IND']



In [0]:
from sklearn.model_selection import GridSearchCV

def select_model(df,features):
    
    all_X = df[features]
    all_y = df["TOTAL_AFFECTED"]

    # List of dictionaries, each containing a model name,
    # it's estimator and a dict of hyperparameters
    models = [
        
        {
            "name": "RandomForestRegressor",
            "estimator": RandomForestRegressor(random_state=1),
            "hyperparameters":
                {
                    "n_estimators": [100, 300,500, 700, 1000],
                    "criterion": ["mse"],
                    "max_depth": [2, 5, 10, 12],
                    "max_features": ["log2", "sqrt"],
                    "min_samples_leaf": [1, 5, 8],
                    "min_samples_split": [2, 3, 5]

                }
        }
    ]

    for model in models:
        print(model['name'])
        print('-'*len(model['name']))

        grid = GridSearchCV(model["estimator"],
                            param_grid=model["hyperparameters"],
                            cv=10)
        grid.fit(all_X,all_y)
        model["best_params"] = grid.best_params_
        model["best_score"] = grid.best_score_
        model["best_model"] = grid.best_estimator_

        print("Best Score: {}".format(model["best_score"]))
        print("Best Parameters: {}\n".format(model["best_params"]))

    return models

result = select_model(df1, df1.columns)

RandomForestRegressor
---------------------
Best Score: 0.5094916835195316
Best Parameters: {'criterion': 'mse', 'max_depth': 12, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 300}



In [0]:
df1[cols]

Unnamed: 0,TOTAL_HRS,NIGHT_HRS,WIND_CALC_MEAN,PRES_CALC_MEAN,STORM_SPD_MEAN,STORM_DR_MEAN,V_LAND_KN,34KN_POP,96KN_POP,TOTAL_DEATHS,Arable land (hectares per person),Cereal yield (kg per hectare),Food production index (2004-2006 = 100),GDP growth (annual %),Net flows from UN agencies US$,Mobile cellular subscriptions (per 100 people),Population density (people per sq. km of land area),Adjusted savings: education expenditure (% of GNI),Rural population (% of total population),pop_max_34,pop_max_50,pop_max_64,pop_max_34_adj,pop_max_50_adj,pop_max_64_adj,Month,CALC_assets_emdat,TOTAL_DAMAGE_Adjusted,ISO_CHN,ISO_IND
46,21,21,106.551282,933.961538,11.076923,289.846154,152.4,45155148.0,2057152.0,137.0,0.130434,1350.3,34.36,3.764605,4.660000e+06,0.000000,120.041554,2.757732,67.021,13092821.0,3169652.25,1.272250e+06,6.010508e+06,1.455089e+06,5.840507e+05,9,4.708724e+11,154.941922,0,0
47,0,0,99.133333,937.875000,16.100000,273.800000,138.2,47135727.0,18280574.0,786.0,0.130434,1350.3,34.36,3.764605,4.660000e+06,0.000000,120.041554,2.757732,67.021,41400620.0,27055222.00,1.905231e+07,1.900574e+07,1.242021e+07,8.746325e+06,11,8.201000e+11,6430.517768,0,0
49,9,9,72.000000,972.115385,14.230769,285.461538,118.1,47513577.0,5038016.0,57.0,0.126026,1272.2,35.81,5.428631,7.110000e+06,0.000000,123.548833,2.800000,66.507,39594444.0,9378618.00,4.173788e+06,1.870765e+07,4.431225e+06,1.972038e+06,5,6.905024e+11,37.251579,0,0
52,18,18,53.162500,986.468750,9.083333,188.958333,109.0,25294940.0,1368797.0,204.0,0.123934,1163.9,35.26,5.446791,6.160000e+06,0.000000,127.148783,2.205623,65.988,22292644.0,7751457.50,3.707665e+06,1.083977e+07,3.769137e+06,1.802848e+06,1,3.249557e+11,1406.166674,0,0
54,6,2,65.064103,981.730769,16.076923,299.000000,100.5,54881884.0,118953.0,131.0,0.123934,1163.9,35.26,5.446791,6.160000e+06,0.000000,127.148783,2.205623,65.988,47985888.0,36608820.00,2.840762e+07,2.333308e+07,1.780099e+07,1.381317e+07,6,7.725440e+11,917.065222,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,6,5,32.059524,999.791667,13.642857,294.000000,39.2,1839116.0,0.0,2.0,0.054743,3556.1,116.33,6.066549,1.968888e+07,115.399439,342.466419,1.840000,53.716,37431028.0,8070387.50,3.796552e+06,3.743103e+07,8.070388e+06,3.796552e+06,10,2.147252e+10,14.022398,0,0
883,21,9,71.403846,966.884615,10.615385,313.923077,115.8,50063724.0,4139582.0,20.0,0.086784,5896.2,135.91,6.905317,5.549183e+07,91.835388,146.057642,1.790000,44.500,63259936.0,39423328.00,1.688337e+07,6.325994e+07,3.942333e+07,1.688337e+07,10,1.694590e+12,45303.131998,1,0
884,12,12,62.467742,972.934896,5.000000,166.718750,126.5,47240917.0,2604107.0,51.0,0.054743,3556.1,116.33,6.066549,1.968888e+07,115.399439,342.466419,1.840000,53.716,49231756.0,17767242.00,9.209509e+06,4.923176e+07,1.776724e+07,9.209509e+06,10,5.354291e+11,2275.781263,0,0
885,21,21,114.875000,935.375000,17.250000,23.875000,151.6,12359952.0,508252.0,14.0,0.188957,3470.1,119.88,3.287992,6.867377e+06,88.371756,62.685901,5.212839,20.715,11158190.0,1719007.25,7.712089e+05,1.115819e+07,1.719007e+06,7.712089e+05,10,3.431518e+11,8877.256580,0,0


## Model

In [0]:
X_best_model = df1[cols]
y_best_model = df1['TOTAL_AFFECTED']

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X_best_model, y_best_model, test_size = 0.25)

In [0]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [0]:
rfc_best_model = RandomForestRegressor(criterion = 'mse', max_depth =  12, max_features ='sqrt', min_samples_leaf = 1, min_samples_split = 3, n_estimators= 300)
rfc_best_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=12, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=3, min_weight_fraction_leaf=0.0,
                      n_estimators=300, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

## Model Evaluation

In [0]:
predictions = rfc_best_model.predict(X_test)
rfc_best_model_score = rfc_best_model.score(X_train, y_train)
get_results(predictions, y_test, rfc_best_model_score)

Root Mean Squared Error: 2812870.88
Variance Score: -1.901
Explained Variance Score: -0.392
R^2 Prediction Score: 0.8840


#Random Forest with Boruta Model Selection

For more info about boruta: [Boruta](http://danielhomola.com/2015/05/08/borutapy-an-all-relevant-feature-selection-method/)

In [0]:
df_boruta = pd.read_csv('/content/OUTPUT_WBI_exposer_cyclones_v10.csv')

In [0]:
pip install Boruta

Collecting Boruta
[?25l  Downloading https://files.pythonhosted.org/packages/b2/11/583f4eac99d802c79af9217e1eff56027742a69e6c866b295cce6a5a8fc2/Boruta-0.3-py3-none-any.whl (56kB)
[K     |█████▉                          | 10kB 16.7MB/s eta 0:00:01[K     |███████████▋                    | 20kB 3.1MB/s eta 0:00:01[K     |█████████████████▍              | 30kB 3.7MB/s eta 0:00:01[K     |███████████████████████▏        | 40kB 3.0MB/s eta 0:00:01[K     |█████████████████████████████   | 51kB 3.3MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.8MB/s 
Installing collected packages: Boruta
Successfully installed Boruta-0.3


## Boruta Feature Selection

In [0]:
df_boruta = df_boruta.dropna()

In [0]:
df_boruta = pd.get_dummies(df_boruta,drop_first=True, dummy_na=True)

In [0]:
features = [f for f in df_boruta.columns if f not in ['TOTAL_AFFECTED']]
len(features)

1660

In [0]:
df_boruta.shape

(439, 1661)

In [0]:
X_boruta = df_boruta.drop('TOTAL_AFFECTED', axis = 1).values
y_boruta = df_boruta['TOTAL_AFFECTED'].values

In [0]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy  
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
boruta_feature_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=4242, max_iter = 50, perc = 90)
boruta_feature_selector.fit(X_boruta, y_boruta)

Iteration: 	1 / 50
Confirmed: 	0
Tentative: 	1660
Rejected: 	0
Iteration: 	2 / 50
Confirmed: 	0
Tentative: 	1660
Rejected: 	0
Iteration: 	3 / 50
Confirmed: 	0
Tentative: 	1660
Rejected: 	0
Iteration: 	4 / 50
Confirmed: 	0
Tentative: 	1660
Rejected: 	0
Iteration: 	5 / 50
Confirmed: 	0
Tentative: 	1660
Rejected: 	0
Iteration: 	6 / 50
Confirmed: 	0
Tentative: 	1660
Rejected: 	0
Iteration: 	7 / 50
Confirmed: 	0
Tentative: 	1660
Rejected: 	0
Iteration: 	8 / 50
Confirmed: 	0
Tentative: 	789
Rejected: 	871
Iteration: 	9 / 50
Confirmed: 	31
Tentative: 	758
Rejected: 	871
Iteration: 	10 / 50
Confirmed: 	31
Tentative: 	758
Rejected: 	871
Iteration: 	11 / 50
Confirmed: 	31
Tentative: 	758
Rejected: 	871
Iteration: 	12 / 50
Confirmed: 	31
Tentative: 	378
Rejected: 	1251
Iteration: 	13 / 50
Confirmed: 	33
Tentative: 	376
Rejected: 	1251
Iteration: 	14 / 50
Confirmed: 	33
Tentative: 	376
Rejected: 	1251
Iteration: 	15 / 50
Confirmed: 	33
Tentative: 	376
Rejected: 	1251
Iteration: 	16 / 50
Confirmed:

BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                          class_weight='balanced',
                                          criterion='gini', max_depth=5,
                                          max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=164, n_jobs=-1,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x7F4E633CD888,
                                          verbose=0, warm_start=

In [0]:
X_filtered = boruta_feature_selector.transform(X_boruta)
X_filtered.shape

(439, 33)

In [0]:
final_features = list()
indexes = np.where(boruta_feature_selector.support_ == True)

for x in np.nditer(indexes):
    final_features.append(features[x])
final_features

['Unnamed: 0',
 'YEAR',
 'WIND_CALC_MEAN',
 'PRES_CALC_MEAN',
 'STORM_SPD_MEAN',
 'STORM_DR_MEAN',
 'V_LAND_KN',
 '34KN_POP',
 '34KN_ASSETS',
 '64KN_POP',
 '64KN_ASSETS',
 '96KN_POP',
 'CPI',
 'TOTAL_DAMAGE_(000$)',
 'TOTAL_DEATHS',
 'Air transport, freight (million ton-km)',
 'Arable land (hectares per person)',
 'Cereal yield (kg per hectare)',
 'Food production index (2004-2006 = 100)',
 'GDP growth (annual %)',
 'GDP per capita (constant 2010 US$)',
 'Net flows from UN agencies US$',
 'Life expectancy at birth, total (years)',
 'Mobile cellular subscriptions (per 100 people)',
 'Population density (people per sq. km of land area)',
 'Adjusted savings: education expenditure (% of GNI)',
 'Rural population (% of total population)',
 'pop_max_34',
 'pop_max_50',
 'pop_max_64',
 'pop_max_34_adj',
 'pop_max_50_adj',
 'pop_max_64_adj']

In [0]:
df_boruta_final = df_boruta[final_features]
df_boruta_final = pd.concat([df_boruta_final, df_boruta['TOTAL_AFFECTED']], axis =1)


In [0]:
df_boruta_final.columns

Index(['Unnamed: 0', 'YEAR', 'WIND_CALC_MEAN', 'PRES_CALC_MEAN',
       'STORM_SPD_MEAN', 'STORM_DR_MEAN', 'V_LAND_KN', '34KN_POP',
       '34KN_ASSETS', '64KN_POP', '64KN_ASSETS', '96KN_POP', 'CPI',
       'TOTAL_DAMAGE_(000$)', 'TOTAL_DEATHS',
       'Air transport, freight (million ton-km)',
       'Arable land (hectares per person)', 'Cereal yield (kg per hectare)',
       'Food production index (2004-2006 = 100)', 'GDP growth (annual %)',
       'GDP per capita (constant 2010 US$)', 'Net flows from UN agencies US$',
       'Life expectancy at birth, total (years)',
       'Mobile cellular subscriptions (per 100 people)',
       'Population density (people per sq. km of land area)',
       'Adjusted savings: education expenditure (% of GNI)',
       'Rural population (% of total population)', 'pop_max_34', 'pop_max_50',
       'pop_max_64', 'pop_max_34_adj', 'pop_max_50_adj', 'pop_max_64_adj',
       'TOTAL_AFFECTED'],
      dtype='object')

In [0]:

df_boruta_final = process_total_damage(df_boruta_final)

In [0]:
df_boruta_final = df_boruta_final.drop(['Unnamed: 0', 'TOTAL_DAMAGE_(000$)', 'CPI','YEAR'], axis = 1)

## Model

In [0]:
X_boruta = df_boruta_final.drop(['TOTAL_AFFECTED'], axis = 1)
y_boruta = df_boruta_final['TOTAL_AFFECTED']

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X_boruta, y_boruta, test_size = 0.25)

In [0]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [0]:
rfr_boruta = RandomForestRegressor(criterion = 'mse', max_depth =  12, max_features ='sqrt', min_samples_leaf = 1, min_samples_split = 3, n_estimators= 300)
rfr_boruta.fit(X_train, y_train)
rfr_boruta_score = rfr_boruta.score(X_train,y_train)

## Model Evaluation

In [0]:
predictions = rfr_boruta.predict(X_test)
get_results(predictions, y_test, rfr_boruta_score)

Root Mean Squared Error: 2775910.84
Variance Score: 0.113
Explained Variance Score: 0.126
R^2 Prediction Score: 0.8832
