In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_score, cross_val_predict
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# data

# import data
df = pd.read_csv('../../#task1-datacollection/OUTPUT_WBI_exposer_cyclones_v3.csv')
df.columns = [e.lower().replace(' ','_') for e in df.columns]

# separate X and y
X = df.drop(['total_affected'], axis=1)
y = df['total_affected']

# types of features
num_features = list(df.loc[:, df.dtypes != object].columns)
cat_features = list(df.loc[:, df.dtypes == object].columns)

In [3]:
# custom transformers

class DataFrameSelector(BaseEstimator, TransformerMixin):
    # selects a slice of the dataset based on feature_names
    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[set(X.columns).intersection(self.feature_names)]


class TransformerCat(BaseEstimator, TransformerMixin):
    # transforms categorical features
    
    def __init__(self):
        self.columns_to_drop = []

    def fit(self, X, y=None):
        self.columns_to_drop= ['sid', 'name', 'iso', 'sub_basin', 'nature', 'iso_time',
       'coords', 'in_susan', 'year_tiff_c']
        return self
    
    def _get_income_level(self, income):
        order = {'High': 4, 'High_Middle': 3, 'Low_Middle': 2, 'Low': 1 }
        return order[income]
        
    def transform(self, X, y=None):
        X['income_level_final'] = X['income_level_final'].apply(self._get_income_level)
        X = X.drop(self.columns_to_drop, axis = 1)
        return X
    
class TransformerNum(BaseEstimator, TransformerMixin):
    # transforms numerical features
    
    def __init__(self):
        self.columns_to_drop = []

    def fit(self, X, y=None):
        self.columns_to_drop = ['unnamed:_0', 'year', 'total_hrs', 'day_hrs', 'night_hrs',
                                '34kn_pop', '34kn_assets', '64kn_pop', '64kn_assets','total_damage_(000$)', 'total_deaths',
                               'air_transport,_freight_(million_ton-km)',
                               'arable_land_(hectares_per_person)', 'cereal_yield_(kg_per_hectare)',
                               'food_production_index_(2004-2006_=_100)', 'gdp_growth_(annual_%)','net_flows_from_un_agencies_us$',
                               'mobile_cellular_subscriptions_(per_100_people)',
                               'adjusted_savings:_education_expenditure_(%_of_gni)',
                               'population_2000', 'population_2005',
                               'population_2010', 'population_2015', 'population_2020', 'in_wbi',
                               'pop_max_34', 'pop_max_50', 'pop_max_64', 'numeric', 'pop_max_34_adj',
                               'pop_max_50_adj', 'year_tiff', 'coef_year']
        return self
        
    def transform(self, X, y=None):
        X = X.drop(self.columns_to_drop, axis = 1)
        return X

In [4]:
# preprocess pipeline

preprocess_pipeline = FeatureUnion(transformer_list =[
        ("num_pipeline", Pipeline([
            ('selector', DataFrameSelector(num_features)),
            ('num_transformer', TransformerNum()),
            ('imputer', SimpleImputer(strategy="most_frequent")),
            ("scaler", StandardScaler())
             ])),
        ("cat_pipeline", Pipeline([
            ('selector', DataFrameSelector(cat_features)),
            ('cat_transformer', TransformerCat()),
            ('imputer', SimpleImputer(strategy="most_frequent")),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse = False)), # TODO: not encode income_level_final
            ("scaler", StandardScaler())
            ]))])

In [5]:
class ModelEvaluator:
    # evaluates different models using cross validation on the whole dataset
    
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def evaluate_models(self, models):
        for model_name, model in models.items():
            print(f'***{model_name}***')
            self._evaluate_model(model_name, model)
            print('\n')
            
    def _display_scores(self, scores):
            print("Scores:", scores)
            print("Mean:", scores.mean())
            print("Standard deviation:", scores.std())

    def _evaluate_model(self, model_name, model):
        transformed_X = preprocess_pipeline.fit_transform(self.X)
        scores = cross_val_score(model, transformed_X, self.y, scoring="neg_mean_squared_error", cv=10)
        tree_rmse_scores = np.sqrt(-scores)
        self._display_scores(tree_rmse_scores)     

In [6]:
# grid search random_forest

param_grid = [
    {'n_estimators': list(range(10,200,30)), 'max_features': list(range(2, 10, 2)), 'min_samples_leaf':[2,4,6]},
    {'bootstrap': [False], 'n_estimators': list(range(10,200,30)), 'max_features': list(range(2, 10, 2)), 'min_samples_leaf':[2,4,6]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
grid_search_forest = GridSearchCV(forest_reg, 
                                  param_grid, 
                                  cv=6, 
                                  scoring='neg_mean_squared_error', 
                                  return_train_score=True, 
                                  iid = True, 
                                  verbose=1) # higher verbose desactivated for Github visualization purposes
transformed_X = preprocess_pipeline.fit_transform(X)
grid_search_forest.fit(transformed_X, y)

print(grid_search_forest.score(transformed_X, y))
print(grid_search_forest.best_params_)

Fitting 6 folds for each of 168 candidates, totalling 1008 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


-2507844188752.267
{'max_features': 2, 'min_samples_leaf': 6, 'n_estimators': 10}


[Parallel(n_jobs=1)]: Done 1008 out of 1008 | elapsed:  3.2min finished


In [7]:
# evaluate models 

models = {'forest_grid_search': grid_search_forest.best_estimator_, 
          'forest_1': RandomForestRegressor(n_estimators=3, random_state=42), 
          'forest_2':RandomForestRegressor(max_depth=5, n_estimators=10, max_features=9, random_state=42)}

random_forest_evaluator = ModelEvaluator(X, y)

random_forest_evaluator.evaluate_models(models)

***forest_grid_search***
Scores: [ 801246.98124193 1192102.05630572 1415787.85084803 1649069.59585499
 1216158.82270432 3788041.75874604 1571856.45248941 1174365.29334794
 2206267.61955972 2158662.00958737]
Mean: 1717355.8440685451
Standard deviation: 806193.9304452471


***forest_1***
Scores: [1203935.6467219  1221914.54210311 1866030.4192242  2041583.1177857
 2446371.91911332 4023303.5768814  1727616.98042126 1375373.83548063
 2269022.99047438 3266021.47866986]
Mean: 2144117.450687577
Standard deviation: 865081.1179126535


***forest_2***
Scores: [1070183.89069505 1213526.78678144 1433558.66806751 1807821.43924724
 1229701.83592892 3792966.29749878 1380569.79691128 1163771.04404076
 2185099.52482229 2301506.80894139]
Mean: 1757870.609293467
Standard deviation: 791886.8804492861


