In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_score, cross_val_predict
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# data

# import data
df = pd.read_csv('../../#task1-datacollection/OUTPUT_WBI_exposer_cyclones_v3.csv')
df.columns = [e.lower().replace(' ','_') for e in df.columns]

# separate X and y
X = df.drop(['total_affected'], axis=1)
y = df['total_affected']

# types of features
num_features = list(df.loc[:, df.dtypes != object].columns)
cat_features = list(df.loc[:, df.dtypes == object].columns)

In [3]:
# custom transformers

class DataFrameSelector(BaseEstimator, TransformerMixin):
    # selects a slice of the dataset based on feature_names
    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[set(X.columns).intersection(self.feature_names)]


class TransformerCat(BaseEstimator, TransformerMixin):
    # transforms categorical features
    
    def __init__(self):
        self.columns_to_drop = []

    def fit(self, X, y=None):
        self.columns_to_drop= ['sid', 'name', 'iso', 'sub_basin', 'nature', 'iso_time',
       'coords', 'in_susan', 'year_tiff_c']
        return self
    
    def _get_income_level(self, income):
        order = {'High': 4, 'High_Middle': 3, 'Low_Middle': 2, 'Low': 1 }
        return order[income]
        
    def transform(self, X, y=None):
        X['income_level_final'] = X['income_level_final'].apply(self._get_income_level)
        X = X.drop(self.columns_to_drop, axis = 1)
        print(X.columns)
        return X
    
class TransformerNum(BaseEstimator, TransformerMixin):
    # transforms numerical features
    
    def __init__(self):
        self.columns_to_drop = []

    def fit(self, X, y=None):
        self.columns_to_drop = ['unnamed:_0', 'year', 'total_hrs', 'day_hrs', 'night_hrs',
                                '34kn_pop', '34kn_assets', '64kn_pop', '64kn_assets','total_damage_(000$)', 'total_deaths',
                               'air_transport,_freight_(million_ton-km)',
                               'arable_land_(hectares_per_person)', 'cereal_yield_(kg_per_hectare)',
                               'food_production_index_(2004-2006_=_100)', 'gdp_growth_(annual_%)','net_flows_from_un_agencies_us$',
                               'mobile_cellular_subscriptions_(per_100_people)',
                               'adjusted_savings:_education_expenditure_(%_of_gni)',
                               'population_2000', 'population_2005',
                               'population_2010', 'population_2015', 'population_2020', 'in_wbi',
                               'pop_max_34', 'pop_max_50', 'pop_max_64', 'numeric', 'pop_max_34_adj',
                               'pop_max_50_adj', 'year_tiff', 'coef_year']
        return self
        
    def transform(self, X, y=None):
        X = X.drop(self.columns_to_drop, axis = 1)
        return X


In [4]:
# preprocess pipeline

preprocess_pipeline = FeatureUnion(transformer_list =[
        ("num_pipeline", Pipeline([
            ('selector', DataFrameSelector(num_features)),
            ('num_transformer', TransformerNum()),
            ('imputer', SimpleImputer(strategy="most_frequent")),
            ("scaler", StandardScaler())
             ])),
        ("cat_pipeline", Pipeline([
            ('selector', DataFrameSelector(cat_features)),
            ('cat_transformer', TransformerCat()),
            ('imputer', SimpleImputer(strategy="most_frequent")),
            ('encoder', OneHotEncoder(handle_unknown='ignore', sparse = False)), # TODO: not encode income_level_final
            ("scaler", StandardScaler())
            ]))])

In [5]:
class ModelEvaluator:
    # evaluates different models using cross validation on the whole dataset
    
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def evaluate_models(self, models):
        for model_name, model in models.items():
            print(f'***{model_name}***')
            self._evaluate_model(model_name, model)
            print('\n')
            
    def _display_scores(self, scores):
            print("Scores:", scores)
            print("Mean:", scores.mean())
            print("Standard deviation:", scores.std())

    def _evaluate_model(self, model_name, model):
        transformed_X = preprocess_pipeline.fit_transform(self.X)
        scores = cross_val_score(model, transformed_X, self.y, scoring="neg_mean_squared_error", cv=10)
        tree_rmse_scores = np.sqrt(-scores)
        self._display_scores(tree_rmse_scores)     

In [6]:
# grid search random_forest

param_grid = [
    {'n_estimators': list(range(10,200,30)), 'max_features': list(range(2, 10, 2)), 'min_samples_leaf':[2,4,6]},
    {'bootstrap': [False], 'n_estimators': list(range(10,200,30)), 'max_features': list(range(2, 10, 2)), 'min_samples_leaf':[2,4,6]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
grid_search_forest = GridSearchCV(forest_reg, param_grid, cv=6, scoring='neg_mean_squared_error', return_train_score=True, iid = True, verbose=20)
transformed_X = preprocess_pipeline.fit_transform(X)
grid_search_forest.fit(transformed_X, y)

print(grid_search_forest.score(transformed_X, y))
print(grid_search_forest.best_params_)

Index(['basin', 'income_level_final'], dtype='object')
Fitting 6 folds for each of 168 candidates, totalling 1008 fits
[CV] max_features=2, min_samples_leaf=2, n_estimators=10 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=10, score=(train=-1952890264403.983, test=-1122379331860.334), total=   0.0s
[CV] max_features=2, min_samples_leaf=2, n_estimators=10 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=10, score=(train=-1944632713548.494, test=-1832726104284.906), total=   0.0s
[CV] max_features=2, min_samples_leaf=2, n_estimators=10 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=10, score=(train=-1879659145978.595, test=-2457356143563.456), total=   0.0s
[CV] max_features=2, min_samples_leaf=2, n_estimators=10 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=10, score=(train=-1086248730179.304, test=-8229227963929.140), total=   0.0s
[CV] max_features=2, min_samples_leaf=2, n_estimators=10 ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.1s remaining:    0.0s


[CV]  max_features=2, min_samples_leaf=2, n_estimators=40, score=(train=-1767884808753.595, test=-1681952989201.200), total=   0.0s
[CV] max_features=2, min_samples_leaf=2, n_estimators=40 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=40, score=(train=-1688731278558.806, test=-2441055404734.316), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=40 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=40, score=(train=-1128427219812.265, test=-8600899419567.726), total=   0.0s
[CV] max_features=2, min_samples_leaf=2, n_estimators=40 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=40, score=(train=-1589545226856.272, test=-3908364775429.756), total=   0.0s
[CV] max_features=2, min_samples_leaf=2, n_estimators=40 .............


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.4s remaining:    0.0s


[CV]  max_features=2, min_samples_leaf=2, n_estimators=40, score=(train=-1483251276207.388, test=-3665064595595.898), total=   0.0s
[CV] max_features=2, min_samples_leaf=2, n_estimators=70 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-1730355015845.103, test=-1076488848427.935), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=70 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-1665863728724.799, test=-1622484581601.118), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=70 .............


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.6s remaining:    0.0s


[CV]  max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-1588291888978.669, test=-2391840449995.520), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=70 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-1122390601946.421, test=-8464602011046.469), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=70 .............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-1552390370370.536, test=-3861773907660.551), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=70 .............


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  17 out of  17 | elapsed:    0.9s remaining:    0.0s


[CV]  max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-1486714889339.010, test=-3662254908615.454), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=100, score=(train=-1709346544767.101, test=-1044075228149.647), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=100 ............


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:    1.1s remaining:    0.0s


[CV]  max_features=2, min_samples_leaf=2, n_estimators=100, score=(train=-1661970066410.949, test=-1573560411650.314), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=100, score=(train=-1557963652878.364, test=-2398304337296.113), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=100, score=(train=-1109591099533.465, test=-8429300793628.446), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=100, score=(train=-1526508274321.714, test=-3889344993494.215), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=2, n_estimators=100, score=(train=-1463607210993.779, test=-3706780054157.798), total=   0.1s
[CV] max_features=2, min_samples_leaf=2, n_estimato

[CV]  max_features=2, min_samples_leaf=4, n_estimators=100, score=(train=-2417792650881.958, test=-1399383213277.201), total=   0.1s
[CV] max_features=2, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=4, n_estimators=100, score=(train=-2403072858717.921, test=-2318955920255.617), total=   0.1s
[CV] max_features=2, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=4, n_estimators=100, score=(train=-1621852155757.430, test=-8627027901563.070), total=   0.1s
[CV] max_features=2, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=4, n_estimators=100, score=(train=-2211558934877.422, test=-3878847137124.962), total=   0.1s
[CV] max_features=2, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=4, n_estimators=100, score=(train=-2151972517037.477, test=-3563026881090.458), total=   0.1s
[CV] max_features=2, min_samples_leaf=4, n_estimato

[CV]  max_features=2, min_samples_leaf=6, n_estimators=100, score=(train=-2843960851644.970, test=-1395672714858.333), total=   0.1s
[CV] max_features=2, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=6, n_estimators=100, score=(train=-2671973489120.159, test=-2260275691500.306), total=   0.1s
[CV] max_features=2, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=6, n_estimators=100, score=(train=-1840498754915.473, test=-8851009859747.971), total=   0.1s
[CV] max_features=2, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=6, n_estimators=100, score=(train=-2541650235373.897, test=-3801288799022.066), total=   0.1s
[CV] max_features=2, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=2, min_samples_leaf=6, n_estimators=100, score=(train=-2419876690682.226, test=-3636342824797.903), total=   0.1s
[CV] max_features=2, min_samples_leaf=6, n_estimato

[CV]  max_features=4, min_samples_leaf=2, n_estimators=100, score=(train=-1448878780448.907, test=-1041709391122.621), total=   0.1s
[CV] max_features=4, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=2, n_estimators=100, score=(train=-1401794732259.939, test=-1586580166752.289), total=   0.2s
[CV] max_features=4, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=2, n_estimators=100, score=(train=-1396575051967.877, test=-2406359932985.333), total=   0.2s
[CV] max_features=4, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=2, n_estimators=100, score=(train=-946806257770.698, test=-8293196600526.610), total=   0.1s
[CV] max_features=4, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=2, n_estimators=100, score=(train=-1330662015808.323, test=-3819788436537.795), total=   0.2s
[CV] max_features=4, min_samples_leaf=2, n_estimator

[CV]  max_features=4, min_samples_leaf=4, n_estimators=100, score=(train=-2224594214836.545, test=-985622796426.796), total=   0.1s
[CV] max_features=4, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=4, n_estimators=100, score=(train=-2193256577882.940, test=-1480348778883.030), total=   0.1s
[CV] max_features=4, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=4, n_estimators=100, score=(train=-2144453004650.992, test=-2518249596719.621), total=   0.1s
[CV] max_features=4, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=4, n_estimators=100, score=(train=-1474091154483.051, test=-8527420394687.578), total=   0.2s
[CV] max_features=4, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=4, n_estimators=100, score=(train=-2023625136739.962, test=-3790607344272.683), total=   0.1s
[CV] max_features=4, min_samples_leaf=4, n_estimator

[CV]  max_features=4, min_samples_leaf=6, n_estimators=100, score=(train=-2637748381973.717, test=-973199743789.019), total=   0.1s
[CV] max_features=4, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=6, n_estimators=100, score=(train=-2582536533236.917, test=-1387484887209.825), total=   0.1s
[CV] max_features=4, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=6, n_estimators=100, score=(train=-2496935973958.425, test=-2319679430471.216), total=   0.1s
[CV] max_features=4, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=6, n_estimators=100, score=(train=-1709331189646.137, test=-8531043731476.739), total=   0.1s
[CV] max_features=4, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=4, min_samples_leaf=6, n_estimators=100, score=(train=-2297641301822.558, test=-3772494716798.402), total=   0.1s
[CV] max_features=4, min_samples_leaf=6, n_estimator

[CV]  max_features=6, min_samples_leaf=2, n_estimators=100, score=(train=-1363069030541.381, test=-1094372635910.081), total=   0.2s
[CV] max_features=6, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=2, n_estimators=100, score=(train=-1330922553815.572, test=-1607022322805.733), total=   0.2s
[CV] max_features=6, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=2, n_estimators=100, score=(train=-1233994988001.359, test=-2588758668174.327), total=   0.2s
[CV] max_features=6, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=2, n_estimators=100, score=(train=-900804268148.013, test=-8263057045327.480), total=   0.2s
[CV] max_features=6, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=2, n_estimators=100, score=(train=-1210488360045.053, test=-3693992945182.945), total=   0.2s
[CV] max_features=6, min_samples_leaf=2, n_estimator

[CV]  max_features=6, min_samples_leaf=4, n_estimators=70, score=(train=-1838335022857.528, test=-3720855985928.991), total=   0.1s
[CV] max_features=6, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=4, n_estimators=100, score=(train=-2123610148737.669, test=-1013488452225.979), total=   0.2s
[CV] max_features=6, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=4, n_estimators=100, score=(train=-2076061135074.530, test=-1504400130304.856), total=   0.2s
[CV] max_features=6, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=4, n_estimators=100, score=(train=-2021612113425.299, test=-2610390047065.331), total=   0.2s
[CV] max_features=6, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=4, n_estimators=100, score=(train=-1414189143071.910, test=-8419079111351.784), total=   0.2s
[CV] max_features=6, min_samples_leaf=4, n_estimator

[CV]  max_features=6, min_samples_leaf=6, n_estimators=70, score=(train=-2236569136825.932, test=-3629065806069.650), total=   0.1s
[CV] max_features=6, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=6, n_estimators=100, score=(train=-2534567842945.490, test=-1018501567748.618), total=   0.1s
[CV] max_features=6, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=6, n_estimators=100, score=(train=-2460291212527.109, test=-1448959557106.729), total=   0.1s
[CV] max_features=6, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=6, n_estimators=100, score=(train=-2417544563970.157, test=-2375727704971.675), total=   0.2s
[CV] max_features=6, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=6, min_samples_leaf=6, n_estimators=100, score=(train=-1648726733453.746, test=-8465630360378.239), total=   0.1s
[CV] max_features=6, min_samples_leaf=6, n_estimator

[CV]  max_features=8, min_samples_leaf=2, n_estimators=70, score=(train=-1261528954757.387, test=-3754427450562.111), total=   0.2s
[CV] max_features=8, min_samples_leaf=2, n_estimators=70 .............
[CV]  max_features=8, min_samples_leaf=2, n_estimators=70, score=(train=-1100631591593.432, test=-3798457378894.153), total=   0.2s
[CV] max_features=8, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=8, min_samples_leaf=2, n_estimators=100, score=(train=-1291678350830.568, test=-1071276425825.496), total=   0.2s
[CV] max_features=8, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=8, min_samples_leaf=2, n_estimators=100, score=(train=-1292358036686.989, test=-1820438432383.042), total=   0.2s
[CV] max_features=8, min_samples_leaf=2, n_estimators=100 ............
[CV]  max_features=8, min_samples_leaf=2, n_estimators=100, score=(train=-1238001355728.716, test=-2852459893473.495), total=   0.2s
[CV] max_features=8, min_samples_leaf=2, n_estimators

[CV]  max_features=8, min_samples_leaf=4, n_estimators=70, score=(train=-1883726943397.418, test=-3802230247447.919), total=   0.1s
[CV] max_features=8, min_samples_leaf=4, n_estimators=70 .............
[CV]  max_features=8, min_samples_leaf=4, n_estimators=70, score=(train=-1833243571371.164, test=-3703382873161.473), total=   0.1s
[CV] max_features=8, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=8, min_samples_leaf=4, n_estimators=100, score=(train=-2097441325515.119, test=-1037169137673.126), total=   0.2s
[CV] max_features=8, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=8, min_samples_leaf=4, n_estimators=100, score=(train=-2021093630528.696, test=-1450886845619.485), total=   0.2s
[CV] max_features=8, min_samples_leaf=4, n_estimators=100 ............
[CV]  max_features=8, min_samples_leaf=4, n_estimators=100, score=(train=-1999042850327.583, test=-2608158926344.493), total=   0.2s
[CV] max_features=8, min_samples_leaf=4, n_estimators

[CV]  max_features=8, min_samples_leaf=6, n_estimators=70, score=(train=-1607886113404.703, test=-8324543794385.342), total=   0.1s
[CV] max_features=8, min_samples_leaf=6, n_estimators=70 .............
[CV]  max_features=8, min_samples_leaf=6, n_estimators=70, score=(train=-2188317003384.785, test=-3770831127781.552), total=   0.1s
[CV] max_features=8, min_samples_leaf=6, n_estimators=70 .............
[CV]  max_features=8, min_samples_leaf=6, n_estimators=70, score=(train=-2177916455192.976, test=-3783950085788.286), total=   0.1s
[CV] max_features=8, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=8, min_samples_leaf=6, n_estimators=100, score=(train=-2461384419501.377, test=-1050600086343.670), total=   0.2s
[CV] max_features=8, min_samples_leaf=6, n_estimators=100 ............
[CV]  max_features=8, min_samples_leaf=6, n_estimators=100, score=(train=-2437350080571.671, test=-1405169813532.335), total=   0.2s
[CV] max_features=8, min_samples_leaf=6, n_estimators=

[CV]  bootstrap=False, max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-860002742961.834, test=-1811463519816.276), total=   0.1s
[CV] bootstrap=False, max_features=2, min_samples_leaf=2, n_estimators=70 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-913472881376.016, test=-2499863462661.026), total=   0.1s
[CV] bootstrap=False, max_features=2, min_samples_leaf=2, n_estimators=70 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-546129575090.188, test=-8274508497514.452), total=   0.1s
[CV] bootstrap=False, max_features=2, min_samples_leaf=2, n_estimators=70 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-799724585834.410, test=-3890801398090.260), total=   0.1s
[CV] bootstrap=False, max_features=2, min_samples_leaf=2, n_estimators=70 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=2, n_estimators=70, score=(train=-729358981625.173, 

[CV]  bootstrap=False, max_features=2, min_samples_leaf=4, n_estimators=40, score=(train=-1914409076873.927, test=-2326848819838.597), total=   0.0s
[CV] bootstrap=False, max_features=2, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=4, n_estimators=40, score=(train=-1251817324845.748, test=-8546507127866.162), total=   0.0s
[CV] bootstrap=False, max_features=2, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=4, n_estimators=40, score=(train=-1629013360463.336, test=-4048255927454.941), total=   0.0s
[CV] bootstrap=False, max_features=2, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=4, n_estimators=40, score=(train=-1643552773824.331, test=-3852215568654.701), total=   0.0s
[CV] bootstrap=False, max_features=2, min_samples_leaf=4, n_estimators=70 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=4, n_estimators=70, score=(train=-1947191488087.

[CV]  bootstrap=False, max_features=2, min_samples_leaf=6, n_estimators=40, score=(train=-2344886556919.576, test=-1505532845001.763), total=   0.0s
[CV] bootstrap=False, max_features=2, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=6, n_estimators=40, score=(train=-2290944730972.734, test=-2406951396826.936), total=   0.0s
[CV] bootstrap=False, max_features=2, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=6, n_estimators=40, score=(train=-1531515406188.337, test=-8603197828655.758), total=   0.0s
[CV] bootstrap=False, max_features=2, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=6, n_estimators=40, score=(train=-2050318910515.547, test=-3955318537865.388), total=   0.0s
[CV] bootstrap=False, max_features=2, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=2, min_samples_leaf=6, n_estimators=40, score=(train=-2033942772834.

[CV]  bootstrap=False, max_features=4, min_samples_leaf=2, n_estimators=40, score=(train=-663320297531.625, test=-1058914301299.738), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=2, n_estimators=40, score=(train=-671471669742.148, test=-1780248164994.506), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=2, n_estimators=40, score=(train=-605902569774.245, test=-2750654804121.766), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=2, n_estimators=40, score=(train=-366485052040.949, test=-8328413405861.141), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=2, n_estimators=40, score=(train=-611864000360.609, 

[CV]  bootstrap=False, max_features=4, min_samples_leaf=4, n_estimators=40, score=(train=-1584643622552.916, test=-1557506660060.229), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=4, n_estimators=40, score=(train=-1605165532815.251, test=-2493545822704.443), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=4, n_estimators=40, score=(train=-1038273558155.826, test=-8633242215697.062), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=4, n_estimators=40, score=(train=-1502096402655.057, test=-3831584734779.017), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=4, n_estimators=40, score=(train=-1351038838824.

[CV]  bootstrap=False, max_features=4, min_samples_leaf=6, n_estimators=40, score=(train=-2039924960558.292, test=-1570558253299.258), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=6, n_estimators=40, score=(train=-1985128115985.432, test=-2503221852507.694), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=6, n_estimators=40, score=(train=-1368669732688.168, test=-8495516152340.548), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=6, n_estimators=40, score=(train=-1860632581810.414, test=-3887626066687.228), total=   0.1s
[CV] bootstrap=False, max_features=4, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=4, min_samples_leaf=6, n_estimators=40, score=(train=-1789164864927.

[CV]  bootstrap=False, max_features=6, min_samples_leaf=2, n_estimators=40, score=(train=-530243839628.847, test=-1027099488277.699), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=2, n_estimators=40, score=(train=-526939259431.727, test=-2344402113191.705), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=2, n_estimators=40, score=(train=-538045153600.515, test=-3019503899037.273), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=2, n_estimators=40, score=(train=-341669643205.870, test=-8598041977317.677), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=2, n_estimators=40, score=(train=-553013439955.069, 

[CV]  bootstrap=False, max_features=6, min_samples_leaf=4, n_estimators=40, score=(train=-1527557778396.804, test=-1009539593871.175), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=4, n_estimators=40, score=(train=-1392593253674.656, test=-2087735419336.156), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=4, n_estimators=40, score=(train=-1499332719311.298, test=-2781101251719.851), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=4, n_estimators=40, score=(train=-970561319883.762, test=-8469160416937.726), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=4, n_estimators=40, score=(train=-1375209958527.2

[CV]  bootstrap=False, max_features=6, min_samples_leaf=6, n_estimators=40, score=(train=-1981820391872.808, test=-998705679800.179), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=6, n_estimators=40, score=(train=-1912577848534.535, test=-1834623603337.135), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=6, n_estimators=40, score=(train=-1883189797725.898, test=-2663898139700.805), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=6, n_estimators=40, score=(train=-1294490603203.470, test=-8550753072588.786), total=   0.1s
[CV] bootstrap=False, max_features=6, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=6, min_samples_leaf=6, n_estimators=40, score=(train=-1778192485594.9

[CV]  bootstrap=False, max_features=8, min_samples_leaf=2, n_estimators=10, score=(train=-478652247562.176, test=-3988692618537.810), total=   0.0s
[CV] bootstrap=False, max_features=8, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=2, n_estimators=40, score=(train=-546557866521.167, test=-1125914845910.736), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=2, n_estimators=40, score=(train=-469523163475.596, test=-2944658904806.251), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=2, n_estimators=40, score=(train=-518800684175.856, test=-3048668261622.874), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=2, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=2, n_estimators=40, score=(train=-328660786418.026, 

[CV]  bootstrap=False, max_features=8, min_samples_leaf=4, n_estimators=40, score=(train=-1432822905039.776, test=-1058387126735.655), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=4, n_estimators=40, score=(train=-1411760244760.959, test=-2242332955850.671), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=4, n_estimators=40, score=(train=-1395601525372.501, test=-3068639049412.573), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=4, n_estimators=40, score=(train=-891476812749.976, test=-8406139546308.864), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=4, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=4, n_estimators=40, score=(train=-1320157924129.5

[CV]  bootstrap=False, max_features=8, min_samples_leaf=6, n_estimators=40, score=(train=-1928873966108.976, test=-1023032543533.223), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=6, n_estimators=40, score=(train=-1830912659236.188, test=-2206688980682.568), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=6, n_estimators=40, score=(train=-1882925873192.715, test=-2877501532093.169), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=6, n_estimators=40, score=(train=-1260995829338.335, test=-8841054320640.588), total=   0.1s
[CV] bootstrap=False, max_features=8, min_samples_leaf=6, n_estimators=40 
[CV]  bootstrap=False, max_features=8, min_samples_leaf=6, n_estimators=40, score=(train=-1741530674374.

[Parallel(n_jobs=1)]: Done 1008 out of 1008 | elapsed:  3.3min finished


In [7]:
# evaluate models 

models = {'forest_grid_search': grid_search_forest.best_estimator_, 
          'forest_1': RandomForestRegressor(n_estimators=3, random_state=42), 
          'forest_2':RandomForestRegressor(max_depth=5, n_estimators=10, max_features=9, random_state=42)}

random_forest_evaluator = ModelEvaluator(X, y)

random_forest_evaluator.evaluate_models(models)

***forest_grid_search***
Index(['basin', 'income_level_final'], dtype='object')
Scores: [ 803822.23350209 1166260.83138995 1371370.6594198  1607944.87480684
 1170160.25182717 3770440.35286206 1504366.02073208 1092923.55546392
 2219815.69575327 2188448.69041575]
Mean: 1689555.3166172914
Standard deviation: 817707.2166923254


***forest_1***
Index(['basin', 'income_level_final'], dtype='object')
Scores: [1285644.09901694 1191174.46678079 2290930.60438599 2079677.4321966
 2179793.19728161 4057866.37765171 1328287.50550625 1351688.90955074
 2199649.19200848 3156808.39235801]
Mean: 2112152.017673712
Standard deviation: 874438.1898887912


***forest_2***
Index(['basin', 'income_level_final'], dtype='object')
Scores: [1251088.96066901 1226099.07708944 1522147.21741576 1751063.81345957
 1241616.0936574  3804063.29749966 1652006.29579458 1457939.5109553
 2309262.59423097 2335759.9899946 ]
Mean: 1855104.6850766286
Standard deviation: 754536.5606148882


