In [23]:
from sklearn.ensemble import RandomForestRegressor

import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

from pprint import pprint
from IPython.display import clear_output

In [2]:
# Directory holding the training data (absolute path specific to the author's machine).
data_path = "/Users/oliverpaul/Data_Science/idiap/lucideles/data"
# Read the whole training array from <data_path>/train.npy.
train_file = data_path + '/train.npy'
data = np.load(train_file)

In [3]:
# Every column except the last two is used as a model input; the last two columns
# are the regression targets. Copies are taken so neither array is a view of `data`.
data_x = data[:, :-2].copy()
data_y = data[:, -2:].copy()

In [4]:
# Throwaway, never-fitted forest: it exists only so its default hyperparameters
# can be listed here and reused later as the starting point for tuning.
dummy = RandomForestRegressor()
default_params = dummy.get_params()
print('Default parameters:\n')
pprint(default_params)

Default parameters:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [6]:
# Cross-validation scheme shared by every experiment below: 5 folds, no shuffling
# (so the splits are the same on every run and random_state is not needed).
kf = KFold(n_splits=5, shuffle=False, random_state=None)

In [15]:
# Baseline: a forest with library-default hyperparameters (using all cores),
# scored by mean absolute error on each fold of `kf`.
benchmark = RandomForestRegressor(n_jobs=-1)
results = []
for train_index, test_index in kf.split(data_x):
    X_train, Y_train = data_x[train_index], data_y[train_index]
    X_test, Y_test = data_x[test_index], data_y[test_index]

    # warm_start is off by default, so each fit() call builds a new forest and the
    # same estimator object can be reused from fold to fold.
    preds = benchmark.fit(X_train, Y_train).predict(X_test)

    fold_mae = mean_absolute_error(Y_test, preds)
    results.append(fold_mae)

# Report the MAE averaged over the folds.
benchmark_mae = np.mean(results)
print("Benchmark (non-optimised RF) has MAE of: %.5f" % (benchmark_mae))

Benchmark (non-optimised RF) has MAE of: 0.02740


In [16]:
# Start from the library-default hyperparameters of the untuned `dummy` forest;
# the grid searches below copy this dict and override individual entries.
params = dummy.get_params()

By default, not defining max_depth grows pure trees; this is left as is, and n_estimators (default is 100) is tuned instead.

In [25]:
# Grid search over n_estimators only (max_depth is left at its default of None).
# Candidates: 50, 100, ..., 1000 trees.
gridsearch_params = list(range(50, 1050, 50))

# Work on a copy so the default `params` dict stays untouched.
params_cv = params.copy()
# Use every available core when fitting and predicting.
params_cv['n_jobs'] = -1

min_mae = float("Inf")
best_params = None
# Mean cross-validated MAE of every candidate, keyed by n_estimators. The progress
# printout is wiped by clear_output() at the end of this cell, so the scores are
# kept here to allow candidates to be compared afterwards.
mae_by_n_estimators = {}

for n_estimators in gridsearch_params:
    print("CV with n_estimators = %i" % (n_estimators))
    # Update our parameters
    params_cv['n_estimators'] = n_estimators

    errors = []

    for train_index, test_index in kf.split(data_x):
        X_train, X_test = data_x[train_index], data_x[test_index]
        Y_train, Y_test = data_y[train_index], data_y[test_index]

        # A fresh model per fold, built from the current candidate parameters.
        model_rf = RandomForestRegressor(**params_cv)
        model_rf.fit(X_train, Y_train)
        preds = model_rf.predict(X_test)

        errors.append(mean_absolute_error(Y_test, preds))

    mean_mae = np.mean(errors)
    mae_by_n_estimators[n_estimators] = mean_mae
    print("\tMAE %.5f" % (mean_mae))
    # Strict '<' means the first (smallest) candidate wins an exact tie.
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = n_estimators

# Leave params_cv pointing at the winning candidate rather than the last one tried.
params_cv['n_estimators'] = best_params
clear_output()
print("Best params: n_estimators: %i, MAE: %.5f" % (best_params, min_mae))

Best params: n_estimators: 800, MAE: 0.02728


In [32]:
# Manual override of the grid-search winner: the MAE differed little between
# n_estimators values, and 500 trees scored similarly to 800, so 500 is used
# from here on for speed.
chosen_n_estimators = 500
params_cv['n_estimators'] = chosen_n_estimators

Doing the same for min_samples_leaf and min_samples_split

In [33]:
# Grid search over (min_samples_leaf, min_samples_split):
# min_samples_leaf in 1..7 crossed with min_samples_split in 2..7.
# NOTE(review): a node can only be split if both children keep at least
# min_samples_leaf samples, so min_samples_split values below 2 * min_samples_leaf
# are presumably equivalent to each other; parts of this grid may be redundant.
# Confirm against the scikit-learn documentation before trimming it.
gridsearch_params = [
    (min_samples_leaf, min_samples_split)
    for min_samples_leaf in range(1, 8)
    for min_samples_split in range(2, 8)
]

min_mae = float("Inf")
best_params = None
# Mean cross-validated MAE of every candidate, keyed by
# (min_samples_leaf, min_samples_split). The progress printout is wiped by
# clear_output() at the end of this cell, so the scores are kept here.
mae_by_leaf_split = {}

for min_samples_leaf, min_samples_split in gridsearch_params:
    print("CV with min_samples_leaf = %i, min_samples_split = %i" % (min_samples_leaf, min_samples_split))
    # Update our parameters
    params_cv['min_samples_leaf'] = min_samples_leaf
    params_cv['min_samples_split'] = min_samples_split

    errors = []

    for train_index, test_index in kf.split(data_x):
        X_train, X_test = data_x[train_index], data_x[test_index]
        Y_train, Y_test = data_y[train_index], data_y[test_index]

        # A fresh model per fold, built from the current candidate parameters.
        model_rf = RandomForestRegressor(**params_cv)
        model_rf.fit(X_train, Y_train)
        preds = model_rf.predict(X_test)

        errors.append(mean_absolute_error(Y_test, preds))

    mean_mae = np.mean(errors)
    mae_by_leaf_split[(min_samples_leaf, min_samples_split)] = mean_mae
    print("\tMAE %.5f" % (mean_mae))
    # Strict '<' means the earliest candidate in grid order wins an exact tie.
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (min_samples_leaf, min_samples_split)

# Leave params_cv pointing at the winning pair rather than the last one tried.
params_cv['min_samples_leaf'] = best_params[0]
params_cv['min_samples_split'] = best_params[1]
clear_output()
print("Best params: min_samples_leaf: %i, min_samples_split: %i, MAE: %.5f" % (best_params[0], best_params[1], min_mae))

Best params: min_samples_leaf: 1, min_samples_split: 2, MAE: 0.02729


In [35]:
# Show the hyperparameter dict as it stands after both grid searches.
params_cv

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [36]:
# Persist the tuned hyperparameter dict alongside the training data.
# np.save stores a dict as a pickled 0-d object array, so it has to be read back
# with np.load(path, allow_pickle=True).item() to recover the dict.
params_file = data_path + '/RF_PARAMS.npy'
np.save(params_file, params_cv)