# Giving the Randomest Random Forest another try

Executing this notebook will most likely take one hour.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold

from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint

In [2]:
df = pd.read_pickle('../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl').reset_index()

# only low and mid level features in X and keep segment ID
X = df.loc[:, 'essentia_dissonance_mean':'mirtoolbox_roughness_pct_90']
y = df['quadrant']

# add segment ID to feature space
X['segment_id'] = df['segment_id']

# pick segment 26 for final evaluation
final_test_X = X[X['segment_id']==26]
final_test_y = y[X['segment_id']==26]

# drop segment ID column
final_test_X = final_test_X.drop(['segment_id'], axis=1)



# drop segment 26 from dataset to use for final evaluation, y first
y = y.drop(X[X['segment_id']==26].index, axis=0)
X = X.drop(X[X['segment_id']==26].index, axis=0)

# drop segment ID column
X = X.drop(['segment_id'], axis=1)



# preprocess dataset
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std, columns=X.columns)

In [3]:
# Use Train and Test Set instead of CV
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [4]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

In [5]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [6]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [7]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 90,
 'bootstrap': False}

In [8]:
# def evaluate(model, test_features, test_labels):
#     predictions = model.predict(test_features)
#     errors = abs(predictions - test_labels)
#     mape = 100 * np.mean(errors / test_labels)
#     accuracy = 100 - mape
#     print('Model Performance')
#     print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
#     print('Accuracy = {:0.2f}%.'.format(accuracy))
    
#     return accuracy

In [9]:
base_model = RandomForestClassifier(n_estimators = 5)
base_model.fit(X_train, y_train)
# base_accuracy = evaluate(base_model, X_test, y_test)
crossval_base = cross_val_score(base_model, X_test, y_test)
print('Base CrossVal:', crossval_base.mean())

best_random = rf_random.best_estimator_
best_random.fit(X_train, y_train)
# random_accuracy = evaluate(best_random, X_test, y_test)
crossval_random = cross_val_score(best_random, X_test, y_test)
print('Random CrossVal:', crossval_random.mean())

print('\nCrossVal improvement of {:0.2f}%.'.format( 100 * (crossval_random.mean() - crossval_base.mean()) / crossval_base.mean()))

Base CrossVal: 0.4275862068965518
Random CrossVal: 0.5172413793103449

CrossVal improvement of 20.97%.


In [10]:
best_random

RandomForestClassifier(bootstrap=False, max_depth=90, min_samples_leaf=2,
                       min_samples_split=5, n_estimators=200)