In [13]:
import numpy as np
import warnings
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier
from preprocess import main

warnings.filterwarnings('ignore')

# Chapter IV - Fit Machine Learning Models
Perform feature engineering on the combined training & test dataset before fitting machine learning models.

In [3]:
processed_df, features = main("input/train.csv", "input/test.csv")
print("Selected features: ", features)
print(processed_df.shape)
processed_df.head()

Index(['PassengerId', 'Age', 'Room', 'Fare', 'Parch', 'Pclass', 'SibSp',
       'Survived', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F',
       'Deck_G', 'Deck_T', 'Deck_U', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Other',
       'Sex_female', 'Sex_male'],
      dtype='object')
Number of passengers with age data: 1046
Number of passengers with no age data: 263
Selected features:  ['Room', 'Fare', 'Parch', 'Pclass', 'SibSp', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_U', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Other', 'Sex_female', 'Sex_male']
(1309, 27)


Unnamed: 0,PassengerId,Age,Room,Fare,Parch,Pclass,SibSp,Survived,Deck_A,Deck_B,...,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Sex_female,Sex_male
0,1,22.0,49.615917,7.25,0,3,1,0.0,0,0,...,0,0,1,0,0,1,0,0,0,1
1,2,38.0,85.0,71.2833,0,1,1,1.0,0,0,...,1,0,0,0,0,0,1,0,1,0
2,3,26.0,49.615917,7.925,0,3,0,1.0,0,0,...,0,0,1,0,1,0,0,0,1,0
3,4,35.0,123.0,53.1,0,1,1,1.0,0,0,...,0,0,1,0,0,0,1,0,1,0
4,5,35.0,49.615917,8.05,0,3,0,0.0,0,0,...,0,0,1,0,0,1,0,0,0,1


## 4.1 Random Forest Model
Split the full dataset with all selected features well prepared.

In [4]:
train_df = processed_df[processed_df.Survived.notnull()]
test_df = processed_df[processed_df['Survived'].isnull()]
print("Dimension of training data: %s" %str(train_df.shape))
print("Dimension of test data: %s" %str(test_df.shape))

Dimension of training data: (891, 27)
Dimension of test data: (418, 27)


Fit baseline model.

In [5]:
survival_rf = RandomForestClassifier()
all_features = features.copy()
all_features.append('Age')
survival_rf.fit(train_df[all_features], train_df['Survived'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [6]:
test_df['Survived'] = survival_rf.predict(test_df[all_features])
test_df.head()

Unnamed: 0,PassengerId,Age,Room,Fare,Parch,Pclass,SibSp,Survived,Deck_A,Deck_B,...,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Sex_female,Sex_male
0,892,34.5,49.615917,7.8292,0,3,0,0.0,0,0,...,0,1,0,0,0,1,0,0,0,1
1,893,47.0,49.615917,7.0,0,3,1,0.0,0,0,...,0,0,1,0,0,0,1,0,1,0
2,894,62.0,49.615917,9.6875,0,2,0,0.0,0,0,...,0,1,0,0,0,1,0,0,0,1
3,895,27.0,49.615917,8.6625,0,3,0,1.0,0,0,...,0,0,1,0,0,1,0,0,0,1
4,896,22.0,49.615917,12.2875,1,3,1,0.0,0,0,...,0,0,1,0,0,0,1,0,1,0


In [9]:
# Examine the parameters used by our current forest
print('Parameters currently in use:\n')
pprint(survival_rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


Perform grid search on model parameters

In [14]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [16]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
# Fit the random search model
rf_random.fit(train_df[all_features], train_df['Survived'])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.8min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [17]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 30,
 'bootstrap': False}

https://medium.com/all-things-ai/in-depth-parameter-tuning-for-random-forest-d67bb7e920d
    
https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74