In [1]:
import pandas as pd
import numpy as np
import pickle

%matplotlib inline
import matplotlib.pyplot as plt

# Read data from disk

In [2]:
titanic_clf = pd.read_csv('../Datasets/titanic_classification.csv')
titanic_clf.head()

Unnamed: 0,Pclass,Gender,Age,SibSp,Parch,Fare,Emb_C,Emb_Q,Emb_S,Survived
0,3,0,22.0,1,0,7.25,0,0,1,0
1,1,0,38.0,1,0,71.2833,1,0,0,1
2,3,0,26.0,0,0,7.925,0,0,1,1
3,1,0,35.0,1,0,53.1,0,0,1,1
4,3,0,35.0,0,0,8.05,0,0,1,0


# Load the models required for this notebook

In [3]:
with open('../../Chapter06/Saved Models/random_forest_clf.pkl', 'rb') as f:
    rf = pickle.load(f)

# Hyperparameter tuning with Random Search

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [5]:
X = titanic_clf.iloc[:, :-1].values
y = titanic_clf.iloc[:, -1].values

rf_rand = RandomForestClassifier()

In [6]:
def report(results, max_rank=3):
    for rank in range(1, max_rank+1):
        results_at_rank = np.flatnonzero(results['rank_test_score'] == i)

In [7]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [8]:
param_dist = {"n_estimators": list(range(10,210,10)),
              "max_depth": list(range(3,20)),
              "max_features": list(range(1, 10)),
              "min_samples_split": list(range(2, 11)),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

In [9]:
n_iter_search = 60
random_search = RandomizedSearchCV(rf_rand, param_distributions=param_dist, scoring='accuracy',
                                   n_iter=n_iter_search, cv=5)
random_search.fit(X, y)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [10]:
results = pd.DataFrame(random_search.cv_results_).sort_values('rank_test_score')
for i, row in results.head().iterrows():
    print("Model rank: {}".format(row.rank_test_score))
    print("Mean validation score: {:.3f} (std: {:.3f})".format(row.mean_test_score, row.std_test_score))
    print("Model Hyperparameters: {}\n".format(row.params))

Model rank: 1
Mean validation score: 0.722 (std: 0.055)
Model Hyperparameters: {'n_estimators': 120, 'min_samples_split': 3, 'max_features': 5, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}

Model rank: 2
Mean validation score: 0.721 (std: 0.049)
Model Hyperparameters: {'n_estimators': 50, 'min_samples_split': 7, 'max_features': 5, 'max_depth': 7, 'criterion': 'entropy', 'bootstrap': True}

Model rank: 2
Mean validation score: 0.721 (std: 0.048)
Model Hyperparameters: {'n_estimators': 110, 'min_samples_split': 9, 'max_features': 7, 'max_depth': 8, 'criterion': 'entropy', 'bootstrap': True}

Model rank: 4
Mean validation score: 0.719 (std: 0.037)
Model Hyperparameters: {'n_estimators': 60, 'min_samples_split': 6, 'max_features': 5, 'max_depth': 10, 'criterion': 'entropy', 'bootstrap': True}

Model rank: 4
Mean validation score: 0.719 (std: 0.052)
Model Hyperparameters: {'n_estimators': 120, 'min_samples_split': 9, 'max_features': 4, 'max_depth': 7, 'criterion': 'gini', 'boo

In [11]:
report(random_search.cv_results_)

Model with rank: 1
Mean validation score: 0.722 (std: 0.055)
Parameters: {'n_estimators': 120, 'min_samples_split': 3, 'max_features': 5, 'max_depth': 6, 'criterion': 'entropy', 'bootstrap': True}

Model with rank: 2
Mean validation score: 0.721 (std: 0.049)
Parameters: {'n_estimators': 50, 'min_samples_split': 7, 'max_features': 5, 'max_depth': 7, 'criterion': 'entropy', 'bootstrap': True}

Model with rank: 2
Mean validation score: 0.721 (std: 0.048)
Parameters: {'n_estimators': 110, 'min_samples_split': 9, 'max_features': 7, 'max_depth': 8, 'criterion': 'entropy', 'bootstrap': True}

