# Hyperparameter Optimization

### Get data

In [1]:
import pandas as pd
import seaborn as sns

df = sns.load_dataset('titanic')
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [2]:
df['age'].fillna(29.0, inplace=True)

In [3]:
X = df[['age', 'sibsp', 'parch']]
y = df['survived']

In [4]:
X.shape, y.shape

((891, 3), (891,))

### Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
Xtrain.shape, Xtest.shape

((712, 3), (179, 3))

In [8]:
ytrain.shape, ytest.shape

((712,), (179,))

### Exploratory Data Analysis

### Feature Engineering

In [9]:
# We cheated a bit by already filling in missing values in the beginning.
# You should not do that.

### Create a model

- Logistic Regression
- Decision Tree
- Random Forest

In [10]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(Xtrain, ytrain)   # train the model = find the best coefficients
model.score(Xtrain, ytrain)

0.6404494382022472

### Cross-Validation

In [11]:
from sklearn.model_selection import cross_val_score

# cv=number of splits
scores = cross_val_score(model, Xtrain, ytrain, cv=5, scoring="accuracy")
scores

array([0.65734266, 0.61538462, 0.63380282, 0.63380282, 0.61971831])

In [12]:
scores.mean().round(3), scores.std().round(3)

(0.632, 0.015)

### Hyperparameter Optimization

Hyperparameters are **parameters that are set before the learning process starts.**

Which hyperparameters have you seen?

Logistic Regression:
- C (inverse regularization strength)
...

Decision Tree:
- Depth of the tree
...

Random Forest:
- Nr. of trees
- Depth of the trees
...

In [13]:
from sklearn.linear_model import LogisticRegression

#### Logistic Regression

In [14]:
model = LogisticRegression(C=10, penalty='l2')
#<-- Hyperparameters are the parameters that are set when choosing the model

#### Decision Tree

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
model_dt = DecisionTreeClassifier(max_depth=3)
# Maximum Depth of a tree is the number of questions that the model asks

## Example: Max Depth of 3
# 1) Male or Female?
# 2) Are you traveling alone?
# 3) Are you below 20 years old?

#### Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
model_rf = RandomForestClassifier(n_estimators=50, max_depth=3, min_samples_split=2)
# n_estimators: Nr. of trees

# One part of the randomness of a random forest is that, at each question of each tree,
# the random forest only has a random subset of the features available
# The number of features available is defined by max_features

## Use Grid Search (or Randomized Search)

Grid Search will take different combinations of hyperparameters and calculate cross-validation scores for your model. The possible combinations of hyperparameters are specified by you.

In [19]:
from sklearn.model_selection import GridSearchCV

In [36]:
# Perform Grid Search for the Random Forest

In [21]:
# param_grid is a dictionary with parameters as keys and values as values

param_grid = {
    'n_estimators': [1, 3, 10, 20, 50, 100],
    'max_depth': [1, 3, 5, 10, None]
}

In [22]:
gridcv = GridSearchCV(model_rf, param_grid=param_grid)

In [23]:
gridcv.fit(Xtrain, ytrain)

GridSearchCV(estimator=RandomForestClassifier(max_depth=3, n_estimators=50),
             param_grid={'max_depth': [1, 3, 5, 10, None],
                         'n_estimators': [1, 3, 10, 20, 50, 100]})

In [24]:
gridcv.best_estimator_

RandomForestClassifier(max_depth=10)

In [25]:
# Look at the results
results = pd.DataFrame(gridcv.cv_results_)
display(results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006835,0.001072,0.003045,0.000396,1.0,1,"{'max_depth': 1, 'n_estimators': 1}",0.643357,0.643357,0.626761,0.626761,0.619718,0.631991,0.00963,25
1,0.009791,5.9e-05,0.003182,0.000117,1.0,3,"{'max_depth': 1, 'n_estimators': 3}",0.657343,0.622378,0.626761,0.626761,0.626761,0.632,0.012784,24
2,0.022707,0.00115,0.00435,0.000819,1.0,10,"{'max_depth': 1, 'n_estimators': 10}",0.643357,0.622378,0.647887,0.65493,0.619718,0.637654,0.014077,21
3,0.043899,0.001163,0.007504,0.004513,1.0,20,"{'max_depth': 1, 'n_estimators': 20}",0.664336,0.629371,0.65493,0.633803,0.626761,0.64184,0.014999,18
4,0.093629,0.002188,0.008931,0.000343,1.0,50,"{'max_depth': 1, 'n_estimators': 50}",0.643357,0.629371,0.65493,0.647887,0.626761,0.640461,0.010803,19
5,0.193544,0.009141,0.015146,0.000722,1.0,100,"{'max_depth': 1, 'n_estimators': 100}",0.657343,0.629371,0.633803,0.647887,0.626761,0.639033,0.011704,20
6,0.005777,0.000213,0.002937,0.0001,3.0,1,"{'max_depth': 3, 'n_estimators': 1}",0.636364,0.657343,0.612676,0.619718,0.570423,0.619305,0.028884,29
7,0.009646,0.00037,0.003214,7.6e-05,3.0,3,"{'max_depth': 3, 'n_estimators': 3}",0.636364,0.664336,0.661972,0.647887,0.633803,0.648872,0.012611,14
8,0.024185,0.001698,0.004317,0.000352,3.0,10,"{'max_depth': 3, 'n_estimators': 10}",0.664336,0.664336,0.626761,0.676056,0.640845,0.654467,0.017967,12
9,0.042978,0.003333,0.006499,0.00138,3.0,20,"{'max_depth': 3, 'n_estimators': 20}",0.65035,0.65035,0.690141,0.647887,0.640845,0.655915,0.017464,11


In [26]:
# Let us look at the columns of the results DataFrame
results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_n_estimators', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score'],
      dtype='object')

In [37]:
# Let's sort by mean_test score to see which model specifications performed best
columns = ['mean_test_score', 'std_test_score', 'mean_fit_time', 'param_max_depth',
          'param_n_estimators']
results[columns].sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_test_score,std_test_score,mean_fit_time,param_max_depth,param_n_estimators
23,0.668522,0.025162,0.200621,10.0,100
16,0.665734,0.020116,0.099259,5.0,50
17,0.662967,0.022405,0.192801,5.0,100
21,0.662947,0.02481,0.045011,10.0,20
10,0.661548,0.032491,0.099313,3.0,50
14,0.661529,0.01471,0.022667,5.0,10
13,0.661529,0.019346,0.011434,5.0,3
15,0.658722,0.019941,0.042141,5.0,20
20,0.657323,0.029756,0.023893,10.0,10
11,0.657293,0.013849,0.198995,3.0,100


# Randomized Search

Alternative to Grid Search if your parameter space is large or you do not have any expert knowledge on
which hyperparameters to choose.

It randomly chooses combinations of hyperparameters given some distribution function.

In [28]:
from sklearn.model_selection import RandomizedSearchCV

In [29]:
# Where do we get probability distributions from?
from scipy import stats

In [30]:
# stats

In [31]:
# randomized search cv needs distribution functions to sample from
param_distributions = {
    'n_estimators': list(range(1, 100)),
    'max_depth': list(range(1, 15))
}

In [32]:
randomizedcv = RandomizedSearchCV(model_rf, param_distributions=param_distributions)

In [33]:
randomizedcv.fit(Xtrain, ytrain)

RandomizedSearchCV(estimator=RandomForestClassifier(max_depth=3,
                                                    n_estimators=50),
                   param_distributions={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14],
                                        'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8,
                                                         9, 10, 11, 12, 13, 14,
                                                         15, 16, 17, 18, 19, 20,
                                                         21, 22, 23, 24, 25, 26,
                                                         27, 28, 29, 30, ...]})

In [34]:
random_results = pd.DataFrame(randomizedcv.cv_results_)
display(random_results.sort_values('mean_test_score', ascending=False))

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
4,0.146823,0.006962,0.012981,0.001855,73,3,"{'n_estimators': 73, 'max_depth': 3}",0.664336,0.678322,0.65493,0.669014,0.626761,0.658672,0.017649,1
8,0.178152,0.00373,0.01634,0.004222,80,9,"{'n_estimators': 80, 'max_depth': 9}",0.664336,0.657343,0.65493,0.690141,0.612676,0.655885,0.02496,2
2,0.172936,0.002077,0.014866,0.002643,89,4,"{'n_estimators': 89, 'max_depth': 4}",0.65035,0.643357,0.683099,0.65493,0.633803,0.653107,0.016602,3
3,0.134994,0.005327,0.011267,0.000852,63,10,"{'n_estimators': 63, 'max_depth': 10}",0.65035,0.671329,0.640845,0.683099,0.619718,0.653068,0.02238,4
5,0.065715,0.003912,0.00769,0.001441,28,11,"{'n_estimators': 28, 'max_depth': 11}",0.643357,0.65035,0.661972,0.676056,0.626761,0.651699,0.016686,5
9,0.009318,0.002,0.004078,0.001225,2,10,"{'n_estimators': 2, 'max_depth': 10}",0.643357,0.615385,0.676056,0.697183,0.598592,0.646114,0.036647,6
6,0.017353,0.002628,0.004967,0.00184,6,8,"{'n_estimators': 6, 'max_depth': 8}",0.657343,0.629371,0.661972,0.669014,0.598592,0.643258,0.026076,7
1,0.03337,0.000823,0.00539,0.001035,16,1,"{'n_estimators': 16, 'max_depth': 1}",0.622378,0.657343,0.65493,0.647887,0.626761,0.64186,0.014522,8
0,0.217456,0.012222,0.017693,0.002906,96,14,"{'n_estimators': 96, 'max_depth': 14}",0.643357,0.643357,0.640845,0.683099,0.598592,0.64185,0.026752,9
7,0.019063,0.002146,0.004592,0.001234,6,12,"{'n_estimators': 6, 'max_depth': 12}",0.657343,0.636364,0.612676,0.626761,0.605634,0.627755,0.018261,10


### Optimization finished: what next?

In [35]:
# Re-define the model with the optimal hyperparameters
# model_optimal = RandomForestClassifier(optimal_parameters)

model.fit(Xtrain, ytrain)
print('training score: ', model.score(Xtrain, ytrain).round(3))
print('test score    : ', model.score(Xtest, ytest).round(3))

training score:  0.64
test score    :  0.615


#### Interpretation

* training and test score are similar: all good
* training >> test score: overfitting
* traing < test score: random fluctuation; probably your dataset is very small or BUG

Check this out:
https://chrisalbon.com/machine_learning/model_selection/hyperparameter_tuning_using_grid_search/