# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from tqdm.notebook import tqdm
import itertools
from sklearn.metrics import accuracy_score
import joblib

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [3]:
X = df.drop(columns='dayofweek')
y = df['dayofweek']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [36]:
estimator = SVC(random_state=21, probability=True)

params = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': [None, 'balanced'],
}

grid_search = GridSearchCV(estimator=estimator, param_grid=params, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

results = pd.DataFrame(grid_search.cv_results_)

In [37]:
results.sort_values('rank_test_score', ascending=True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
64,0.545203,0.067143,0.035405,0.006352,10.0,,auto,rbf,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.9,0.848148,0.885185,0.884758,0.862454,0.876109,0.018419,1
70,0.531529,0.050599,0.037353,0.006091,10.0,balanced,auto,rbf,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.877778,0.851852,0.862963,0.873606,0.851301,0.8635,0.01087,2
52,0.494874,0.058588,0.041312,0.005155,5.0,,auto,rbf,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.825926,0.811111,0.818519,0.821561,0.802974,0.816018,0.008116,3
58,0.540466,0.034873,0.044792,0.005972,5.0,balanced,auto,rbf,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.844444,0.785185,0.792593,0.817844,0.799257,0.807865,0.021257,4
69,38.949451,3.663495,0.006236,0.001133,10.0,balanced,auto,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.72963,0.7,0.755556,0.754647,0.665428,0.721052,0.034438,5
66,45.282948,4.602333,0.008726,0.000978,10.0,balanced,scale,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.72963,0.7,0.755556,0.754647,0.665428,0.721052,0.034438,5
63,37.940567,4.196725,0.009424,0.003092,10.0,,auto,linear,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.737037,0.711111,0.707407,0.743494,0.698885,0.719587,0.017463,7
60,37.332926,4.13422,0.010351,0.001584,10.0,,scale,linear,"{'C': 10, 'class_weight': None, 'gamma': 'scal...",0.737037,0.711111,0.707407,0.743494,0.698885,0.719587,0.017463,7
57,26.467904,1.206721,0.00919,0.002919,5.0,balanced,auto,linear,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.725926,0.692593,0.696296,0.754647,0.66171,0.706234,0.031619,9
54,26.303837,1.257999,0.00891,0.002487,5.0,balanced,scale,linear,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.725926,0.692593,0.696296,0.754647,0.66171,0.706234,0.031619,9


In [38]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Accuracy: {best_score:.5f}")


Best Parameters: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
Best Accuracy: 0.87611


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [39]:
param_grid = {
    'max_depth': list(range(1, 50)), 
    'class_weight': ['balanced', None],
    'criterion': ['gini', 'entropy']
}

tree_clf = DecisionTreeClassifier(random_state=21)

grid_search = GridSearchCV(
    tree_clf, param_grid, 
    cv=10, scoring='accuracy', 
    n_jobs=-1, verbose=1
)

grid_search.fit(X_train, y_train)

results_df = pd.DataFrame(grid_search.cv_results_)

results_df

Fitting 10 folds for each of 196 candidates, totalling 1960 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002503,0.001267,0.001444,0.000923,balanced,gini,1,"{'class_weight': 'balanced', 'criterion': 'gin...",0.266667,0.259259,...,0.348148,0.311111,0.281481,0.325926,0.311111,0.268657,0.313433,0.298209,0.027378,195
1,0.001911,0.000252,0.000833,0.000298,balanced,gini,2,"{'class_weight': 'balanced', 'criterion': 'gin...",0.377778,0.355556,...,0.400000,0.377778,0.370370,0.429630,0.392593,0.343284,0.388060,0.380542,0.022858,190
2,0.002147,0.000398,0.000807,0.000266,balanced,gini,3,"{'class_weight': 'balanced', 'criterion': 'gin...",0.377778,0.377778,...,0.318519,0.340741,0.466667,0.451852,0.400000,0.343284,0.365672,0.379784,0.045338,192
3,0.002332,0.000429,0.000745,0.000056,balanced,gini,4,"{'class_weight': 'balanced', 'criterion': 'gin...",0.385185,0.414815,...,0.385185,0.451852,0.481481,0.437037,0.370370,0.343284,0.417910,0.408712,0.038749,187
4,0.002271,0.000181,0.000731,0.000074,balanced,gini,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.540741,0.444444,...,0.600000,0.533333,0.548148,0.540741,0.451852,0.432836,0.440299,0.499165,0.056464,180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,0.006192,0.002746,0.001299,0.001124,,entropy,45,"{'class_weight': None, 'criterion': 'entropy',...",0.844444,0.903704,...,0.903704,0.874074,0.881481,0.925926,0.859259,0.880597,0.880597,0.880564,0.023988,69
192,0.004767,0.002128,0.001151,0.000824,,entropy,46,"{'class_weight': None, 'criterion': 'entropy',...",0.844444,0.903704,...,0.903704,0.874074,0.881481,0.925926,0.859259,0.880597,0.880597,0.880564,0.023988,69
193,0.004175,0.001600,0.001038,0.000864,,entropy,47,"{'class_weight': None, 'criterion': 'entropy',...",0.844444,0.903704,...,0.903704,0.874074,0.881481,0.925926,0.859259,0.880597,0.880597,0.880564,0.023988,69
194,0.004686,0.002013,0.001147,0.000655,,entropy,48,"{'class_weight': None, 'criterion': 'entropy',...",0.844444,0.903704,...,0.903704,0.874074,0.881481,0.925926,0.859259,0.880597,0.880597,0.880564,0.023988,69


In [41]:
columns_to_show = [
    'rank_test_score', 'mean_test_score', 'std_test_score',
    'param_max_depth', 'param_class_weight', 'param_criterion'
]
sorted_results = results_df[columns_to_show].sort_values(by="rank_test_score")

sorted_results

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,param_max_depth,param_class_weight,param_criterion
97,1,0.889442,0.016890,49,balanced,entropy
95,1,0.889442,0.016890,47,balanced,entropy
94,1,0.889442,0.016890,46,balanced,entropy
93,1,0.889442,0.016890,45,balanced,entropy
92,1,0.889442,0.016890,44,balanced,entropy
...,...,...,...,...,...,...
2,192,0.379784,0.045338,3,balanced,gini
147,193,0.355318,0.010945,1,,entropy
98,193,0.355318,0.010945,1,,gini
49,195,0.298209,0.027378,1,balanced,entropy


In [42]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Accuracy: {best_score:.5f}")

Best Parameters: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 25}
Best Accuracy: 0.88944


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [43]:
param_grid = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': list(range(1, 50)),  
    'class_weight': ['balanced', None],
    'criterion': ['gini', 'entropy']
}

rf_clf = RandomForestClassifier(random_state=21)

grid_search = GridSearchCV(
    rf_clf, param_grid, 
    cv=10, scoring='accuracy', 
    n_jobs=-1, verbose=1
)

grid_search.fit(X_train, y_train)

results_df = pd.DataFrame(grid_search.cv_results_)
results_df

Fitting 10 folds for each of 784 candidates, totalling 7840 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.007530,0.002068,0.001721,0.000782,balanced,gini,1,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.244444,...,0.214815,0.229630,0.288889,0.244444,0.325926,0.253731,0.335821,0.268585,0.037664,784
1,0.008407,0.001286,0.001153,0.000069,balanced,gini,1,10,"{'class_weight': 'balanced', 'criterion': 'gin...",0.377778,...,0.370370,0.355556,0.377778,0.400000,0.333333,0.350746,0.373134,0.362758,0.019401,779
2,0.042712,0.008646,0.003146,0.001844,balanced,gini,1,50,"{'class_weight': 'balanced', 'criterion': 'gin...",0.362963,...,0.392593,0.377778,0.429630,0.451852,0.377778,0.440299,0.492537,0.408839,0.042772,769
3,0.092637,0.013135,0.006059,0.003766,balanced,gini,1,100,"{'class_weight': 'balanced', 'criterion': 'gin...",0.355556,...,0.540741,0.370370,0.370370,0.444444,0.377778,0.432836,0.432836,0.425086,0.058683,762
4,0.008344,0.003019,0.002041,0.001232,balanced,gini,2,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.370370,...,0.370370,0.266667,0.370370,0.340741,0.274074,0.276119,0.350746,0.325650,0.039988,782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,0.204957,0.012225,0.006897,0.002135,,entropy,48,100,"{'class_weight': None, 'criterion': 'entropy',...",0.903704,...,0.940741,0.911111,0.911111,0.925926,0.903704,0.925373,0.895522,0.919127,0.017439,3
780,0.015963,0.002476,0.002143,0.001197,,entropy,49,5,"{'class_weight': None, 'criterion': 'entropy',...",0.837037,...,0.881481,0.896296,0.881481,0.911111,0.837037,0.888060,0.850746,0.875362,0.028127,532
781,0.022480,0.002193,0.002553,0.001186,,entropy,49,10,"{'class_weight': None, 'criterion': 'entropy',...",0.881481,...,0.888889,0.903704,0.925926,0.925926,0.888889,0.895522,0.873134,0.896866,0.022189,375
782,0.124151,0.015511,0.005118,0.002198,,entropy,49,50,"{'class_weight': None, 'criterion': 'entropy',...",0.903704,...,0.933333,0.911111,0.903704,0.925926,0.918519,0.925373,0.873134,0.916147,0.020604,47


In [44]:
columns_to_show = [
    'rank_test_score', 'mean_test_score', 'std_test_score',
    'param_n_estimators', 'param_max_depth', 'param_class_weight', 'param_criterion'
]
sorted_results = results_df[columns_to_show].sort_values(by="rank_test_score")

sorted_results

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,param_n_estimators,param_max_depth,param_class_weight,param_criterion
87,1,0.919878,0.014760,100,22,balanced,gini
130,2,0.919856,0.024872,50,33,balanced,gini
783,3,0.919127,0.017439,100,49,,entropy
719,3,0.919127,0.017122,100,33,,entropy
723,3,0.919127,0.017439,100,34,,entropy
...,...,...,...,...,...,...,...
589,780,0.359022,0.014181,10,1,,entropy
200,781,0.340520,0.036346,5,2,balanced,entropy
4,782,0.325650,0.039988,5,2,balanced,gini
196,783,0.275218,0.042329,5,1,balanced,entropy


In [45]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best Parameters: {best_params}")
print(f"Best Accuracy: {best_score:.5f}")

Best Parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22, 'n_estimators': 100}
Best Accuracy: 0.91988


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [4]:
params = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': np.arange(1, 50),
    'class_weight': [None, 'balanced'],
    'criterion': ['gini', 'entropy'],
    'random_state': [21],
}

params_list = list(ParameterGrid(params))
data = []

for params_model in tqdm(params_list):
    estimator = RandomForestClassifier(**params_model)
    cvs = cross_val_score(estimator, X_train, y_train, cv=5, n_jobs=-1)
    info = {**params_model, 'mean_accuracy': cvs.mean(), 'std_accuracy': cvs.std()}
    data.append(info)

  0%|          | 0/784 [00:00<?, ?it/s]

In [5]:
results =pd.DataFrame(data)
results.sort_values('mean_accuracy', ascending=False).head(10)

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,random_state,mean_accuracy,std_accuracy
110,,gini,28,50,21,0.90429,0.010961
123,,gini,31,100,21,0.904287,0.015204
542,balanced,gini,38,50,21,0.903549,0.012503
586,balanced,gini,49,50,21,0.903549,0.012503
554,balanced,gini,41,50,21,0.903549,0.012503
546,balanced,gini,39,50,21,0.903549,0.012503
562,balanced,gini,43,50,21,0.903549,0.012503
566,balanced,gini,44,50,21,0.903549,0.012503
530,balanced,gini,35,50,21,0.903549,0.012503
558,balanced,gini,42,50,21,0.903549,0.012503


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

best model is Random forest with parameters {'class_weight': None, 'criterion': 'gini', 'max_depth': 28, 'n_estimators': 50}

In [8]:
forest = RandomForestClassifier(random_state=21, n_estimators=100, max_depth=24, criterion='entropy', class_weight='balanced')
forest.fit(X_train, y_train)
forest.score(X_test, y_test)

0.9319526627218935

In [11]:
joblib.dump(forest, '../data/model_01.pkl')

['../data/model_01.pkl']