# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [84]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn import metrics, svm, linear_model, tree, ensemble
import tqdm.notebook
import pickle

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [3]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df2 = pd.read_csv('../data/dayofweek.csv', index_col='index')
df

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [4]:
df.columns

Index(['numTrials', 'hour', 'uid_user_0', 'uid_user_1', 'uid_user_10',
       'uid_user_11', 'uid_user_12', 'uid_user_13', 'uid_user_14',
       'uid_user_15', 'uid_user_16', 'uid_user_17', 'uid_user_18',
       'uid_user_19', 'uid_user_2', 'uid_user_20', 'uid_user_21',
       'uid_user_22', 'uid_user_23', 'uid_user_24', 'uid_user_25',
       'uid_user_26', 'uid_user_27', 'uid_user_28', 'uid_user_29',
       'uid_user_3', 'uid_user_30', 'uid_user_31', 'uid_user_4', 'uid_user_6',
       'uid_user_7', 'uid_user_8', 'labname_code_rvw', 'labname_lab02',
       'labname_lab03', 'labname_lab03s', 'labname_lab05s', 'labname_laba04',
       'labname_laba04s', 'labname_laba05', 'labname_laba06',
       'labname_laba06s', 'labname_project1'],
      dtype='object')

In [5]:
df['dayofweek'] = df2.dayofweek

In [6]:
X = df.drop(columns='dayofweek')
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [70]:
parameters = {'kernel': ['linear', 'rbf', 'sigmoid'],
			  'C': [0.01, 0.1, 1, 1.5, 5, 10],
			  'gamma': ['scale', 'auto'],
			  'class_weight': ['balanced', None]}
svc = svm.SVC(random_state=21, probability=True)
clf = GridSearchCV(svc, parameters, return_train_score=True, cv=2)
clf.fit(X_train, y_train)

GridSearchCV(cv=2, estimator=SVC(probability=True, random_state=21),
             param_grid={'C': [0.01, 0.1, 1, 1.5, 5, 10],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid']},
             return_train_score=True)

In [71]:
# print best parameter after tuning 
print(clf.best_params_) 
grid_predictions = clf.predict(X_test) 
   
# print classification report 
print(metrics.classification_report(y_test, grid_predictions)) 

{'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
              precision    recall  f1-score   support

           0       0.95      0.70      0.81        27
           1       0.80      0.89      0.84        55
           2       1.00      0.83      0.91        30
           3       0.90      0.96      0.93        80
           4       0.95      0.86      0.90        21
           5       0.92      0.91      0.92        54
           6       0.85      0.89      0.87        71

    accuracy                           0.89       338
   macro avg       0.91      0.86      0.88       338
weighted avg       0.89      0.89      0.89       338



In [72]:
res = zip(clf.cv_results_['rank_test_score'], clf.cv_results_['mean_test_score'])

In [73]:
res = zip(clf.cv_results_['rank_test_score'], clf.cv_results_['mean_test_score'])
diction = dict(res)
l = []
for i in range(1, len(clf.cv_results_['rank_test_score'])):
	if i in diction:
		last = i
		l.append(diction[i])
	else:
		l.append(diction[last])

In [74]:
diction

{36: 0.32344213649851633,
 65: 0.15578635014836795,
 66: 0.1483679525222552,
 64: 0.1632047477744807,
 69: 0.12759643916913946,
 34: 0.3241839762611276,
 49: 0.2344213649851632,
 25: 0.4977744807121662,
 46: 0.2440652818991098,
 57: 0.1943620178041543,
 33: 0.32566765578635015,
 22: 0.543026706231454,
 43: 0.2959940652818991,
 42: 0.30044510385756673,
 40: 0.30118694362017806,
 19: 0.6142433234421365,
 44: 0.28857566765578635,
 61: 0.19287833827893175,
 28: 0.4465875370919881,
 71: 0.0741839762611276,
 15: 0.637240356083086,
 54: 0.2129080118694362,
 27: 0.4873887240356083,
 47: 0.23590504451038574,
 17: 0.6350148367952522,
 41: 0.3004451038575668,
 59: 0.19362017804154302,
 24: 0.5103857566765578,
 72: 0.050445103857566766,
 13: 0.6387240356083086,
 32: 0.336053412462908,
 55: 0.20623145400593473,
 21: 0.5600890207715133,
 45: 0.2507418397626113,
 9: 0.6869436201780414,
 4: 0.7373887240356083,
 67: 0.14614243323442136,
 11: 0.6594955489614244,
 30: 0.35682492581602376,
 56: 0.20548961

In [75]:
list(zip(list(range(1, len(l))), l))

[(1, 0.8093471810089021),
 (2, 0.8041543026706232),
 (3, 0.7477744807121662),
 (4, 0.7373887240356083),
 (5, 0.706973293768546),
 (6, 0.706973293768546),
 (7, 0.6951038575667656),
 (8, 0.6951038575667656),
 (9, 0.6869436201780414),
 (10, 0.6869436201780414),
 (11, 0.6594955489614244),
 (12, 0.6594955489614244),
 (13, 0.6387240356083086),
 (14, 0.6387240356083086),
 (15, 0.637240356083086),
 (16, 0.637240356083086),
 (17, 0.6350148367952522),
 (18, 0.6350148367952522),
 (19, 0.6142433234421365),
 (20, 0.6142433234421365),
 (21, 0.5600890207715133),
 (22, 0.543026706231454),
 (23, 0.543026706231454),
 (24, 0.5103857566765578),
 (25, 0.4977744807121662),
 (26, 0.4977744807121662),
 (27, 0.4873887240356083),
 (28, 0.4465875370919881),
 (29, 0.37833827893175076),
 (30, 0.35682492581602376),
 (31, 0.3486646884272997),
 (32, 0.336053412462908),
 (33, 0.32566765578635015),
 (34, 0.3241839762611276),
 (35, 0.3241839762611276),
 (36, 0.32344213649851633),
 (37, 0.32344213649851633),
 (38, 0.3234

## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [52]:
parameters = {'max_depth': range(1, 50),
			  'class_weight': ['balanced', None],
			  'criterion': ['entropy', 'gini']}
tree_m = tree.DecisionTreeClassifier(random_state=21)
clf2 = GridSearchCV(tree_m, parameters, return_train_score=True, cv=2)
clf2.fit(X_train, y_train)

GridSearchCV(cv=2, estimator=DecisionTreeClassifier(random_state=21),
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': range(1, 50)},
             return_train_score=True)

In [53]:
# print best parameter after tuning 
print(clf2.best_params_) 
grid_predictions2 = clf2.predict(X_test) 
   
# print classification report 
print(metrics.classification_report(y_test, grid_predictions2)) 

{'class_weight': None, 'criterion': 'gini', 'max_depth': 17}
              precision    recall  f1-score   support

           0       0.77      0.74      0.75        27
           1       0.96      0.80      0.87        55
           2       0.93      0.83      0.88        30
           3       0.85      0.94      0.89        80
           4       0.82      0.86      0.84        21
           5       0.75      0.85      0.80        54
           6       0.85      0.82      0.83        71

    accuracy                           0.85       338
   macro avg       0.85      0.83      0.84       338
weighted avg       0.85      0.85      0.85       338



In [56]:
res = zip(clf2.cv_results_['rank_test_score'], clf2.cv_results_['mean_test_score'])
diction = dict(res)
l = []
for i in range(1, len(clf2.cv_results_['rank_test_score'])):
	if i in diction:
		last = i
		l.append(diction[i])
	else:
		l.append(diction[last])

In [57]:
diction

{195: 0.29525222551928787,
 189: 0.3827893175074184,
 191: 0.3738872403560831,
 188: 0.40578635014836795,
 184: 0.4510385756676558,
 177: 0.5089020771513353,
 171: 0.6224035608308605,
 165: 0.6899109792284867,
 162: 0.7054896142433235,
 160: 0.7136498516320475,
 155: 0.7321958456973294,
 153: 0.75,
 146: 0.7789317507418397,
 142: 0.7922848664688427,
 107: 0.804154302670623,
 36: 0.814540059347181,
 98: 0.8093471810089021,
 35: 0.8152818991097923,
 37: 0.8130563798219586,
 187: 0.4102373887240356,
 182: 0.4562314540059347,
 172: 0.6135014836795252,
 168: 0.6416913946587537,
 161: 0.7084569732937686,
 159: 0.7173590504451038,
 157: 0.7270029673590505,
 151: 0.7596439169139466,
 152: 0.7514836795252225,
 147: 0.776706231454006,
 144: 0.7900593471810089,
 109: 0.8026706231454006,
 99: 0.8086053412462908,
 100: 0.806379821958457,
 102: 0.8056379821958457,
 68: 0.8108308605341246,
 193: 0.3553412462908012,
 186: 0.4109792284866469,
 183: 0.4547477744807122,
 180: 0.4792284866468843,
 176: 0.

In [67]:
list(zip(list(range(1, len(l))), l))

[(1, 0.8293768545994065),
 (2, 0.8286350148367952),
 (3, 0.8286350148367952),
 (4, 0.8286350148367952),
 (5, 0.8286350148367952),
 (6, 0.8286350148367952),
 (7, 0.8286350148367952),
 (8, 0.8286350148367952),
 (9, 0.8286350148367952),
 (10, 0.8286350148367952),
 (11, 0.8286350148367952),
 (12, 0.8286350148367952),
 (13, 0.8286350148367952),
 (14, 0.8286350148367952),
 (15, 0.8286350148367952),
 (16, 0.8286350148367952),
 (17, 0.8286350148367952),
 (18, 0.8286350148367952),
 (19, 0.8286350148367952),
 (20, 0.8286350148367952),
 (21, 0.8286350148367952),
 (22, 0.8286350148367952),
 (23, 0.8286350148367952),
 (24, 0.8286350148367952),
 (25, 0.8286350148367952),
 (26, 0.8286350148367952),
 (27, 0.8286350148367952),
 (28, 0.827893175074184),
 (29, 0.8271513353115727),
 (30, 0.8271513353115727),
 (31, 0.8249258160237389),
 (32, 0.8249258160237388),
 (33, 0.8219584569732938),
 (34, 0.8204747774480712),
 (35, 0.8152818991097923),
 (36, 0.814540059347181),
 (37, 0.8130563798219586),
 (38, 0.8130

## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [79]:
parameters = {'n_estimators': (5, 10, 50, 100),
			  'max_depth': range(1, 50),
			  'class_weight': ['balanced', None],
			  'criterion': ['entropy', 'gini']}
forest_m = ensemble.RandomForestClassifier(random_state=21)
clf3 = GridSearchCV(forest_m, parameters, return_train_score=True, cv=2)
clf3.fit(X_train, y_train)

GridSearchCV(cv=2, estimator=RandomForestClassifier(random_state=21),
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': range(1, 50),
                         'n_estimators': (5, 10, 50, 100)},
             return_train_score=True)

In [80]:
# print best parameter after tuning 
print(clf3.best_params_) 
grid_predictions3 = clf3.predict(X_test) 
   
# print classification report 
print(metrics.classification_report(y_test, grid_predictions3))

{'class_weight': None, 'criterion': 'entropy', 'max_depth': 28, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.87      0.74      0.80        27
           1       0.98      0.91      0.94        55
           2       0.93      0.93      0.93        30
           3       0.95      0.97      0.96        80
           4       0.95      0.86      0.90        21
           5       0.88      0.94      0.91        54
           6       0.93      0.99      0.96        71

    accuracy                           0.93       338
   macro avg       0.93      0.91      0.92       338
weighted avg       0.93      0.93      0.93       338



In [81]:
res = zip(clf3.cv_results_['rank_test_score'], clf3.cv_results_['mean_test_score'])
diction = dict(res)
l = []
for i in range(1, len(clf2.cv_results_['rank_test_score'])):
	if i in diction:
		last = i
		l.append(diction[i])
	else:
		l.append(diction[last])

In [82]:
diction

{783: 0.24777448071216618,
 782: 0.2967359050445104,
 764: 0.41246290801186947,
 749: 0.4540059347181009,
 780: 0.30934718100890213,
 767: 0.39243323442136496,
 745: 0.4903560830860534,
 737: 0.5081602373887241,
 774: 0.38130563798219586,
 756: 0.43694362017804156,
 725: 0.5474777448071217,
 718: 0.5786350148367952,
 755: 0.43991097922848665,
 738: 0.508160237388724,
 708: 0.6142433234421365,
 703: 0.629080118694362,
 729: 0.5304154302670623,
 719: 0.5764094955489614,
 680: 0.6973293768545994,
 676: 0.7084569732937684,
 717: 0.5801186943620178,
 698: 0.6416913946587537,
 668: 0.7247774480712166,
 652: 0.7618694362017804,
 688: 0.685459940652819,
 642: 0.7737388724035608,
 628: 0.7908011869436202,
 662: 0.7455489614243324,
 626: 0.7922848664688427,
 614: 0.8041543026706232,
 674: 0.7136498516320475,
 660: 0.7462908011869436,
 599: 0.8123145400593472,
 588: 0.8160237388724035,
 663: 0.7433234421364985,
 640: 0.7781899109792285,
 502: 0.8301186943620178,
 464: 0.836053412462908,
 643: 0.7

In [83]:
list(zip(list(range(1, len(l))), l))

[(1, 0.8746290801186943),
 (2, 0.8746290801186943),
 (3, 0.8746290801186943),
 (4, 0.8738872403560831),
 (5, 0.8738872403560831),
 (6, 0.8731454005934718),
 (7, 0.8731454005934718),
 (8, 0.8724035608308605),
 (9, 0.8724035608308605),
 (10, 0.8724035608308605),
 (11, 0.8716617210682492),
 (12, 0.8716617210682492),
 (13, 0.8716617210682492),
 (14, 0.8716617210682492),
 (15, 0.8716617210682492),
 (16, 0.8716617210682492),
 (17, 0.8716617210682492),
 (18, 0.8716617210682492),
 (19, 0.8716617210682492),
 (20, 0.8716617210682492),
 (21, 0.8716617210682492),
 (22, 0.8716617210682492),
 (23, 0.8716617210682492),
 (24, 0.8716617210682492),
 (25, 0.8716617210682492),
 (26, 0.8716617210682492),
 (27, 0.8716617210682492),
 (28, 0.870919881305638),
 (29, 0.870919881305638),
 (30, 0.870919881305638),
 (31, 0.8701780415430267),
 (32, 0.8701780415430267),
 (33, 0.8701780415430267),
 (34, 0.8701780415430267),
 (35, 0.8701780415430267),
 (36, 0.8701780415430267),
 (37, 0.8701780415430267),
 (38, 0.86943

## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [87]:
parameters = {'n_estimators': [5, 10, 50, 100],
			  'max_depth': range(1, 50),
			  'class_weight': ['balanced', None],
			  'criterion': ['entropy', 'gini']}
forest_m = ensemble.RandomForestClassifier(random_state=21)
clf3 = GridSearchCV(forest_m, parameters, return_train_score=True, cv=5, n_jobs=4, verbose=10)
clf3.fit(X_train, y_train)

Fitting 5 folds for each of 784 candidates, totalling 3920 fits
[CV 1/5; 1/784] START class_weight=balanced, criterion=entropy, max_depth=1, n_estimators=5
[CV 4/5; 1/784] START class_weight=balanced, criterion=entropy, max_depth=1, n_estimators=5
[CV 2/5; 1/784] START class_weight=balanced, criterion=entropy, max_depth=1, n_estimators=5
[CV 3/5; 1/784] START class_weight=balanced, criterion=entropy, max_depth=1, n_estimators=5
[CV 2/5; 1/784] END class_weight=balanced, criterion=entropy, max_depth=1, n_estimators=5;, score=(train=0.301, test=0.293) total time=   0.1s
[CV 3/5; 1/784] END class_weight=balanced, criterion=entropy, max_depth=1, n_estimators=5;, score=(train=0.233, test=0.226) total time=   0.1s
[CV 1/5; 1/784] END class_weight=balanced, criterion=entropy, max_depth=1, n_estimators=5;, score=(train=0.304, test=0.263) total time=   0.1s
[CV 4/5; 1/784] END class_weight=balanced, criterion=entropy, max_depth=1, n_estimators=5;, score=(train=0.278, test=0.283) total time=   0

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=21), n_jobs=4,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': range(1, 50),
                         'n_estimators': [5, 10, 50, 100]},
             return_train_score=True, verbose=10)

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

{'class_weight': None, 'criterion': 'entropy', 'max_depth': 28, 'n_estimators': 100}

In [88]:
random_forest = ensemble.RandomForestClassifier(n_estimators=100, max_depth=28, class_weight=None, criterion='entropy', random_state=21)
random_forest.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_depth=28, random_state=21)

In [90]:
y_pred = random_forest.predict(X_test)
print('Final accuracy:', metrics.accuracy_score(y_pred, y_test))

Final accuracy: 0.9319526627218935
