# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [104]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm



## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

Рандомно сгенерировала день недели, т.к его нет в данной нам таблице

In [105]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')

df['dayofweek'] = np.random.choice(np.arange(7), size=1686, replace=True)
df

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,6
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4


In [106]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)


## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [107]:
svc = SVC(random_state=21, probability=True)


In [108]:
param_grid = { 'C' : [0.01, 0.1, 1, 1.5, 5, 10],
              'gamma' : ['scale', 'auto'],
              'kernel': ['linear', 'rbf', 'sigmoid'],
              'class_weight' : ['balanced', None]
}
gs = GridSearchCV(svc, param_grid, scoring='accuracy', n_jobs=-1)
gs.fit(X_train, y_train)
print(f'Лучшие параметры: {gs.best_params_}')
print(f'Лучшая точность: {gs.best_score_}')

Лучшие параметры: {'C': 1, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
Лучшая точность: 0.16470053696819495


In [109]:
results = pd.DataFrame(gs.cv_results_)
results = results.sort_values('rank_test_score', ascending=True)
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
34,0.871951,0.050487,0.039717,0.000626,1.0,,auto,rbf,"{'C': 1, 'class_weight': None, 'gamma': 'auto'...",0.155556,0.155556,0.166667,0.163569,0.182156,0.164701,0.009768,1
28,0.922191,0.015514,0.040128,0.000695,1.0,balanced,auto,rbf,"{'C': 1, 'class_weight': 'balanced', 'gamma': ...",0.155556,0.155556,0.159259,0.152416,0.185874,0.161732,0.012264,2
19,0.887055,0.041397,0.037827,0.002416,0.1,,scale,rbf,"{'C': 0.1, 'class_weight': None, 'gamma': 'sca...",0.159259,0.177778,0.159259,0.148699,0.159851,0.160969,0.009383,3
46,1.03166,0.046989,0.044957,0.002171,1.5,,auto,rbf,"{'C': 1.5, 'class_weight': None, 'gamma': 'aut...",0.151852,0.137037,0.17037,0.144981,0.193309,0.15951,0.020174,4
58,1.118342,0.060061,0.049871,0.001818,5.0,,auto,rbf,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.144444,0.166667,0.118519,0.141264,0.211896,0.156558,0.031598,5


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [110]:
model_tree = DecisionTreeClassifier(random_state=21)


In [111]:
param_grid = { 'criterion' : ['gini', 'entropy'],
              'max_depth' : np.arange(1,50),
              'class_weight' : ['balanced', None]}
gs = GridSearchCV(model_tree, param_grid, scoring = 'accuracy', n_jobs=-1)
gs.fit(X_train, y_train)
print(f'Лучшие параметры: {gs.best_params_}')
print(f'Лучшая точность: {gs.best_score_}')

Лучшие параметры: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 14}
Лучшая точность: 0.163202533388407


  _data = np.array(data, dtype=dtype, copy=copy,


In [112]:
results = pd.DataFrame(gs.cv_results_)
results = results.sort_values('rank_test_score', ascending=True)
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
13,0.007776,0.001215,0.002352,0.000563,balanced,gini,14,"{'class_weight': 'balanced', 'criterion': 'gin...",0.155556,0.174074,0.162963,0.152416,0.171004,0.163203,0.008413,1
159,0.00958,0.000404,0.002922,0.000142,,entropy,13,"{'class_weight': None, 'criterion': 'entropy',...",0.155556,0.174074,0.155556,0.126394,0.189591,0.160234,0.021177,2
59,0.009338,0.000264,0.00284,0.000186,balanced,entropy,11,"{'class_weight': 'balanced', 'criterion': 'ent...",0.148148,0.166667,0.203704,0.100372,0.182156,0.160209,0.035046,3
61,0.011252,0.00175,0.00271,0.000119,balanced,entropy,13,"{'class_weight': 'balanced', 'criterion': 'ent...",0.155556,0.177778,0.159259,0.122677,0.182156,0.159485,0.021066,4
156,0.007971,0.000159,0.002969,0.000216,,entropy,10,"{'class_weight': None, 'criterion': 'entropy',...",0.151852,0.151852,0.188889,0.115242,0.182156,0.157998,0.026236,5


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [113]:
model_forest = RandomForestClassifier(random_state=21)

In [114]:
param_grid = {'n_estimators' : [5, 10, 50, 100],
              'max_depth' : np.arange(1, 50),
              'criterion' : ['gini', 'entropy'],
              'class_weight' : ['balanced', None]
}
gs = GridSearchCV(model_forest, param_grid, scoring='accuracy', n_jobs=-1)
gs.fit(X_train, y_train)

print(f'Лучшие параметры: {gs.best_params_}')
print(f'Лучшая точность: {gs.best_score_}')

Лучшие параметры: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 6, 'n_estimators': 5}
Лучшая точность: 0.1654275092936803


In [115]:
results = pd.DataFrame(gs.cv_results_)
results = results.sort_values('rank_test_score', ascending=True)
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,0.016796,0.00024,0.003416,0.000188,balanced,gini,6,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.151852,0.17037,0.177778,0.156134,0.171004,0.165428,0.009785,1
605,0.030822,0.004245,0.005221,0.001552,,entropy,5,10,"{'class_weight': None, 'criterion': 'entropy',...",0.166667,0.177778,0.181481,0.144981,0.152416,0.164665,0.014115,2
413,0.037624,0.008614,0.005179,0.001731,,gini,6,10,"{'class_weight': None, 'criterion': 'gini', 'm...",0.177778,0.177778,0.162963,0.133829,0.163569,0.163183,0.016049,3
219,0.294318,0.028975,0.012564,0.001436,balanced,entropy,6,100,"{'class_weight': 'balanced', 'criterion': 'ent...",0.151852,0.162963,0.137037,0.178439,0.182156,0.162489,0.016747,4
606,0.13579,0.009414,0.007242,0.000199,,entropy,5,50,"{'class_weight': None, 'criterion': 'entropy',...",0.166667,0.17037,0.144444,0.159851,0.171004,0.162467,0.009845,5


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [116]:
grid = list(ParameterGrid(param_grid))
print(f'Количество комбинаций суперпараметров: {len(grid)}')

Количество комбинаций суперпараметров: 784


Модуль tqdm предназначен для быстрого и расширяемого внедрения индикаторов выполнения (progressbar) во внешние интерфейсы программ на Python, предоставляя конечным пользователям визуальную индикацию хода вычислений или передачи данных.

In [117]:
data = []

for params in tqdm(grid):
    d = {}
    estimator = RandomForestClassifier(**params)
    sc = cross_val_score(estimator, X_train, y_train, cv=5, n_jobs=1)
    d = {**params, 'mean_accuracy': np.mean(sc), 'std_accuracy': np.std(sc)}
    data.append(d)

  0%|          | 0/784 [00:00<?, ?it/s]

In [118]:
result = pd.DataFrame(data)
result = result.sort_values('mean_accuracy', ascending=False)
result

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,mean_accuracy,std_accuracy
633,,entropy,12,10,0.163208,0.007834
264,balanced,entropy,18,5,0.160259,0.022694
411,,gini,5,100,0.159515,0.016283
432,,gini,11,5,0.159504,0.028044
593,,entropy,2,10,0.158733,0.020498
...,...,...,...,...,...,...
296,balanced,entropy,26,5,0.124648,0.023167
684,,entropy,25,5,0.124643,0.017889
153,balanced,gini,39,10,0.124637,0.011244
6,balanced,gini,2,50,0.122420,0.020025


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [119]:
model_forest = RandomForestClassifier(n_estimators=10, max_depth=32, criterion='gini', class_weight='balanced', random_state=21)
model_forest.fit(X_train, y_train)
y_pred = model_forest.predict(X_test)

In [120]:
accuracy_score(y_test, y_pred)

0.15384615384615385