# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Preprocessing

1. Read the file `day-of-week-not-scaled.csv`. It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore). Don't forget to enrich the table with the 'dayofweek' column from the previous day's .csv-file.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
# Load the unscaled feature table and attach the target column taken from
# the separate day-of-week file.
dayofweek = pd.read_csv('../data/dayofweek.csv')['dayofweek']
df = pd.read_csv('../data/day-of-week-not-scaled.csv').assign(dayofweek=dayofweek)
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [3]:
# Target is the 'dayofweek' column; features are every other column.
y = df['dayofweek']
X = df.drop('dayofweek', axis='columns')

In [4]:
# Hold out 20% of the rows for testing, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [5]:
def optimize(model, param_grid, X_train, y_train, cv=10, scoring='accuracy', n_jobs=-1):
    """Run an exhaustive grid search and return its results ranked best-first.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator passed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Parameter names mapped to the lists of values to try.
    X_train, y_train
        Training features and target used for the cross-validated search.
    cv : int or cross-validation splitter, default=10
        Forwarded to ``GridSearchCV``.
    scoring : str or callable, default='accuracy'
        Single metric used to rank the candidates. It must be a single
        metric, because the result is sorted by ``rank_test_score``.
    n_jobs : int, default=-1
        Number of parallel jobs forwarded to ``GridSearchCV``.

    Returns
    -------
    pandas.DataFrame
        ``cv_results_`` as a frame sorted ascending by ``rank_test_score``.
        The index is not reset, so it still identifies the candidate's
        position in the original grid.
    """
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
    )
    grid_search.fit(X_train, y_train)

    return pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')

In [6]:
# SVC search space: 3 kernels x 6 C values x 2 gammas x 2 class weights
# = 72 candidate combinations.
param_grid = dict(
    kernel=['linear', 'rbf', 'sigmoid'],
    C=[0.01, 0.1, 1, 1.5, 5, 10],
    gamma=['scale', 'auto'],
    class_weight=[None, 'balanced'],
)

svc = SVC(probability=True, random_state=21)
# NOTE: this rebinds `df`, replacing the raw feature table with the search results.
df = optimize(svc, param_grid, X_train, y_train)

In [7]:
# Best-ranked SVC combinations (results are sorted by rank_test_score).
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
64,0.75181,0.011399,0.016381,0.003538,10.0,,auto,rbf,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.874074,...,0.859259,0.903704,0.903704,0.881481,0.881481,0.880597,0.880597,0.885749,0.018843,1
70,0.761852,0.02013,0.018602,0.003762,10.0,balanced,auto,rbf,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.844444,...,0.866667,0.866667,0.903704,0.888889,0.859259,0.880597,0.850746,0.873875,0.022551,2
52,0.680324,0.022163,0.020879,0.003862,5.0,,auto,rbf,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.814815,...,0.807407,0.822222,0.866667,0.874074,0.77037,0.828358,0.843284,0.832349,0.036143,3
58,0.776055,0.048799,0.023879,0.004544,5.0,balanced,auto,rbf,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.822222,...,0.785185,0.807407,0.851852,0.844444,0.785185,0.820896,0.791045,0.820453,0.03153,4
60,54.405774,3.256316,0.006509,0.004363,10.0,,scale,linear,"{'C': 10, 'class_weight': None, 'gamma': 'scal...",0.725926,...,0.733333,0.777778,0.696296,0.740741,0.688889,0.716418,0.69403,0.728452,0.043573,5


In [8]:
# Worst-ranked SVC combinations (end of the rank-sorted results).
df.tail()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
59,0.812524,0.063646,0.013258,0.007633,5.0,balanced,auto,sigmoid,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.133333,...,0.081481,0.140741,0.125926,0.133333,0.118519,0.08209,0.141791,0.12461,0.022866,68
23,0.907105,0.054453,0.012044,0.003898,0.1,balanced,auto,sigmoid,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.088889,...,0.088889,0.088889,0.088889,0.088889,0.088889,0.208955,0.208955,0.112902,0.048027,69
71,0.693494,0.021459,0.012577,0.003931,10.0,balanced,auto,sigmoid,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.118519,...,0.074074,0.125926,0.111111,0.103704,0.118519,0.08209,0.104478,0.110509,0.019328,70
47,0.925262,0.055347,0.013959,0.003404,1.5,balanced,auto,sigmoid,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.051852,...,0.096296,0.066667,0.044444,0.088889,0.081481,0.097015,0.08209,0.080133,0.01875,71
35,0.88961,0.070078,0.01093,0.002991,1.0,balanced,auto,sigmoid,"{'C': 1, 'class_weight': 'balanced', 'gamma': ...",0.044444,...,0.074074,0.074074,0.037037,0.059259,0.074074,0.059701,0.059701,0.062311,0.013315,72


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [9]:
# Decision-tree search space: 49 depths x 2 class weights x 2 criteria.
param_grid = dict(
    max_depth=[*range(1, 50)],          # from 1 to 49
    class_weight=['balanced', None],    # two options
    criterion=['entropy', 'gini'],      # two options
)
dtree = DecisionTreeClassifier(random_state=21)
df = optimize(dtree, param_grid, X_train, y_train)

In [10]:
# Best-ranked decision-tree combinations (results are sorted by rank_test_score).
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
24,0.012078,0.004755,0.000954,0.002158,balanced,entropy,25,"{'class_weight': 'balanced', 'criterion': 'ent...",0.874074,0.918519,...,0.896296,0.881481,0.888889,0.911111,0.911111,0.873134,0.873134,0.890182,0.017012,1
25,0.010376,0.005129,0.002203,0.004755,balanced,entropy,26,"{'class_weight': 'balanced', 'criterion': 'ent...",0.874074,0.918519,...,0.896296,0.881481,0.888889,0.911111,0.911111,0.873134,0.873134,0.890182,0.017012,1
26,0.008547,0.006312,0.002129,0.003404,balanced,entropy,27,"{'class_weight': 'balanced', 'criterion': 'ent...",0.874074,0.918519,...,0.896296,0.881481,0.888889,0.911111,0.911111,0.873134,0.873134,0.890182,0.017012,1
27,0.004703,0.003983,0.004786,0.004614,balanced,entropy,28,"{'class_weight': 'balanced', 'criterion': 'ent...",0.874074,0.918519,...,0.896296,0.881481,0.888889,0.911111,0.911111,0.873134,0.873134,0.890182,0.017012,1
28,0.007831,0.005598,0.002572,0.00342,balanced,entropy,29,"{'class_weight': 'balanced', 'criterion': 'ent...",0.874074,0.918519,...,0.896296,0.881481,0.888889,0.911111,0.911111,0.873134,0.873134,0.890182,0.017012,1


In [11]:
# Worst-ranked decision-tree combinations (end of the rank-sorted results).
df.tail()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
51,0.004105,0.004715,0.00229,0.003964,balanced,gini,3,"{'class_weight': 'balanced', 'criterion': 'gin...",0.377778,0.377778,...,0.318519,0.340741,0.466667,0.451852,0.4,0.343284,0.365672,0.379784,0.045338,192
98,0.00458,0.006323,0.0,0.0,,entropy,1,"{'class_weight': None, 'criterion': 'entropy',...",0.37037,0.362963,...,0.362963,0.355556,0.37037,0.355556,0.348148,0.343284,0.335821,0.355318,0.010945,193
147,0.004475,0.004156,0.001864,0.003125,,gini,1,"{'class_weight': None, 'criterion': 'gini', 'm...",0.37037,0.362963,...,0.362963,0.355556,0.37037,0.355556,0.348148,0.343284,0.335821,0.355318,0.010945,193
0,0.00476,0.000615,0.002047,0.000938,balanced,entropy,1,"{'class_weight': 'balanced', 'criterion': 'ent...",0.266667,0.259259,...,0.348148,0.311111,0.281481,0.325926,0.311111,0.268657,0.313433,0.298209,0.027378,195
49,0.003166,0.005276,0.002473,0.003919,balanced,gini,1,"{'class_weight': 'balanced', 'criterion': 'gin...",0.266667,0.259259,...,0.348148,0.311111,0.281481,0.325926,0.311111,0.268657,0.313433,0.298209,0.027378,195


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [12]:
# Random-forest search space: 4 forest sizes x 49 depths x 2 class weights
# x 2 criteria = 784 candidate combinations.
param_grid = dict(
    n_estimators=[5, 10, 50, 100],
    max_depth=[*range(1, 50)],  # from 1 to 49
    class_weight=['balanced', None],
    criterion=['entropy', 'gini'],
)
rfor = RandomForestClassifier(random_state=21)
df = optimize(rfor, param_grid, X_train, y_train)

In [13]:
# Best-ranked random-forest combinations (results are sorted by rank_test_score).
df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
283,0.264028,0.004188,0.006988,0.002472,balanced,gini,22,100,"{'class_weight': 'balanced', 'criterion': 'gin...",0.911111,...,0.925926,0.918519,0.911111,0.925926,0.918519,0.932836,0.910448,0.920625,0.015201,1
519,0.293225,0.007254,0.009138,0.006068,,entropy,32,100,"{'class_weight': None, 'criterion': 'entropy',...",0.903704,...,0.940741,0.911111,0.903704,0.925926,0.903704,0.932836,0.895522,0.919132,0.018341,2
527,0.286965,0.007782,0.010968,0.002739,,entropy,34,100,"{'class_weight': None, 'criterion': 'entropy',...",0.903704,...,0.940741,0.911111,0.911111,0.925926,0.903704,0.925373,0.895522,0.919127,0.017439,3
507,0.294042,0.006515,0.009817,0.004981,,entropy,29,100,"{'class_weight': None, 'criterion': 'entropy',...",0.903704,...,0.933333,0.911111,0.911111,0.925926,0.911111,0.925373,0.895522,0.919127,0.018057,3
503,0.288834,0.006431,0.00838,0.0042,,entropy,28,100,"{'class_weight': None, 'criterion': 'entropy',...",0.903704,...,0.933333,0.911111,0.911111,0.925926,0.911111,0.925373,0.895522,0.919127,0.018057,3


In [14]:
# Worst-ranked random-forest combinations (end of the rank-sorted results).
df.tail()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
393,0.019445,0.003462,0.00218,0.002057,,entropy,1,10,"{'class_weight': None, 'criterion': 'entropy',...",0.37037,...,0.362963,0.37037,0.385185,0.362963,0.348148,0.343284,0.335821,0.359022,0.014181,780
4,0.012806,0.005429,0.001521,0.003255,balanced,entropy,2,5,"{'class_weight': 'balanced', 'criterion': 'ent...",0.288889,...,0.318519,0.318519,0.37037,0.414815,0.296296,0.335821,0.365672,0.34052,0.036346,781
200,0.011072,0.003923,0.004737,0.004775,balanced,gini,2,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.37037,...,0.37037,0.266667,0.37037,0.340741,0.274074,0.276119,0.350746,0.32565,0.039988,782
0,0.014087,0.005651,0.005575,0.004841,balanced,entropy,1,5,"{'class_weight': 'balanced', 'criterion': 'ent...",0.340741,...,0.192593,0.22963,0.251852,0.281481,0.325926,0.253731,0.291045,0.275218,0.042329,783
196,0.012726,0.003997,0.001657,0.003099,balanced,gini,1,5,"{'class_weight': 'balanced', 'criterion': 'gin...",0.244444,...,0.214815,0.22963,0.288889,0.244444,0.325926,0.253731,0.335821,0.268585,0.037664,784


## 5. Progress bar

Gridsearch can be quite a long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [15]:
def optimize_with_tqdm(model, param_grid, X_train, y_train, cv=5, n_jobs=-1):
    """Manual grid search with a progress bar, scored by cross-validated accuracy.

    Every combination produced by ``ParameterGrid(param_grid)`` is evaluated
    with ``cross_val_score``. The estimator passed in is never fitted or
    reconfigured: each combination is applied to a fresh clone.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn estimator used as the template for each candidate.
    param_grid : dict or list of dict
        Parameter names mapped to the lists of values to try.
    X_train, y_train
        Training features and target used for cross-validation.
    cv : int or cross-validation splitter, default=5
        Forwarded to ``cross_val_score``.
    n_jobs : int, default=-1
        Number of parallel jobs forwarded to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with a column for each parameter plus
        ``mean_accuracy`` and ``std_accuracy``, sorted descending by
        ``mean_accuracy``.
    """
    # Local import: `clone` is not part of the notebook's import cell.
    from sklearn.base import clone

    results = []
    for params in tqdm(list(ParameterGrid(param_grid))):
        # cross_val_score fits its own copies on each fold, so a separate
        # fit on the full training set would only waste time.
        candidate = clone(model).set_params(**params)
        scores = cross_val_score(
            candidate, X_train, y_train,
            scoring='accuracy', cv=cv, n_jobs=n_jobs,
        )
        results.append({
            **params,
            'mean_accuracy': scores.mean(),
            'std_accuracy': scores.std()
        })

    results_df = pd.DataFrame(results)
    return results_df.sort_values(by='mean_accuracy', ascending=False)

In [22]:
# Manual grid search with a progress bar; reuses the `rfor` estimator and
# `param_grid` defined in the random forest section.
df = optimize_with_tqdm(rfor, param_grid, X_train, y_train)

  7%|▋         | 52/784 [00:18<04:22,  2.79it/s]


KeyboardInterrupt: 

In [17]:
# Combinations with the highest mean_accuracy (results are sorted descending).
df.head()

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,mean_accuracy,std_accuracy
698,,gini,28,50,0.90429,0.010961
711,,gini,31,100,0.903547,0.01438
314,balanced,gini,30,50,0.902817,0.013554
330,balanced,gini,34,50,0.902809,0.01301
735,,gini,37,100,0.902806,0.01046


In [18]:
# Combinations with the lowest mean_accuracy (end of the sorted results).
df.tail()

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,mean_accuracy,std_accuracy
392,,entropy,1,5,0.353832,0.016467
4,balanced,entropy,2,5,0.35311,0.021165
200,balanced,gini,2,5,0.346419,0.029749
196,balanced,gini,1,5,0.28339,0.011062
0,balanced,entropy,1,5,0.270794,0.024718


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [19]:
# Random forest configured with the hyperparameters chosen for the final model.
# NOTE(review): these values differ from the rank-1 row of the GridSearchCV
# output above (class_weight='balanced', criterion='gini', max_depth=22,
# n_estimators=100) — confirm which combination was intended.
rfor = RandomForestClassifier(
    n_estimators=100,
    max_depth=16,
    criterion='entropy',
    class_weight='balanced',
    random_state=21,
)

In [20]:
# Train on the training split, then predict the held-out test rows.
# fit() returns the estimator itself, so the two calls can be chained.
pred = rfor.fit(X_train, y_train).predict(X_test)

In [21]:
# Accuracy of the final model's predictions on the held-out test split.
accuracy_score(y_test, pred)

0.9201183431952663