# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from itertools import product
from tqdm.notebook import tqdm
import numpy as np
from sklearn.metrics import accuracy_score

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [39]:
# Features are read from the unscaled file; the target is read from
# dayofweek.csv.
data = pd.read_csv('../data/day-of-week-not-scaled.csv')
df = pd.read_csv("../data/dayofweek.csv")

# X and y are paired purely by row position, so both files must describe the
# same number of observations; fail loudly instead of silently mis-pairing.
if len(data) != len(df):
    raise ValueError(
        f"Feature file has {len(data)} rows but target file has {len(df)} rows"
    )

# Guard against target leakage: if the feature file ever carries the label
# column, it must not be part of X. This is a no-op when the column is absent.
X = data.drop(columns=['dayofweek'], errors='ignore')
y = df['dayofweek']

df.columns

Index(['numTrials', 'hour', 'dayofweek', 'uid_user_0', 'uid_user_1',
       'uid_user_10', 'uid_user_11', 'uid_user_12', 'uid_user_13',
       'uid_user_14', 'uid_user_15', 'uid_user_16', 'uid_user_17',
       'uid_user_18', 'uid_user_19', 'uid_user_2', 'uid_user_20',
       'uid_user_21', 'uid_user_22', 'uid_user_23', 'uid_user_24',
       'uid_user_25', 'uid_user_26', 'uid_user_27', 'uid_user_28',
       'uid_user_29', 'uid_user_3', 'uid_user_30', 'uid_user_31', 'uid_user_4',
       'uid_user_6', 'uid_user_7', 'uid_user_8', 'labname_code_rvw',
       'labname_lab02', 'labname_lab03', 'labname_lab03s', 'labname_lab05s',
       'labname_laba04', 'labname_laba04s', 'labname_laba05', 'labname_laba06',
       'labname_laba06s', 'labname_project1'],
      dtype='object')

In [40]:
# Hold out 20% of the rows for testing, keeping the class proportions of y
# in both parts (stratified) and fixing the seed for reproducibility.
split_options = dict(test_size=0.2, random_state=21, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [5]:
# Search space for the SVM: 3 kernels x 6 C values x 2 gamma modes x
# 2 class-weight settings = 72 candidates.
param_grid = dict(
    kernel=['linear', 'rbf', 'sigmoid'],
    C=[0.01, 0.1, 1, 1.5, 5, 10],
    gamma=['scale', 'auto'],
    class_weight=['balanced', None],
)


In [6]:
# Base estimator: probability estimates enabled, fixed seed.
svm_model = SVC(probability=True, random_state=21)

# Exhaustive 5-fold search over param_grid, scored by accuracy, with
# n_jobs=-1 so the fits run in parallel.
search_options = dict(scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
grid_search = GridSearchCV(svm_model, param_grid, **search_options)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


GridSearchCV(cv=5, estimator=SVC(probability=True, random_state=21), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 1.5, 5, 10],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid']},
             scoring='accuracy', verbose=1)

In [7]:
# Tabulate the cross-validation results, best-ranked candidates first.
results = pd.DataFrame(grid_search.cv_results_)
results_sorted = results.sort_values('rank_test_score')

In [10]:
# Display the ten best-ranked SVM parameter combinations.
results_sorted.head(10)




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
70,1.382769,0.07527,0.098863,0.008125,10,,auto,rbf,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.9,0.848148,0.885185,0.884758,0.862454,0.876109,0.018419,1
64,1.488274,0.148946,0.093292,0.016933,10,balanced,auto,rbf,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.877778,0.851852,0.862963,0.873606,0.851301,0.8635,0.01087,2
58,1.338499,0.045946,0.100745,0.015948,5,,auto,rbf,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.825926,0.811111,0.818519,0.821561,0.802974,0.816018,0.008116,3
52,1.43254,0.097448,0.107165,0.010297,5,balanced,auto,rbf,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.844444,0.785185,0.792593,0.817844,0.802974,0.808608,0.021007,4
63,76.428372,7.446059,0.034714,0.006306,10,balanced,auto,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.72963,0.7,0.755556,0.754647,0.665428,0.721052,0.034438,5
60,73.589682,7.164392,0.033902,0.005537,10,balanced,scale,linear,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.72963,0.7,0.755556,0.754647,0.665428,0.721052,0.034438,5
66,63.59428,7.139836,0.034997,0.006871,10,,scale,linear,"{'C': 10, 'class_weight': None, 'gamma': 'scal...",0.737037,0.711111,0.707407,0.743494,0.698885,0.719587,0.017463,7
69,52.671366,5.232098,0.020037,0.004714,10,,auto,linear,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.737037,0.711111,0.707407,0.743494,0.698885,0.719587,0.017463,7
51,45.085103,2.373876,0.03681,0.019477,5,balanced,auto,linear,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.725926,0.692593,0.696296,0.754647,0.66171,0.706234,0.031619,9
48,44.929767,2.289463,0.041873,0.013991,5,balanced,scale,linear,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.725926,0.692593,0.696296,0.754647,0.66171,0.706234,0.031619,9


In [11]:
# Parameter combination with the highest mean cross-validated accuracy.
grid_search.best_params_

{'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}

## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [12]:
# Search space for the decision tree: depths 1..49, with and without class
# balancing, for both split criteria.
param_grid = dict(
    max_depth=range(1, 50),
    class_weight=[None, "balanced"],
    criterion=["gini", "entropy"],
)

# Base estimator with a fixed seed.
dt = DecisionTreeClassifier(random_state=21)


In [13]:
# 5-fold grid search over the decision-tree parameters, scored by accuracy.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 196 candidates, totalling 980 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=21), n_jobs=-1,
             param_grid={'class_weight': [None, 'balanced'],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 50)},
             scoring='accuracy', verbose=1)

In [14]:
# Collect the per-candidate cross-validation results into a DataFrame.
results_df = pd.DataFrame(grid_search.cv_results_)

In [15]:
# Best-ranked candidates first (rank 1 is the top score).
results_df = results_df.sort_values("rank_test_score", ascending=True)

In [19]:
# Show only the rank, score and searched parameters for the first ten rows.
tree_summary_columns = [
    "rank_test_score",
    "mean_test_score",
    "param_max_depth",
    "param_class_weight",
    "param_criterion",
]
results_df.head(10)[tree_summary_columns]

Unnamed: 0,rank_test_score,mean_test_score,param_max_depth,param_class_weight,param_criterion
119,1,0.873121,22,balanced,gini
118,2,0.873121,21,balanced,gini
133,3,0.873116,36,balanced,gini
139,3,0.873116,42,balanced,gini
138,3,0.873116,41,balanced,gini
137,3,0.873116,40,balanced,gini
136,3,0.873116,39,balanced,gini
135,3,0.873116,38,balanced,gini
134,3,0.873116,37,balanced,gini
142,3,0.873116,45,balanced,gini


In [17]:
# Parameter combination with the highest mean cross-validated accuracy.
grid_search.best_params_

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22}

## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it in ascending order by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [21]:
# Search space for the random forest: 4 forest sizes x depths 1..49 x
# 2 class-weight settings x 2 split criteria = 784 candidates.
param_grid = dict(
    n_estimators=[5, 10, 50, 100],
    max_depth=range(1, 50),
    class_weight=[None, "balanced"],
    criterion=["gini", "entropy"],
)

# Base estimator with a fixed seed.
rf = RandomForestClassifier(random_state=21)


In [22]:
# 5-fold grid search over the random-forest parameters, scored by accuracy.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 784 candidates, totalling 3920 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=21), n_jobs=-1,
             param_grid={'class_weight': [None, 'balanced'],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': range(1, 50),
                         'n_estimators': [5, 10, 50, 100]},
             scoring='accuracy', verbose=1)

In [23]:
# Tabulate the random-forest search results, best-ranked candidates first.
results_df = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="rank_test_score")
)


In [24]:
# Parameter combination with the highest mean cross-validated accuracy.
grid_search.best_params_

{'class_weight': None,
 'criterion': 'gini',
 'max_depth': 28,
 'n_estimators': 50}

In [26]:
# Show only the rank, score and searched parameters for the first ten rows.
forest_summary_columns = [
    "rank_test_score",
    "mean_test_score",
    "param_n_estimators",
    "param_max_depth",
    "param_class_weight",
    "param_criterion",
]
results_df.head(10)[forest_summary_columns]

Unnamed: 0,rank_test_score,mean_test_score,param_n_estimators,param_max_depth,param_class_weight,param_criterion
110,1,0.90429,50,28,,gini
123,2,0.903547,100,31,,gini
510,3,0.902817,50,30,balanced,gini
526,4,0.902809,50,34,balanced,gini
191,5,0.902806,100,48,,gini
143,5,0.902806,100,36,,gini
187,5,0.902806,100,47,,gini
155,5,0.902806,100,39,,gini
147,5,0.902806,100,37,,gini
183,5,0.902806,100,46,,gini


## 5. Progress bar

Gridsearch can be quite a long process, and you may find yourself wondering when it will end.
1. Create a manual gridsearch over the same parameter values as for the random forest, iterating through the list of possible combinations and calculating `cross_val_score` for each one. Try to increase `n_jobs`. The value of `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it in descending order by `mean_accuracy` and check whether there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [29]:
# Same value lists as the random-forest GridSearchCV above.
n_estimators, max_depth = [5, 10, 50, 100], range(1, 50)
class_weight, criterion = [None, "balanced"], ["gini", "entropy"]

# Cartesian product of the four lists; the rightmost parameter varies fastest.
param_combinations = [
    (n_est, depth, weight, crit)
    for n_est in n_estimators
    for depth in max_depth
    for weight in class_weight
    for crit in criterion
]

In [32]:
def _score_combination(n_est, depth, weight, crit):
    """Cross-validate one random-forest configuration on the training data.

    Returns a record holding the four parameter values together with the
    mean and standard deviation of the 5-fold accuracy scores.
    """
    forest = RandomForestClassifier(
        n_estimators=n_est,
        max_depth=depth,
        class_weight=weight,
        criterion=crit,
        random_state=21,
        n_jobs=-1,
    )
    fold_scores = cross_val_score(
        forest, X_train, y_train, cv=5, scoring="accuracy", n_jobs=-1
    )
    return {
        "n_estimators": n_est,
        "max_depth": depth,
        "class_weight": weight,
        "criterion": crit,
        "mean_accuracy": fold_scores.mean(),
        "std_accuracy": fold_scores.std(),
    }


# Evaluate every combination; tqdm renders the progress bar.
results = [
    _score_combination(*combination)
    for combination in tqdm(param_combinations, desc="Grid Search Progress")
]

# Best mean accuracy first.
results_df = pd.DataFrame(results).sort_values(by="mean_accuracy", ascending=False)

results_df.iloc[0]

Grid Search Progress:   0%|          | 0/784 [00:00<?, ?it/s]

n_estimators           50
max_depth              28
class_weight         None
criterion            gini
mean_accuracy     0.90429
std_accuracy     0.010961
Name: 500, dtype: object

In [33]:
# Display the ten parameter combinations with the highest mean accuracy.
results_df.head(10)

Unnamed: 0,n_estimators,max_depth,class_weight,criterion,mean_accuracy,std_accuracy
500,50,28,,gini,0.90429,0.010961
708,100,31,,gini,0.903547,0.01438
510,50,30,balanced,gini,0.902817,0.013554
526,50,34,balanced,gini,0.902809,0.01301
744,100,40,,gini,0.902806,0.01046
748,100,41,,gini,0.902806,0.01046
736,100,38,,gini,0.902806,0.01046
756,100,43,,gini,0.902806,0.01046
504,50,29,,gini,0.902806,0.011698
760,100,44,,gini,0.902806,0.01046


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [35]:

# results_df is sorted by mean accuracy in descending order, so the first
# row carries the winning hyperparameters.
best_params = results_df.iloc[0]

# Values read back from a DataFrame row are NumPy scalars, and a missing
# class_weight may surface as NaN instead of None. Normalise them so the
# estimator receives plain Python values (NaN is not a valid class_weight).
best_class_weight = best_params["class_weight"]
if pd.isna(best_class_weight):
    best_class_weight = None

best_model = RandomForestClassifier(
    n_estimators=int(best_params["n_estimators"]),
    max_depth=int(best_params["max_depth"]),
    class_weight=best_class_weight,
    criterion=str(best_params["criterion"]),
    random_state=21,
    n_jobs=-1
)


In [36]:
# Train the selected model on the full training set, then predict the
# held-out test rows (fit returns the estimator, so the calls can be chained).
y_pred = best_model.fit(X_train, y_train).predict(X_test)

In [37]:
# Final accuracy of the chosen model on the held-out test set.
accuracy_score(y_test, y_pred)

0.9289940828402367