# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [3]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tqdm.notebook import tqdm
from sklearn.model_selection import cross_val_score

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [4]:
df = pd.read_csv("../data/day-of-week-not-scaled.csv")

In [5]:
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
X = df.drop("hour",axis=1)
y = df["hour"]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=21,stratify=y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [7]:
param_grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None]
}

In [8]:
svm_model = SVC(probability=True, random_state=21)

grid_search = GridSearchCV(svm_model, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best parameters: {best_params}")
print(f"Best accuracy: {best_score}")



Best parameters: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}
Best accuracy: 0.5059452017072834


In [9]:
results = grid_search.cv_results_

results_df = pd.DataFrame(results)

sorted_results_df = results_df.sort_values(by='rank_test_score')

print(sorted_results_df[['rank_test_score', 'param_kernel', 'param_C', 'param_gamma', 'param_class_weight', 'mean_test_score']])

    rank_test_score param_kernel  param_C param_gamma param_class_weight  \
70                1          rbf     10.0        auto               None   
64                2          rbf     10.0        auto           balanced   
58                3          rbf      5.0        auto               None   
52                4          rbf      5.0        auto           balanced   
69                5       linear     10.0        auto               None   
..              ...          ...      ...         ...                ...   
17               68      sigmoid      0.1        auto           balanced   
41               69      sigmoid      1.5        auto           balanced   
29               70      sigmoid      1.0        auto           balanced   
53               71      sigmoid      5.0        auto           balanced   
65               72      sigmoid     10.0        auto           balanced   

    mean_test_score  
70         0.505945  
64         0.465879  
58         0.422118  

## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [23]:
tree_class = DecisionTreeClassifier(random_state=21)

In [24]:
param_grid = {
    'max_depth': range(1, 50),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini']
}

In [25]:
grid_search_tree = GridSearchCV(estimator=tree_class, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search_tree.fit(X_train, y_train)

print("Best parameters:", grid_search_tree.best_params_)
print(f"Best accuracy: {grid_search_tree.best_score_}")



Best parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 39}
Best accuracy: 0.6550791683877185


In [13]:
results_df = pd.DataFrame(grid_search.cv_results_)

sorted_results = results_df.sort_values(by='rank_test_score')

print(sorted_results[['params', 'mean_test_score', 'rank_test_score']])

                                               params  mean_test_score  \
97  {'class_weight': 'balanced', 'criterion': 'gin...         0.655079   
87  {'class_weight': 'balanced', 'criterion': 'gin...         0.655079   
88  {'class_weight': 'balanced', 'criterion': 'gin...         0.655079   
89  {'class_weight': 'balanced', 'criterion': 'gin...         0.655079   
90  {'class_weight': 'balanced', 'criterion': 'gin...         0.655079   
..                                                ...              ...   
2   {'class_weight': 'balanced', 'criterion': 'ent...         0.048974   
0   {'class_weight': 'balanced', 'criterion': 'ent...         0.048203   
51  {'class_weight': 'balanced', 'criterion': 'gin...         0.034875   
1   {'class_weight': 'balanced', 'criterion': 'ent...         0.023762   
50  {'class_weight': 'balanced', 'criterion': 'gin...         0.004453   

    rank_test_score  
97                1  
87                1  
88                1  
89                1  
9

## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [14]:
tree_rand_class = RandomForestClassifier(random_state=21)

In [15]:
param_grid = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': list(range(1, 50)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini']
}

In [16]:
grid_search = GridSearchCV(estimator=tree_rand_class, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X, y)



In [17]:
results_df = pd.DataFrame(grid_search.cv_results_)

results_df_sorted = results_df.sort_values(by='rank_test_score', ascending=True)

print(results_df_sorted[['params', 'mean_test_score', 'rank_test_score']])

                                                params  mean_test_score  \
469  {'class_weight': None, 'criterion': 'entropy',...         0.263400   
467  {'class_weight': None, 'criterion': 'entropy',...         0.263391   
273  {'class_weight': 'balanced', 'criterion': 'gin...         0.262803   
95   {'class_weight': 'balanced', 'criterion': 'ent...         0.262796   
107  {'class_weight': 'balanced', 'criterion': 'ent...         0.262796   
..                                                 ...              ...   
200  {'class_weight': 'balanced', 'criterion': 'gin...         0.026697   
197  {'class_weight': 'balanced', 'criterion': 'gin...         0.025509   
0    {'class_weight': 'balanced', 'criterion': 'ent...         0.021346   
196  {'class_weight': 'balanced', 'criterion': 'gin...         0.019565   
4    {'class_weight': 'balanced', 'criterion': 'ent...         0.016015   

     rank_test_score  
469                1  
467                2  
273                3  
95     

In [18]:
best_combination = grid_search.best_params_
best_score = grid_search.best_score_
print(f"Best combination of parameters: {best_combination}")
print(f"Best accuracy score: {best_score}")

Best combination of parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 20, 'n_estimators': 10}
Best accuracy score: 0.26339964532158094


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [19]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

In [20]:
results = []
tqdm_notebook = tqdm(param_grid['n_estimators'], desc="Grid Search", leave=True)

for n_estimators in param_grid['n_estimators']:
    for max_depth in param_grid['max_depth']:
        for min_samples_split in param_grid['min_samples_split']:
            tree_rand_class = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                n_jobs=-1
            )
            cv_scores = cross_val_score(tree_rand_class, X, y, cv=5)
            results.append({
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'mean_accuracy': np.mean(cv_scores),
                'std_accuracy': np.std(cv_scores)
            })

Grid Search:   0%|          | 0/3 [00:00<?, ?it/s]



In [21]:
results_df = pd.DataFrame(results)
results_df.sort_values(by='mean_accuracy', ascending=False)

Unnamed: 0,n_estimators,max_depth,min_samples_split,mean_accuracy,std_accuracy
9,100,,2,0.260429,0.064313
6,50,20.0,2,0.259834,0.062057
24,200,20.0,2,0.25983,0.06009
18,200,,2,0.25924,0.061543
15,100,20.0,2,0.259233,0.05772
0,50,,2,0.253911,0.069405
1,50,,5,0.244398,0.059498
19,200,,5,0.241431,0.053541
10,100,,5,0.240835,0.056599
7,50,20.0,5,0.240834,0.056407


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [27]:
best_model = grid_search_tree

In [28]:
y_pred = best_model.predict(X_test)

print(f"accuracy: {accuracy_score(y_pred,y_test)}")

accuracy: 0.6893491124260355
