# Day 09. Exercise 00
# Regularization

## 0. Imports

In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics, svm, linear_model, tree, ensemble

import pickle

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used on the previous day into a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [107]:
# Load the dataset from the previous day; the CSV column named `index`
# becomes the row index of the frame.
df = pd.read_csv(
    '../data/dayofweek.csv',
    index_col='index',
)
df

Unnamed: 0_level_0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,-0.533442,0.945382,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,-0.629151,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,-0.597248,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,-0.565345,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [74]:
# Target is the weekday label; features are all remaining columns.
y = df['dayofweek'].values
X = df.drop(columns=['dayofweek']).values

In [75]:
# 80/20 split, stratified on the weekday label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [76]:
def xval(model, splits=10, X=None, y=None):
    """Evaluate `model` with stratified K-fold cross-validation and print the scores.

    For every fold the model is fitted on the training part and its accuracy is
    printed for both the training and the validation part. Afterwards the mean
    and the standard deviation of the validation accuracies are printed.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict`
        Fitted in place on every fold, so after the call it holds the fit of
        the last fold.
    splits : int, default=10
        Number of folds passed to `StratifiedKFold`.
    X, y : array-like indexable by integer arrays, optional
        Features and labels to cross-validate on. Default to the notebook-level
        `X_train` / `y_train`.

    Returns
    -------
    None
        Results are only printed.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    kf = StratifiedKFold(splits)
    valid_scores = []
    for train, test in kf.split(X, y):
        model.fit(X[train], y[train])
        # accuracy_score takes (y_true, y_pred)
        train_ac = metrics.accuracy_score(y[train], model.predict(X[train]))
        test_ac = metrics.accuracy_score(y[test], model.predict(X[test]))
        valid_scores.append(test_ac)
        print(f'train -  {train_ac:1.5f}  |  valid -  {test_ac:1.5f}')
    # Summarise the validation scores already computed above instead of
    # refitting every fold a second time through cross_val_score.
    print(f'Average accuracy on crossval is {np.mean(valid_scores):1.5f}')
    print(f'Std is {np.std(valid_scores):1.5f}\n')

In [77]:
%%timeit -n 1 -r 1

lr_model = linear_model.LogisticRegression(random_state=21, fit_intercept=False)
xval(lr_model)

train -  0.62902  |  valid -  0.59259
train -  0.64633  |  valid -  0.62963
train -  0.63479  |  valid -  0.56296
train -  0.65622  |  valid -  0.61481
train -  0.63397  |  valid -  0.57778
train -  0.64056  |  valid -  0.59259
train -  0.64138  |  valid -  0.65926
train -  0.65952  |  valid -  0.56296
train -  0.64333  |  valid -  0.59701
train -  0.63674  |  valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943

7.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

Available solvers: `newton-cg`, `lbfgs`, `liblinear`, `sag`, `saga` (default: `lbfgs`).

In [78]:
%%timeit -n 1 -r 1

lr_model = linear_model.LogisticRegression(random_state=21, fit_intercept=False, max_iter=1000, penalty='none')
xval(lr_model, 10)

train -  0.66612  |  valid -  0.63704
train -  0.65787  |  valid -  0.65926
train -  0.66694  |  valid -  0.57778
train -  0.66529  |  valid -  0.62963
train -  0.66694  |  valid -  0.62222
train -  0.65952  |  valid -  0.57778
train -  0.65045  |  valid -  0.69630
train -  0.68425  |  valid -  0.61481
train -  0.66474  |  valid -  0.62687
train -  0.65651  |  valid -  0.60448
Average accuracy on crossval is 0.62462
Std is 0.03379

4.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [79]:
%%timeit -n 1 -r 1

lr_model = linear_model.LogisticRegression(random_state=21, fit_intercept=False, max_iter=1000, penalty='l1', solver='liblinear')
xval(lr_model, 10)

train -  0.61830  |  valid -  0.54815
train -  0.62737  |  valid -  0.62222
train -  0.60511  |  valid -  0.54074
train -  0.63644  |  valid -  0.62222
train -  0.62407  |  valid -  0.55556
train -  0.62325  |  valid -  0.58519
train -  0.61253  |  valid -  0.63704
train -  0.64716  |  valid -  0.58519
train -  0.63015  |  valid -  0.59701
train -  0.61367  |  valid -  0.59701
Average accuracy on crossval is 0.58903
Std is 0.03129

344 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [80]:
%%timeit -n 1 -r 1

lr_model = linear_model.LogisticRegression(random_state=21, fit_intercept=False, max_iter=1000, penalty='l2', solver='liblinear')
xval(lr_model)

train -  0.61006  |  valid -  0.56296
train -  0.61665  |  valid -  0.61481
train -  0.61336  |  valid -  0.59259
train -  0.62902  |  valid -  0.60741
train -  0.60923  |  valid -  0.55556
train -  0.61500  |  valid -  0.57778
train -  0.61665  |  valid -  0.61481
train -  0.64056  |  valid -  0.53333
train -  0.62109  |  valid -  0.58209
train -  0.61120  |  valid -  0.57463
Average accuracy on crossval is 0.58160
Std is 0.02532

250 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

Available kernels: `linear`, `poly`, `rbf`, `sigmoid`, `precomputed`.

In [81]:
%%timeit -n 1 -r 1
svc_model = svm.SVC(probability=True, kernel='linear', random_state=21)
xval(svc_model)

train -  0.70486  |  valid -  0.65926
train -  0.69662  |  valid -  0.75556
train -  0.69415  |  valid -  0.62222
train -  0.70239  |  valid -  0.65185
train -  0.69085  |  valid -  0.65185
train -  0.68920  |  valid -  0.64444
train -  0.69250  |  valid -  0.72593
train -  0.70074  |  valid -  0.62222
train -  0.69605  |  valid -  0.61940
train -  0.71087  |  valid -  0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359

8.55 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [82]:
%%timeit -n 1 -r 1
svc_model = svm.SVC(probability=True, kernel='linear', random_state=21, C=0.1)
xval(svc_model)

train -  0.58120  |  valid -  0.55556
train -  0.57543  |  valid -  0.56296
train -  0.57378  |  valid -  0.57037
train -  0.59275  |  valid -  0.57037
train -  0.58120  |  valid -  0.54815
train -  0.57955  |  valid -  0.54815
train -  0.57296  |  valid -  0.61481
train -  0.59192  |  valid -  0.54815
train -  0.59967  |  valid -  0.52985
train -  0.57825  |  valid -  0.57463
Average accuracy on crossval is 0.56230
Std is 0.02177

7.72 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [83]:
%%timeit -n 1 -r 1
svc_model = svm.SVC(probability=True, kernel='linear', random_state=21, C=0.5)
xval(svc_model)

train -  0.66694  |  valid -  0.63704
train -  0.66612  |  valid -  0.73333
train -  0.67271  |  valid -  0.60741
train -  0.67354  |  valid -  0.62963
train -  0.67766  |  valid -  0.64444
train -  0.66529  |  valid -  0.61481
train -  0.66200  |  valid -  0.68889
train -  0.66529  |  valid -  0.57037
train -  0.67463  |  valid -  0.59701
train -  0.66804  |  valid -  0.61194
Average accuracy on crossval is 0.63349
Std is 0.04471

6.73 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [84]:
%%timeit -n 1 -r 1
svc_model = svm.SVC(probability=True, kernel='linear', random_state=21, C=2)
xval(svc_model)

train -  0.70734  |  valid -  0.65926
train -  0.71393  |  valid -  0.75556
train -  0.74526  |  valid -  0.63704
train -  0.71558  |  valid -  0.66667
train -  0.71146  |  valid -  0.67407
train -  0.70157  |  valid -  0.63704
train -  0.70651  |  valid -  0.71852
train -  0.70981  |  valid -  0.64444
train -  0.72405  |  valid -  0.64925
train -  0.72488  |  valid -  0.64179
Average accuracy on crossval is 0.66836
Std is 0.03721

7.67 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [85]:
%%timeit -n 1 -r 1
svc_model = svm.SVC(probability=True, kernel='linear', random_state=21, C=4)
xval(svc_model)

train -  0.71723  |  valid -  0.65185
train -  0.75598  |  valid -  0.82222
train -  0.77741  |  valid -  0.67407
train -  0.73619  |  valid -  0.66667
train -  0.72218  |  valid -  0.71111
train -  0.72960  |  valid -  0.69630
train -  0.72383  |  valid -  0.71852
train -  0.74444  |  valid -  0.63704
train -  0.75371  |  valid -  0.67164
train -  0.75700  |  valid -  0.68657
Average accuracy on crossval is 0.69360
Std is 0.04905

9.76 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## 4. Tree

### a. Default regularization

1. Train a baseline model with only the parameters `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [86]:
%%timeit -n 1 -r 1
tree_model = tree.DecisionTreeClassifier(max_depth=10, random_state=21)
xval(tree_model)

train -  0.81039  |  valid -  0.74074
train -  0.77741  |  valid -  0.74074
train -  0.83347  |  valid -  0.70370
train -  0.79720  |  valid -  0.76296
train -  0.82440  |  valid -  0.75556
train -  0.80379  |  valid -  0.68889
train -  0.80709  |  valid -  0.76296
train -  0.80132  |  valid -  0.65926
train -  0.80807  |  valid -  0.75373
train -  0.80478  |  valid -  0.68657
Average accuracy on crossval is 0.72551
Std is 0.03562

223 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [87]:
%%timeit -n 1 -r 1
tree_model = tree.DecisionTreeClassifier(max_depth=5, random_state=21)
xval(tree_model)

train -  0.59522  |  valid -  0.53333
train -  0.56307  |  valid -  0.53333
train -  0.60181  |  valid -  0.55556
train -  0.59604  |  valid -  0.57037
train -  0.60264  |  valid -  0.57778
train -  0.57955  |  valid -  0.53333
train -  0.58368  |  valid -  0.54815
train -  0.59275  |  valid -  0.51111
train -  0.58237  |  valid -  0.56716
train -  0.60132  |  valid -  0.50000
Average accuracy on crossval is 0.54301
Std is 0.02423

136 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [88]:
%%timeit -n 1 -r 1
tree_model = tree.DecisionTreeClassifier(max_depth=15, random_state=21)
xval(tree_model)

train -  0.95796  |  valid -  0.82222
train -  0.93075  |  valid -  0.83704
train -  0.95631  |  valid -  0.83704
train -  0.95301  |  valid -  0.86667
train -  0.95136  |  valid -  0.88889
train -  0.94724  |  valid -  0.82222
train -  0.95466  |  valid -  0.90370
train -  0.94971  |  valid -  0.87407
train -  0.95305  |  valid -  0.83582
train -  0.94316  |  valid -  0.85821
Average accuracy on crossval is 0.85459
Std is 0.02682

158 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [89]:
%%timeit -n 1 -r 1
tree_model = tree.DecisionTreeClassifier(max_depth=20, random_state=21)
xval(tree_model)

train -  0.98846  |  valid -  0.86667
train -  0.99011  |  valid -  0.91111
train -  0.98681  |  valid -  0.85926
train -  0.98763  |  valid -  0.91111
train -  0.98928  |  valid -  0.88148
train -  0.98186  |  valid -  0.85926
train -  0.98846  |  valid -  0.91852
train -  0.99176  |  valid -  0.89630
train -  0.99094  |  valid -  0.88060
train -  0.98847  |  valid -  0.88060
Average accuracy on crossval is 0.88649
Std is 0.02075

180 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [90]:
%%timeit -n 1 -r 1
tree_model = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes= 5, random_state=21)
xval(tree_model)

train -  0.44930  |  valid -  0.48889
train -  0.45260  |  valid -  0.45926
train -  0.45837  |  valid -  0.40741
train -  0.44930  |  valid -  0.48889
train -  0.45342  |  valid -  0.45185
train -  0.45177  |  valid -  0.46667
train -  0.45177  |  valid -  0.46667
train -  0.45672  |  valid -  0.42222
train -  0.42916  |  valid -  0.38806
train -  0.45552  |  valid -  0.43284
Average accuracy on crossval is 0.44727
Std is 0.03211

149 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [91]:
%%timeit -n 1 -r 1
tree_model = tree.DecisionTreeClassifier(max_depth=10, max_leaf_nodes= 45, random_state=21)
xval(tree_model)

train -  0.73619  |  valid -  0.71111
train -  0.70816  |  valid -  0.69630
train -  0.74279  |  valid -  0.67407
train -  0.72712  |  valid -  0.68148
train -  0.74114  |  valid -  0.70370
train -  0.73454  |  valid -  0.66667
train -  0.72877  |  valid -  0.71111
train -  0.72712  |  valid -  0.62222
train -  0.73970  |  valid -  0.69403
train -  0.73229  |  valid -  0.64179
Average accuracy on crossval is 0.68025
Std is 0.02824

165 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [92]:
%%timeit -n 1 -r 1
tree_model = tree.DecisionTreeClassifier(max_depth=10, min_samples_leaf= 5, random_state=21)
xval(tree_model)

train -  0.75515  |  valid -  0.71111
train -  0.71476  |  valid -  0.66667
train -  0.78153  |  valid -  0.66667
train -  0.75268  |  valid -  0.73333
train -  0.76752  |  valid -  0.73333
train -  0.75268  |  valid -  0.67407
train -  0.74608  |  valid -  0.70370
train -  0.74279  |  valid -  0.61481
train -  0.75206  |  valid -  0.68657
train -  0.74465  |  valid -  0.64179
Average accuracy on crossval is 0.68321
Std is 0.03641

166 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [93]:
%%timeit -n 1 -r 1
tree_model = tree.DecisionTreeClassifier(max_depth=10, min_samples_leaf= 2, random_state=21)
xval(tree_model)

train -  0.79637  |  valid -  0.72593
train -  0.75845  |  valid -  0.71852
train -  0.81863  |  valid -  0.67407
train -  0.78318  |  valid -  0.74074
train -  0.81039  |  valid -  0.73333
train -  0.79060  |  valid -  0.68889
train -  0.79143  |  valid -  0.74815
train -  0.78895  |  valid -  0.65185
train -  0.79077  |  valid -  0.73134
train -  0.79407  |  valid -  0.67910
Average accuracy on crossval is 0.70919
Std is 0.03129

168 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [94]:
%%timeit -n 1 -r 1
tree_model = tree.DecisionTreeClassifier(max_depth=20, min_samples_leaf= 2, random_state=21)
xval(tree_model)

train -  0.95878  |  valid -  0.85185
train -  0.95054  |  valid -  0.84444
train -  0.95466  |  valid -  0.81481
train -  0.94889  |  valid -  0.86667
train -  0.95054  |  valid -  0.88148
train -  0.94394  |  valid -  0.84444
train -  0.94724  |  valid -  0.90370
train -  0.94971  |  valid -  0.81481
train -  0.94646  |  valid -  0.81343
train -  0.95552  |  valid -  0.84328
Average accuracy on crossval is 0.84789
Std is 0.02834

162 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [95]:
%%timeit -n 1 -r 1
rf_model = ensemble.RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
xval(rf_model)

train -  0.96455  |  valid -  0.88148
train -  0.96208  |  valid -  0.91852
train -  0.96785  |  valid -  0.86667
train -  0.96455  |  valid -  0.89630
train -  0.96538  |  valid -  0.91111
train -  0.96538  |  valid -  0.88148
train -  0.97115  |  valid -  0.91852
train -  0.96867  |  valid -  0.85185
train -  0.97364  |  valid -  0.88060
train -  0.97941  |  valid -  0.86567
Average accuracy on crossval is 0.88722
Std is 0.02204

2.62 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [96]:
%%timeit -n 1 -r 1
rf_model = ensemble.RandomForestClassifier(n_estimators=10, max_depth=14, random_state=21)
xval(rf_model)

train -  0.95548  |  valid -  0.85926
train -  0.93157  |  valid -  0.88148
train -  0.95960  |  valid -  0.85926
train -  0.92910  |  valid -  0.87407
train -  0.95136  |  valid -  0.89630
train -  0.94889  |  valid -  0.84444
train -  0.93405  |  valid -  0.85185
train -  0.93899  |  valid -  0.83704
train -  0.95552  |  valid -  0.87313
train -  0.95387  |  valid -  0.85075
Average accuracy on crossval is 0.86276
Std is 0.01731

645 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [97]:
%%timeit -n 1 -r 1
rf_model = ensemble.RandomForestClassifier(n_estimators=30, max_depth=14, random_state=21)
xval(rf_model)

train -  0.96208  |  valid -  0.88148
train -  0.95383  |  valid -  0.90370
train -  0.95631  |  valid -  0.84444
train -  0.95713  |  valid -  0.89630
train -  0.96373  |  valid -  0.90370
train -  0.96125  |  valid -  0.85926
train -  0.96538  |  valid -  0.90370
train -  0.96455  |  valid -  0.88148
train -  0.96211  |  valid -  0.88060
train -  0.97035  |  valid -  0.86567
Average accuracy on crossval is 0.88203
Std is 0.01953

1.28 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [98]:
%%timeit -n 1 -r 1
rf_model = ensemble.RandomForestClassifier(n_estimators=80, max_depth=14, random_state=21)
xval(rf_model)

train -  0.96538  |  valid -  0.88148
train -  0.96785  |  valid -  0.91111
train -  0.96950  |  valid -  0.88148
train -  0.96620  |  valid -  0.89630
train -  0.97032  |  valid -  0.91111
train -  0.96620  |  valid -  0.88148
train -  0.97279  |  valid -  0.91111
train -  0.96538  |  valid -  0.86667
train -  0.97529  |  valid -  0.88060
train -  0.97611  |  valid -  0.85821
Average accuracy on crossval is 0.88795
Std is 0.01788

4.27 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [99]:
%%timeit -n 1 -r 1
rf_model = ensemble.RandomForestClassifier(n_estimators=50, max_depth=8, random_state=21)
xval(rf_model)

train -  0.79555  |  valid -  0.72593
train -  0.77246  |  valid -  0.77037
train -  0.75103  |  valid -  0.65926
train -  0.77494  |  valid -  0.74074
train -  0.78071  |  valid -  0.75556
train -  0.79637  |  valid -  0.68889
train -  0.80874  |  valid -  0.78519
train -  0.78895  |  valid -  0.66667
train -  0.79160  |  valid -  0.71642
train -  0.77430  |  valid -  0.70149
Average accuracy on crossval is 0.72105
Std is 0.04045

2.57 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [100]:
%%timeit -n 1 -r 1
rf_model = ensemble.RandomForestClassifier(n_estimators=50, max_depth=25, random_state=21)
xval(rf_model)

train -  0.99835  |  valid -  0.91111
train -  1.00000  |  valid -  0.94815
train -  1.00000  |  valid -  0.88148
train -  1.00000  |  valid -  0.94074
train -  0.99835  |  valid -  0.92593
train -  0.99835  |  valid -  0.88889
train -  1.00000  |  valid -  0.92593
train -  1.00000  |  valid -  0.89630
train -  1.00000  |  valid -  0.93284
train -  0.99918  |  valid -  0.89552
Average accuracy on crossval is 0.91469
Std is 0.02206

2.17 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [101]:
# Final model: a random forest fitted on the whole training split.
# NOTE(review): n_estimators=80 together with max_depth=25 is not one of the
# combinations cross-validated in the cells visible above (80/14 and 50/25
# were) - consider cross-validating this exact pair as well.
rf_model = ensemble.RandomForestClassifier(n_estimators=80, max_depth=25, random_state=21)
rf_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=25, n_estimators=80, random_state=21)

In [102]:
# Predict on the held-out test split and report the final accuracy.
y_pred = rf_model.predict(X_test)
# accuracy_score takes (y_true, y_pred); accuracy is the same either way,
# but the documented order keeps the call correct if the metric is swapped.
metrics.accuracy_score(y_test, y_pred)

0.9289940828402367

In [103]:
# True weekday of every misclassified test sample, counted per weekday.
errors_in = y_test[y_test != y_pred]
day, counts = np.unique(errors_in, return_counts=True)
d = dict(zip(day, counts))  # weekday -> number of misclassified samples

# The task asks for the weekday with the largest share of errors, so errors
# are ranked relative to the class size in the test set, not by raw count.
test_days, test_counts = np.unique(y_test, return_counts=True)
class_sizes = dict(zip(test_days, test_counts))

if d:
    # Built-in max() with a key replaces the manual loop, which also rebound
    # the name `max` for the rest of the kernel session.
    key_max = max(d, key=lambda weekday: d[weekday] / class_sizes[weekday])
    print(f'max errors for day {key_max}')
else:
    # No misclassified samples at all: nothing to rank.
    key_max = None
    print('no errors on the test set')

max errors for day 0


In [104]:
# Number of test samples per weekday (denominator of the error share).
# Keys must come from `days`, the unique labels of y_test computed here, so
# they always line up with `full_counts` even if some weekday has no errors.
days, full_counts = np.unique(y_test, return_counts=True)
d2 = dict(zip(days, full_counts))
d2

{0: 27, 1: 55, 2: 30, 3: 80, 4: 21, 5: 54, 6: 71}

In [105]:
# Errors for weekday `key_max` (from d) divided by the number of test samples
# of that weekday (from d2), shown as a percentage.
print(f'that is {d[key_max]/d2[key_max]*100:.2f}%')

that is 25.93%


In [106]:
# Serialize the fitted random forest to disk with pickle.
with open('../data/final_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)