# Day 09. Exercise 00
# Regularization

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import (
    LogisticRegression
)
from sklearn.svm import (
    SVC
)
from sklearn.tree import (
    DecisionTreeClassifier
)
from sklearn.ensemble import (
    RandomForestClassifier
)
from sklearn.metrics import (
    accuracy_score
)
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    StratifiedKFold
)
import joblib

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/dayofweek.csv')

In [3]:
X = df.drop('weekday', axis=1)
y = df['weekday']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y
)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [4]:
logreg = LogisticRegression(
    random_state=21,
    fit_intercept=False
)

kf = StratifiedKFold(
    n_splits=10,
)

scores = cross_validate(
    logreg,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.62819	0.60000
train - 0.62242	0.62963
train - 0.62242	0.54074
train - 0.63974	0.52593
train - 0.61748	0.59259
train - 0.62819	0.65185
train - 0.63974	0.56296
train - 0.63232	0.62222
train - 0.63015	0.63433
train - 0.62356	0.63433
Average accuracy on crossval is 0.59946
Std is 0.04104


In [15]:
%%timeit

logreg.fit(X_train, y_train)

26.7 ms ± 7.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [5]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [6]:
logreg = LogisticRegression(
    random_state=21,
    fit_intercept=False,
    penalty=None,
    solver='saga'
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    logreg,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.65622	0.62963
train - 0.66364	0.67407
train - 0.67848	0.63704
train - 0.66200	0.57778
train - 0.68178	0.64444
train - 0.65540	0.67407
train - 0.67519	0.59259
train - 0.65458	0.64444
train - 0.66557	0.65672
train - 0.65980	0.64925
Average accuracy on crossval is 0.63800
Std is 0.02987


In [48]:
%%timeit

logreg.fit(X_train, y_train)

130 ms ± 2.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
logreg = LogisticRegression(
    random_state=21,
    fit_intercept=False,
    penalty='l2',
    solver='saga'
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    logreg,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.62902	0.60000
train - 0.62242	0.62963
train - 0.62325	0.54074
train - 0.64056	0.52593
train - 0.61748	0.59259
train - 0.62737	0.65185
train - 0.64056	0.56296
train - 0.63397	0.62222
train - 0.62932	0.63433
train - 0.62191	0.63433
Average accuracy on crossval is 0.59946
Std is 0.04104


In [49]:
%%timeit

logreg.fit(X_train, y_train)

131 ms ± 4.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
logreg = LogisticRegression(
    random_state=21,
    fit_intercept=False,
    penalty='l1',
    solver='saga'
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    logreg,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.62984	0.59259
train - 0.62242	0.62963
train - 0.62242	0.55556
train - 0.62984	0.52593
train - 0.63067	0.61481
train - 0.62655	0.66667
train - 0.63644	0.57037
train - 0.63479	0.61481
train - 0.63674	0.63433
train - 0.63180	0.62687
Average accuracy on crossval is 0.60316
Std is 0.03995


In [50]:
%%timeit

logreg.fit(X_train, y_train)

139 ms ± 7.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [57]:
svc = SVC(
    random_state=21,
    kernel='linear',
    probability=True
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    svc,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.69744	0.65185
train - 0.70074	0.66667
train - 0.69909	0.62963
train - 0.69497	0.64444
train - 0.68096	0.68148
train - 0.69662	0.60741
train - 0.67189	0.68889
train - 0.70239	0.62222
train - 0.68699	0.69403
train - 0.70840	0.61194
Average accuracy on crossval is 0.64986
Std is 0.03034


In [55]:
%%timeit

svc.fit(X_train, y_train)

56.6 ms ± 2.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [63]:
svc = SVC(
    random_state=21,
    C=0.1
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    svc,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.56307	0.54815
train - 0.56966	0.57037
train - 0.56472	0.51852
train - 0.56472	0.51111
train - 0.57296	0.56296
train - 0.56966	0.51111
train - 0.55894	0.55556
train - 0.56884	0.54074
train - 0.57249	0.55970
train - 0.56013	0.58955
Average accuracy on crossval is 0.54678
Std is 0.02505


In [64]:
%%timeit

svc.fit(X_train, y_train)

82.3 ms ± 2.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [67]:
svc = SVC(
    random_state=21,
    C=0.5
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    svc,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.79143	0.77778
train - 0.78978	0.72593
train - 0.79390	0.70370
train - 0.79060	0.74815
train - 0.79308	0.79259
train - 0.78895	0.75556
train - 0.80462	0.79259
train - 0.77824	0.71111
train - 0.79077	0.79851
train - 0.78254	0.69403
Average accuracy on crossval is 0.74999
Std is 0.03762


In [68]:
%%timeit

svc.fit(X_train, y_train)

62 ms ± 1.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [69]:
svc = SVC(
    random_state=21,
    C=1
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    svc,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.84666	0.81481
train - 0.83924	0.74815
train - 0.85655	0.75556
train - 0.85243	0.80000
train - 0.84254	0.81481
train - 0.83677	0.82963
train - 0.83347	0.80000
train - 0.84666	0.74815
train - 0.84926	0.85075
train - 0.85091	0.76866
Average accuracy on crossval is 0.79305
Std is 0.03427


In [70]:
%%timeit

svc.fit(X_train, y_train)

52.9 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [71]:
svc = SVC(
    random_state=21,
    C=100
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    svc,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.99093	0.90370
train - 0.99093	0.91111
train - 0.99011	0.91852
train - 0.98928	0.93333
train - 0.99011	0.94074
train - 0.99176	0.94074
train - 0.99093	0.88148
train - 0.99093	0.91852
train - 0.98929	0.91791
train - 0.98929	0.90299
Average accuracy on crossval is 0.91690
Std is 0.01752


In [72]:
%%timeit

svc.fit(X_train, y_train)

40 ms ± 1.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [88]:
tree = DecisionTreeClassifier(
    random_state=21,
    max_depth=10
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    tree,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.82358	0.70370
train - 0.83759	0.79259
train - 0.81698	0.75556
train - 0.82605	0.70370
train - 0.81946	0.77778
train - 0.83512	0.76296
train - 0.84419	0.77037
train - 0.83347	0.68889
train - 0.81384	0.79851
train - 0.86656	0.73134
Average accuracy on crossval is 0.74854
Std is 0.03728


In [89]:
%%timeit

tree.fit(X_train, y_train)

3.92 ms ± 178 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [86]:
tree = DecisionTreeClassifier(
    random_state=21,
    max_depth=100
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    tree,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 1.00000	0.90370
train - 1.00000	0.91852
train - 1.00000	0.88889
train - 1.00000	0.91111
train - 1.00000	0.91852
train - 1.00000	0.94074
train - 1.00000	0.82222
train - 1.00000	0.84444
train - 1.00000	0.92537
train - 1.00000	0.85075
Average accuracy on crossval is 0.89243
Std is 0.03773


In [81]:
%%timeit

tree.fit(X_train, y_train)

4.66 ms ± 168 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [90]:
tree = DecisionTreeClassifier(
    random_state=21,
    max_depth=50
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    tree,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 1.00000	0.90370
train - 1.00000	0.91852
train - 1.00000	0.88889
train - 1.00000	0.91111
train - 1.00000	0.91852
train - 1.00000	0.94074
train - 1.00000	0.82222
train - 1.00000	0.84444
train - 1.00000	0.92537
train - 1.00000	0.85075
Average accuracy on crossval is 0.89243
Std is 0.03773


In [79]:
%%timeit

tree.fit(X_train, y_train)

4.8 ms ± 261 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [91]:
forest = RandomForestClassifier(
    n_estimators=50,
    max_depth=14,
    random_state=21
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    forest,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 0.97279	0.88889
train - 0.96455	0.86667
train - 0.97279	0.86667
train - 0.95631	0.88148
train - 0.96867	0.88889
train - 0.96208	0.92593
train - 0.96950	0.88148
train - 0.97032	0.86667
train - 0.96046	0.90299
train - 0.96540	0.84328
Average accuracy on crossval is 0.88129
Std is 0.02160


In [83]:
%%timeit

forest.fit(X_train, y_train)

87.7 ms ± 6.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [93]:
forest = RandomForestClassifier(
    n_estimators=50,
    max_depth=30,
    random_state=21
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    forest,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 1.00000	0.91852
train - 1.00000	0.90370
train - 1.00000	0.91852
train - 1.00000	0.91852
train - 1.00000	0.96296
train - 1.00000	0.95556
train - 1.00000	0.90370
train - 1.00000	0.89630
train - 1.00000	0.94030
train - 1.00000	0.88806
Average accuracy on crossval is 0.92061
Std is 0.02374


In [95]:
forest = RandomForestClassifier(
    n_estimators=85,
    max_depth=100,
    random_state=21
)

kf = StratifiedKFold(
    n_splits=10
)

scores = cross_validate(
    forest,
    X_train,
    y_train,
    cv=kf,
    return_train_score=True
)

for train, test in zip(scores['train_score'], scores['test_score']):
    print(f"train - {train:.5f}\t{test:.5f}")
print(f"Average accuracy on crossval is {np.mean(scores['test_score']):.5f}", f"Std is {np.std(scores['test_score']):.5f}", sep="\n")

train - 1.00000	0.91852
train - 1.00000	0.91852
train - 1.00000	0.91852
train - 1.00000	0.91852
train - 1.00000	0.94815
train - 1.00000	0.96296
train - 1.00000	0.89630
train - 1.00000	0.91111
train - 1.00000	0.93284
train - 1.00000	0.90299
Average accuracy on crossval is 0.92284
Std is 0.01913


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [97]:
forest.fit(X_train, y_train)

accuracy_score(forest.predict(X_test), y_test)

0.9378698224852071

In [98]:
df['predicts'] = forest.predict(X)

In [115]:
errors = pd.DataFrame({
    'weekday' : df['weekday'],
    'predicts' : df['predicts'],
    'diff' : df['predicts'] != df['weekday']
}).groupby('weekday').sum()

In [116]:
errors['diff'] = errors['diff'] / errors['predicts'] * 100

In [117]:
errors

Unnamed: 0_level_0,predicts,diff
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7,28.571429
1,278,0.359712
2,302,0.662252
3,1192,0.251678
4,420,0.952381
5,1355,0.221402
6,2118,0.283286


In [118]:
joblib.dump(forest, 'best_model.joblib')

['best_model.joblib']