# Day 09. Exercise 00
# Regularization

## 0. Imports

In [63]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn import metrics, svm, linear_model, tree, ensemble

import pickle

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [64]:
df = pd.read_csv('../data/dayofweek.csv')
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=21)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [65]:
def cross_validation_test(model, split_n=10):
    kf = StratifiedKFold(split_n)
    X = X_train.values
    y = y_train.values

    for train, test in kf.split(X, y):
        model.fit(X[train], y[train])

        train_accuracy = metrics.accuracy_score(model.predict(X[train]), y[train])
        test_accuracy = metrics.accuracy_score(model.predict(X[test]), y[test])

        print(f'train -  {train_accuracy:1.5f}  |  valid -  {test_accuracy:1.5f}')

    accuracy_values = cross_val_score(model, X, y, cv=kf, n_jobs=-1)
    print(f'Average accuracy on crossval is {accuracy_values.mean():1.5f}')
    print(f'Std is {accuracy_values.std():1.5f}\n')

In [66]:
%%timeit -n 1 -r 1

lr = linear_model.LogisticRegression(random_state=21, fit_intercept=False, max_iter=30000)
cross_validation_test(lr, 10)

train -  0.66529  |  valid -  0.65185
train -  0.65622  |  valid -  0.60000
train -  0.66282  |  valid -  0.65926
train -  0.66200  |  valid -  0.62963
train -  0.66200  |  valid -  0.61481
train -  0.67106  |  valid -  0.62222
train -  0.66529  |  valid -  0.66667
train -  0.63974  |  valid -  0.62222
train -  0.66227  |  valid -  0.64179
train -  0.65898  |  valid -  0.61194
Average accuracy on crossval is 0.62758
Std is 0.02683

12.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [69]:
%%timeit -n 1 -r 1

lr = linear_model.LogisticRegression(random_state=21, fit_intercept=False, max_iter=30000, penalty='none')
cross_validation_test(lr, 10)

train -  0.69580  |  valid -  0.64444
train -  0.69744  |  valid -  0.66667
train -  0.70569  |  valid -  0.67407
train -  0.71063  |  valid -  0.71111
train -  0.70569  |  valid -  0.64444
train -  0.70074  |  valid -  0.62222
train -  0.69662  |  valid -  0.71852
train -  0.69497  |  valid -  0.64444
train -  0.69440  |  valid -  0.64179
train -  0.68616  |  valid -  0.62687
Average accuracy on crossval is 0.65871
Std is 0.03038

32.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [70]:
%%timeit -n 1 -r 1

lr = linear_model.LogisticRegression(random_state=21, fit_intercept=False, max_iter=1000, penalty='l1', solver='liblinear')
cross_validation_test(lr, 10)

train -  0.64880  |  valid -  0.63704
train -  0.64551  |  valid -  0.62222
train -  0.63561  |  valid -  0.61481
train -  0.63726  |  valid -  0.64444
train -  0.63891  |  valid -  0.60000
train -  0.64468  |  valid -  0.59259
train -  0.62819  |  valid -  0.65185
train -  0.62819  |  valid -  0.59259
train -  0.64333  |  valid -  0.58209
train -  0.64003  |  valid -  0.61194
Average accuracy on crossval is 0.61496
Std is 0.02255

247 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [71]:
%%timeit -n 1 -r 1

lr = linear_model.LogisticRegression(random_state=21, fit_intercept=False, max_iter=1000, penalty='l2', solver='liblinear')
cross_validation_test(lr, 10)

train -  0.65128  |  valid -  0.62963
train -  0.62655  |  valid -  0.56296
train -  0.63479  |  valid -  0.61481
train -  0.63397  |  valid -  0.60741
train -  0.64056  |  valid -  0.57778
train -  0.64138  |  valid -  0.59259
train -  0.62737  |  valid -  0.63704
train -  0.62325  |  valid -  0.57037
train -  0.64745  |  valid -  0.60448
train -  0.64662  |  valid -  0.59701
Average accuracy on crossval is 0.59941
Std is 0.02313

220 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [None]:
%%timeit -n 1 -r 1

svc = svm.SVC(probability=True, kernel='linear', random_state=21)
cross_validation_test(svc, 10)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

train -  0.73866  |  valid -  0.68889


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [None]:
%%timeit -n 1 -r 1

svc = svm.SVC(probability=True, kernel='linear', random_state=21, C=0.1)
cross_validation_test(svc, 10)

In [None]:
%%timeit -n 1 -r 1

svc = svm.SVC(probability=True, kernel='linear', random_state=21, C=0.5)
cross_validation_test(svc, 10)

In [None]:
%%timeit -n 1 -r 1

svc = svm.SVC(probability=True, kernel='linear', random_state=21, C=1.5)
cross_validation_test(svc, 10)

In [None]:
%%timeit -n 1 -r 1

svc = svm.SVC(probability=True, kernel='linear', random_state=21, C=2)
cross_validation_test(svc, 10)

## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [None]:
%%timeit -n 1 -r 1

tree_classifier = tree.DecisionTreeClassifier(max_depth=10, random_state=21)
cross_validation_test(tree_classifier, 10)

### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [None]:
%%timeit -n 1 -r 1

tree_classifier = tree.DecisionTreeClassifier(max_depth=1, random_state=21)
cross_validation_test(tree_classifier, 10)

In [None]:
%%timeit -n 1 -r 1

tree_classifier = tree.DecisionTreeClassifier(max_depth=5, random_state=21)
cross_validation_test(tree_classifier, 10)

In [None]:
%%timeit -n 1 -r 1

tree_classifier = tree.DecisionTreeClassifier(max_depth=15, random_state=21)
cross_validation_test(tree_classifier, 10)

In [None]:
%%timeit -n 1 -r 1

tree_classifier = tree.DecisionTreeClassifier(max_depth=25, random_state=21)
cross_validation_test(tree_classifier, 10)

In [None]:
%%timeit -n 1 -r 1

tree_classifier = tree.DecisionTreeClassifier(max_depth=50, random_state=21)
cross_validation_test(tree_classifier, 10)

## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [None]:
%%timeit -n 1 -r 1

random_forest = ensemble.RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
cross_validation_test(random_forest, 10)

### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [None]:
%%timeit -n 1 -r 1

random_forest = ensemble.RandomForestClassifier(n_estimators=10, max_depth=14, random_state=21)
cross_validation_test(random_forest, 10)

In [None]:
%%timeit -n 1 -r 1

random_forest = ensemble.RandomForestClassifier(n_estimators=150, max_depth=14, random_state=21)
cross_validation_test(random_forest, 10)

In [None]:
%%timeit -n 1 -r 1

random_forest = ensemble.RandomForestClassifier(n_estimators=50, max_depth=5, random_state=21)
cross_validation_test(random_forest, 10)

In [None]:
%%timeit -n 1 -r 1

random_forest = ensemble.RandomForestClassifier(n_estimators=50, max_depth=35, random_state=21)
cross_validation_test(random_forest, 10)

In [None]:
%%timeit -n 1 -r 1

random_forest = ensemble.RandomForestClassifier(n_estimators=150, max_depth=35, random_state=21)
cross_validation_test(random_forest, 10)

In [None]:
%%timeit -n 1 -r 1

random_forest = ensemble.RandomForestClassifier(n_estimators=100, max_depth=25, random_state=21)
cross_validation_test(random_forest, 10)

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [None]:
random_forest = ensemble.RandomForestClassifier(n_estimators=100, max_depth=25, random_state=21)
random_forest.fit(X_train, y_train)

In [None]:
y_pred = random_forest.predict(X_test)
print('Total accuracy:', metrics.accuracy_score(y_pred, y_test))

In [None]:
most_wrong_predicted = y_test[y_test != y_pred].value_counts().reset_index().iloc[0]
print(f'Most wrong predicted day: {most_wrong_predicted["index"]}')

In [None]:
total_errors_frequency = most_wrong_predicted["dayofweek"] / y_test.value_counts()[most_wrong_predicted["index"]] * 100
print(f'Total percentage of errors in predicting this day: {total_errors_frequency}')

In [None]:
with open('../data/random_forest_classifier.pkl', 'wb') as fout:
    pickle.dump(random_forest, fout)

In [None]:
%ls ../data/random_forest_classifier.pkl