# Day 09. Exercise 00
# Regularization

## 0. Imports

In [14]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from math import sqrt
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [8]:
df = pd.read_csv("../data/dayofweek.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,numTrials,hour,dayofweek,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
X = df.drop("dayofweek",axis=1)
y =  df["dayofweek"]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=21,stratify=y)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [5]:
def crossval(n_splits, X, y, model):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=21)
    train_rmse, test_rmse = [], []

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        train_rmse.append(sqrt(mean_squared_error(y_train, train_pred)))
        test_rmse.append(sqrt(mean_squared_error(y_test, test_pred)))

        print(f"train - {train_rmse[-1]} | valid - {test_rmse[-1]}")

    avg_rmse = np.mean(test_rmse)
    std_rmse = np.std(test_rmse)
    print(f"Average RMSE on crossval is {avg_rmse}")
    print(f"Std is {std_rmse}")

In [6]:
model = LogisticRegression(random_state=21,fit_intercept=False)

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
%%time
crossval(10, X_train, y_train, model)

train - 1.7467435252824728 | valid - 1.7929904670172718
train - 1.7396496411250304 | valid - 1.803289175881631
train - 1.82295521197422 | valid - 1.6555182695279267
train - 1.7997435010063723 | valid - 1.8658728470829629
train - 1.7940085316704915 | valid - 1.7213259316477407
train - 1.7349042704714048 | valid - 1.9851299052557594
train - 1.7641198128053546 | valid - 1.4401645996461911
train - 1.6771585124876816 | valid - 2.0147603478476688
train - 1.8051728709196933 | valid - 1.801740452095256
train - 1.7384592394367107 | valid - 1.8264229782617136
Average RMSE on crossval is 1.7907214974264125
Std is 0.15529992424782782
CPU times: user 5min 13s, sys: 4min 43s, total: 9min 56s
Wall time: 38 s


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [9]:
model_l2 = LogisticRegression(random_state=21,fit_intercept=False,penalty="l2")

In [10]:
%%time
crossval(10, X_train, y_train, model_l2)

train - 1.7467435252824728 | valid - 1.7929904670172718
train - 1.7396496411250304 | valid - 1.803289175881631
train - 1.82295521197422 | valid - 1.6555182695279267
train - 1.7997435010063723 | valid - 1.8658728470829629
train - 1.7940085316704915 | valid - 1.7213259316477407
train - 1.7349042704714048 | valid - 1.9851299052557594
train - 1.7641198128053546 | valid - 1.4401645996461911
train - 1.6771585124876816 | valid - 2.0147603478476688
train - 1.8051728709196933 | valid - 1.801740452095256
train - 1.7384592394367107 | valid - 1.8264229782617136
Average RMSE on crossval is 1.7907214974264125
Std is 0.15529992424782782
CPU times: user 4min 58s, sys: 4min 26s, total: 9min 25s
Wall time: 35.8 s


In [11]:
model_l1 = LogisticRegression(random_state=21,fit_intercept=False,penalty="l1",solver="liblinear")

In [12]:
%%time
crossval(10, X_train, y_train, model_l1)

train - 1.595025242275327 | valid - 1.5468007604706537
train - 1.5438092736383615 | valid - 1.5682025568335423
train - 1.639626764679266 | valid - 1.5371932093796679
train - 1.595025242275327 | valid - 1.6238956361284542
train - 1.548341665335321 | valid - 1.4478592114899487
train - 1.4927690509273128 | valid - 1.9531550923607717
train - 1.5898482560700367 | valid - 1.4168300559373406
train - 1.5187750520394534 | valid - 1.7018508443151816
train - 1.5450404258878396 | valid - 1.6706420251266885
train - 1.6085131837425133 | valid - 1.6390750941244592
Average RMSE on crossval is 1.6105504486166708
Std is 0.14337054494266874
CPU times: user 1.58 s, sys: 5 s, total: 6.58 s
Wall time: 440 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [13]:
svm_model = SVC(probability=True, kernel='linear', random_state=21)

In [6]:
def crossval_reg(n_splits, X, y, model):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
    train_rmse, test_rmse = [], []

    for train_idx, test_idx in skf.split(X,y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        train_rmse.append(sqrt(mean_squared_error(y_train, train_pred)))
        test_rmse.append(sqrt(mean_squared_error(y_test, test_pred)))

        print(f"train - {train_rmse[-1]} | valid - {test_rmse[-1]}")

    avg_rmse = np.mean(test_rmse)
    std_rmse = np.std(test_rmse)
    print(f"Average RMSE on crossval is {avg_rmse}")
    print(f"Std is {std_rmse}")
    return np.mean(test_rmse)

In [None]:
%%time
crossval_reg(10, X_train, y_train, svm_model)

### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [3]:
svm_model_with_C_001 = SVC(probability=True, kernel='linear', random_state=21,C=0.01)

In [None]:
%%time
crossval_reg(10, X_train, y_train, svm_model_with_C_001)

In [None]:
svm_model_with_C_100 = SVC(probability=True, kernel='linear', random_state=21,C=100)

In [None]:
%%time
crossval_reg(10, X_train, y_train, svm_model_with_C_100)

## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [10]:
tree_reg = DecisionTreeRegressor(max_depth=10,random_state=21)

In [11]:
%%time
crossval_reg(10, X_train, y_train, tree_reg)

train - 0.15700200323231947 | valid - 0.4057001264389655
train - 0.19990811905356023 | valid - 0.7661210989624498
train - 0.15801143239164248 | valid - 0.37191665123365014
train - 0.2043654958738277 | valid - 0.5788423353499378
train - 0.20347909571889847 | valid - 0.5556536787006117
train - 0.19018125950723247 | valid - 0.6090652715910779
train - 0.19891186901501995 | valid - 0.5019242203798154
train - 0.20104395503179312 | valid - 0.20776836041118868
train - 0.19857624191967715 | valid - 0.24370030757882746
train - 0.20207279565427816 | valid - 0.5618374145046284
Average RMSE on crossval is 0.4802529465151153
Std is 0.16381205685683853
CPU times: user 85.6 ms, sys: 0 ns, total: 85.6 ms
Wall time: 88.3 ms


0.4802529465151153

### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [None]:
best_score = 0
best_params = {}

for max_depth in [5, 10, 15, 20]:
    for min_samples_split in [2, 5, 10]:
        for min_samples_leaf in [1, 2, 4]:
            model = DecisionTreeRegressor(
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                random_state=21
            )

            avg_acc = crossval_reg(10, X, y, model)

            if avg_acc > best_score:
                best_score = avg_acc
                best_params = {
                    "max_depth": max_depth,
                    "min_samples_split": min_samples_split,
                    "min_samples_leaf": min_samples_leaf
                }

print(f"\nBest parameters: {best_params}")
print(f"Best average accuracy: {best_score:.4f}")

train - 0.8059047467943282 | valid - 0.9698376817679668
train - 0.7910945392101948 | valid - 0.9487965808440596
train - 0.8241507118586684 | valid - 0.6160818218887908
train - 0.806581618410998 | valid - 0.8239755999621681
train - 0.8027450029948093 | valid - 0.8271370813712471
train - 0.8078560389583185 | valid - 0.8008400586476798
train - 0.8031815448536234 | valid - 0.8232557340758169
train - 0.8049531310400408 | valid - 0.9109827586774378
train - 0.8004327506123587 | valid - 0.9684384554258653
train - 0.7815270345605356 | valid - 1.1001614069745518
Average RMSE on crossval is 0.8789507179635583
Std is 0.12454611188672786
train - 0.8061092102167073 | valid - 0.9698376817679668
train - 0.7962854495923732 | valid - 0.9680894588528935
train - 0.8273439200878471 | valid - 0.6160818218887908
train - 0.8070582190295669 | valid - 0.8239755999621681
train - 0.806023018332797 | valid - 0.8271370813712471
train - 0.8084677936400093 | valid - 0.8017631130674744
train - 0.8036598466452297 | val

## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [12]:
tree_rang_reg = RandomForestRegressor(n_estimators=50,max_depth=14,random_state=21)

In [None]:
%%time
crossval_reg(10, X_train, y_train, tree_rang_reg)

train - 0.1371407470492873 | valid - 0.47278215169670157
train - 0.08825206724210807 | valid - 0.5944093921601937
train - 0.10609570012108155 | valid - 0.17762539478713343
train - 0.11804815483109936 | valid - 0.1986621923433512
train - 0.12511621218308525 | valid - 0.4032795975173267
train - 0.12657157159890076 | valid - 0.39613877104611445
train - 0.14740359831060357 | valid - 0.1659341955628544
train - 0.12879298880519507 | valid - 0.18119766166425158
train - 0.1333635330982997 | valid - 0.10892966373670235
train - 0.10648959080083441 | valid - 0.33361878325015426
Average RMSE on crossval is 0.3032577803764783
Std is 0.1520869475434803
CPU times: user 1.25 s, sys: 0 ns, total: 1.25 s
Wall time: 1.31 s


0.3032577803764783

### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [None]:
best_score = 0
best_params = {}

for max_depth in [5, 10, 15, 20]:
    for min_samples_split in [2, 5, 10]:
        for min_samples_leaf in [1, 2, 4]:
            model = RandomForestRegressor(
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                random_state=21
            )

            avg_acc = crossval_reg(10, X, y, model)

            if avg_acc > best_score:
                best_score = avg_acc
                best_params = {
                    "max_depth": max_depth,
                    "min_samples_split": min_samples_split,
                    "min_samples_leaf": min_samples_leaf
                }

print(f"\nBest parameters: {best_params}")
print(f"Best average accuracy: {best_score:.4f}")

train - 0.5887554748903061 | valid - 0.6772191717259829
train - 0.5506784906898337 | valid - 0.6871678622784522
train - 0.5286174057931483 | valid - 0.4365335924511342
train - 0.6371244446755532 | valid - 0.6572411611847283
train - 0.5620367877662017 | valid - 0.6403518102457892
train - 0.6369546044430711 | valid - 0.5825884418869736
train - 0.5739945164963718 | valid - 0.6169108943404001
train - 0.5124554071017025 | valid - 0.5052269520236203
train - 0.5712212016725935 | valid - 0.711408425854825
train - 0.5178605067940272 | valid - 0.8076158880020284
Average RMSE on crossval is 0.6322264199993934
Std is 0.0999444677736885
train - 0.5899058752264015 | valid - 0.6802107214446024
train - 0.5530022373120624 | valid - 0.6886374138820409
train - 0.5304489399193598 | valid - 0.44005681046967254
train - 0.6402097620573403 | valid - 0.6597710894224931
train - 0.5638665640416903 | valid - 0.6420228034141736
train - 0.6394602282097505 | valid - 0.586557028693399
train - 0.5763750823054268 | val

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [None]:
tree_rang_reg = RandomForestRegressor(n_estimators=50,max_depth=5,random_state=21,min_samples_split=10,min_samples_leaf=4)
tree_rang_reg.fit(X_train,y_train)

In [19]:
y_pred = tree_rang_reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse:.4f}")

MSE: 0.3391


In [15]:
import joblib

In [23]:
X_test['dayofweek'] = df.loc[X_test.index, 'dayofweek']
errors = abs(y_pred - y_test)
error_df = pd.DataFrame({'dayofweek': X_test['dayofweek'], 'error': errors})
error_percent = error_df.groupby('dayofweek')['error'].sum() / error_df['dayofweek'].value_counts() * 100
print(error_percent)

dayofweek
0    39.822057
1    16.106040
2    19.877076
3    20.714580
4    68.630486
5    15.733025
6    70.377723
dtype: float64


In [24]:
joblib.dump(tree_rang_reg, "my_randomforest_model.pkl")

['my_randomforest_model.pkl']