# Day 09. Exercise 00
# Regularization

## 0. Imports

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [6]:
df = pd.read_csv("../data/dayofweek.csv")

In [7]:
X = df.drop(columns=["dayofweek"])
y = df["dayofweek"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=y
)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [9]:
%%time
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
model = LogisticRegression(random_state=21, fit_intercept=False)
train_acc = []
valid_acc = []

for train_index, valid_index in skf.split(X, y):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_valid_pred = model.predict(X_valid)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    valid_accuracy = accuracy_score(y_valid, y_valid_pred)
    train_acc.append(train_accuracy)
    valid_acc.append(valid_accuracy)
    print(f"train -  {train_accuracy:.5f}   |   valid -  {valid_accuracy:.5f}")
avg_accuracy = np.mean(valid_acc)
std = np.std(valid_acc)
print(f"Average accuracy on crossval is {avg_accuracy:.5f}")
print(f"Std is {std:.5f}")

train -  0.63546   |   valid -  0.65089
train -  0.65326   |   valid -  0.60947
train -  0.63942   |   valid -  0.63314
train -  0.63283   |   valid -  0.57988
train -  0.65590   |   valid -  0.57988
train -  0.64535   |   valid -  0.62130
train -  0.63834   |   valid -  0.60714
train -  0.63702   |   valid -  0.59524
train -  0.64295   |   valid -  0.68452
train -  0.63900   |   valid -  0.56548
Average accuracy on crossval is 0.61269
Std is 0.03441
CPU times: user 2.61 s, sys: 48.4 ms, total: 2.66 s
Wall time: 1.37 s


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [10]:
def evaluate_logreg(penalty, solver, C=1.0):
    train_acc = []
    valid_acc = []
    
    for train_index, valid_index in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

        model = LogisticRegression(penalty=penalty, solver=solver, C=C, random_state=21, max_iter=500)
        model.fit(X_tr, y_tr)

        y_tr_pred = model.predict(X_tr)
        y_val_pred = model.predict(X_val)

        train_acc.append(accuracy_score(y_tr, y_tr_pred))
        valid_acc.append(accuracy_score(y_val, y_val_pred))

    avg_valid_acc = np.mean(valid_acc)
    std = np.std(valid_acc)

    print(f"Penalty: {penalty} | Solver: {solver} | C: {C}")
    print(f"Average Accuracy: {avg_valid_acc:.5f}, std {std:.5f}")

In [11]:
evaluate_logreg(penalty=None, solver="lbfgs")

evaluate_logreg(penalty="l1", solver="liblinear", C=0.1)
evaluate_logreg(penalty="l1", solver="liblinear", C=1.0)
evaluate_logreg(penalty="l1", solver="saga", C=1.0)

evaluate_logreg(penalty="l2", solver="lbfgs", C=0.1)
evaluate_logreg(penalty="l2", solver="lbfgs", C=1.0)
evaluate_logreg(penalty="l2", solver="saga", C=1.0)

Penalty: None | Solver: lbfgs | C: 1.0
Average Accuracy: 0.63303, std 0.03498
Penalty: l1 | Solver: liblinear | C: 0.1
Average Accuracy: 0.49668, std 0.03564
Penalty: l1 | Solver: liblinear | C: 1.0
Average Accuracy: 0.60339, std 0.03430
Penalty: l1 | Solver: saga | C: 1.0
Average Accuracy: 0.61261, std 0.03267
Penalty: l2 | Solver: lbfgs | C: 0.1
Average Accuracy: 0.53421, std 0.03132
Penalty: l2 | Solver: lbfgs | C: 1.0
Average Accuracy: 0.61393, std 0.03454
Penalty: l2 | Solver: saga | C: 1.0
Average Accuracy: 0.61327, std 0.03337


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [17]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
def evaluate_svm(C=1.0):
    train_acc = []
    valid_acc = []

    for train_index, valid_index in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

        model = SVC(probability=True, kernel='linear', C=C, random_state=21)
        model.fit(X_tr, y_tr)
        
        y_tr_pred = model.predict(X_tr)
        y_val_pred = model.predict(X_val)
        
        train_acc.append(accuracy_score(y_tr, y_tr_pred))
        valid_acc.append(accuracy_score(y_val, y_val_pred))

        print(f"train -  {train_acc[-1]:.5f}   |   valid -  {valid_acc[-1]:.5f}")

    avg_valid_acc = np.mean(valid_acc)
    std_valid_acc = np.std(valid_acc)

    print(f"\nSVM with C={C}: Average Accuracy: {avg_valid_acc:.5f}, std {std_valid_acc:.5f}")
evaluate_svm(C=1.0)

train -  0.69473   |   valid -  0.73026
train -  0.68814   |   valid -  0.67105
train -  0.69693   |   valid -  0.67763
train -  0.70864   |   valid -  0.66447
train -  0.72108   |   valid -  0.64474
train -  0.71230   |   valid -  0.67763
train -  0.70132   |   valid -  0.72368
train -  0.71083   |   valid -  0.62500
train -  0.70373   |   valid -  0.70199
train -  0.71982   |   valid -  0.65563

SVM with C=1.0: Average Accuracy: 0.67721, std 0.03168


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [18]:
for C_value in [0.01, 0.1, 1.0, 10, 100]:
    evaluate_svm(C=C_value)
    

train -  0.39898   |   valid -  0.38816
train -  0.41435   |   valid -  0.44737
train -  0.39092   |   valid -  0.40132
train -  0.40410   |   valid -  0.40132
train -  0.35432   |   valid -  0.38158
train -  0.38653   |   valid -  0.39474
train -  0.38580   |   valid -  0.34868
train -  0.41215   |   valid -  0.38158
train -  0.38259   |   valid -  0.37748
train -  0.42648   |   valid -  0.39735

SVM with C=0.01: Average Accuracy: 0.39196, std 0.02365
train -  0.57467   |   valid -  0.61842
train -  0.59224   |   valid -  0.57237
train -  0.59224   |   valid -  0.58553
train -  0.57540   |   valid -  0.56579
train -  0.58931   |   valid -  0.57237
train -  0.60249   |   valid -  0.59868
train -  0.57687   |   valid -  0.54605
train -  0.59078   |   valid -  0.53289
train -  0.57279   |   valid -  0.58940
train -  0.58449   |   valid -  0.52318

SVM with C=0.1: Average Accuracy: 0.57047, std 0.02820
train -  0.69473   |   valid -  0.73026
train -  0.68814   |   valid -  0.67105
train -

## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [24]:
def evaluate_tree(max_depth=10, min_samples_split=2, min_samples_leaf=1, random_state=21):
    train_acc = []
    valid_acc = []

    for train_index, valid_index in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
        
        model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, 
                                       min_samples_leaf=min_samples_leaf, random_state=21)
        model.fit(X_tr, y_tr)

        y_tr_pred = model.predict(X_tr)
        y_val_pred = model.predict(X_val)

        train_acc.append(accuracy_score(y_tr, y_tr_pred))
        valid_acc.append(accuracy_score(y_val, y_val_pred))
        print(f"train -  {train_acc[-1]:.5f}   |   valid -  {valid_acc[-1]:.5f}")
        
    avg_valid_acc = np.mean(valid_acc)
    std_valid_acc = np.std(valid_acc)
    print(f"\nTree with max_depth={max_depth}: Average Accuracy: {avg_valid_acc:.5f} std {std_valid_acc:.5f}")

evaluate_tree(max_depth=10, random_state=21)
    

train -  0.81479   |   valid -  0.76974
train -  0.82430   |   valid -  0.75658
train -  0.81625   |   valid -  0.71053
train -  0.82284   |   valid -  0.76316
train -  0.82796   |   valid -  0.71053
train -  0.81991   |   valid -  0.75000
train -  0.81845   |   valid -  0.71053
train -  0.82650   |   valid -  0.75000
train -  0.82370   |   valid -  0.78146
train -  0.81858   |   valid -  0.75497

Tree with max_depth=10: Average Accuracy: 0.74575 std 0.02473


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [25]:
for depth in [3, 5, 10, 15, 20]:
    evaluate_tree(max_depth=depth, random_state=21)

train -  0.49122   |   valid -  0.53289
train -  0.49341   |   valid -  0.51316
train -  0.48975   |   valid -  0.46053
train -  0.48609   |   valid -  0.49342
train -  0.48975   |   valid -  0.50658
train -  0.48682   |   valid -  0.48684
train -  0.49341   |   valid -  0.42763
train -  0.50512   |   valid -  0.45395
train -  0.48647   |   valid -  0.49007
train -  0.50475   |   valid -  0.45695

Tree with max_depth=3: Average Accuracy: 0.48220 std 0.03036
train -  0.62445   |   valid -  0.68421
train -  0.63616   |   valid -  0.61842
train -  0.62079   |   valid -  0.61184
train -  0.61786   |   valid -  0.61842
train -  0.62006   |   valid -  0.58553
train -  0.61493   |   valid -  0.65132
train -  0.62152   |   valid -  0.58553
train -  0.63690   |   valid -  0.59211
train -  0.62473   |   valid -  0.59603
train -  0.62692   |   valid -  0.55629

Tree with max_depth=5: Average Accuracy: 0.60997 std 0.03458
train -  0.81479   |   valid -  0.76974
train -  0.82430   |   valid -  0.75

In [27]:
for depth in [5, 10, 15]:
    for min_samples_split in [2, 5, 10]:
        for min_samples_leaf in [1, 5, 10]:
            print(f"\n Testing: max_depth={depth}, min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}")
            evaluate_tree(max_depth=depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, random_state=21)


 Testing: max_depth=5, min_samples_split=2, min_samples_leaf=1
train -  0.62445   |   valid -  0.68421
train -  0.63616   |   valid -  0.61842
train -  0.62079   |   valid -  0.61184
train -  0.61786   |   valid -  0.61842
train -  0.62006   |   valid -  0.58553
train -  0.61493   |   valid -  0.65132
train -  0.62152   |   valid -  0.58553
train -  0.63690   |   valid -  0.59211
train -  0.62473   |   valid -  0.59603
train -  0.62692   |   valid -  0.55629

Tree with max_depth=5: Average Accuracy: 0.60997 std 0.03458

 Testing: max_depth=5, min_samples_split=2, min_samples_leaf=5
train -  0.62299   |   valid -  0.68421
train -  0.63250   |   valid -  0.61842
train -  0.61274   |   valid -  0.59868
train -  0.61420   |   valid -  0.61842
train -  0.61713   |   valid -  0.58553
train -  0.60908   |   valid -  0.64474
train -  0.61493   |   valid -  0.57237
train -  0.63543   |   valid -  0.59211
train -  0.61887   |   valid -  0.59603
train -  0.62253   |   valid -  0.54967

Tree with

## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [33]:
def evaluate_random_forest(n_estimators=50, max_depth=14, min_samples_split=2, min_samples_leaf=1, max_features=None, random_state=21):
    train_acc = []
    valid_acc = []

    for train_index, valid_index in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=21)
        model.fit(X_tr, y_tr)

        y_tr_pred = model.predict(X_tr)
        y_val_pred = model.predict(X_val)

        train_acc.append(accuracy_score(y_tr, y_tr_pred))
        valid_acc.append(accuracy_score(y_val, y_val_pred))

        print(f"train -  {train_acc[-1]:.5f}   |   valid -  {valid_acc[-1]:.5f}")

    avg_valid_acc = np.mean(valid_acc)
    std_valid_acc = np.std(valid_acc)

    print(f"\nRandom Forest (n_estimators={n_estimators}, max_depth={max_depth}): Average Accuracy: {avg_valid_acc:.5f} std: {std_valid_acc:.5f}")

evaluate_random_forest(n_estimators=50, max_depth=14, random_state=21)

train -  0.97657   |   valid -  0.90132
train -  0.96559   |   valid -  0.93421
train -  0.96266   |   valid -  0.87500
train -  0.97072   |   valid -  0.89474
train -  0.96999   |   valid -  0.86184
train -  0.97877   |   valid -  0.88816
train -  0.96779   |   valid -  0.92105
train -  0.97145   |   valid -  0.90132
train -  0.96489   |   valid -  0.91391
train -  0.96928   |   valid -  0.86093

Random Forest (n_estimators=50, max_depth=14): Average Accuracy: 0.89525 std: 0.02315


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [34]:
for depth in [10, 14, 18]:
    for n_trees in [50, 100, 150]:
        print(f"\n Testing: n_estimators={n_trees}, max_depth={depth}")
        evaluate_random_forest(n_estimators=n_trees, max_depth=depth)


 Testing: n_estimators=50, max_depth=10
train -  0.88507   |   valid -  0.86184
train -  0.87848   |   valid -  0.87500
train -  0.87335   |   valid -  0.78289
train -  0.89751   |   valid -  0.82237
train -  0.87701   |   valid -  0.77632
train -  0.88141   |   valid -  0.80263
train -  0.87262   |   valid -  0.78947
train -  0.85359   |   valid -  0.82237
train -  0.87783   |   valid -  0.84106
train -  0.88881   |   valid -  0.79470

Random Forest (n_estimators=50, max_depth=10): Average Accuracy: 0.81687 std: 0.03208

 Testing: n_estimators=100, max_depth=10
train -  0.88873   |   valid -  0.86842
train -  0.88067   |   valid -  0.86842
train -  0.88799   |   valid -  0.78947
train -  0.88873   |   valid -  0.82237
train -  0.88433   |   valid -  0.78947
train -  0.87408   |   valid -  0.80263
train -  0.87848   |   valid -  0.79605
train -  0.88214   |   valid -  0.82895
train -  0.89905   |   valid -  0.84106
train -  0.88003   |   valid -  0.78808

Random Forest (n_estimators=1

In [35]:
for depth in [10, 14, 18]:
    for n_trees in [50, 100, 150]:
        for min_samples_split in [2, 5, 10]:
            for min_samples_leaf in [1, 5, 10]: 
                for max_features in ["sqrt", "log2", None]:
                    print(f"\n Testing: n_estimators={n_trees}, max_depth={depth}, "
                          f"min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}, max_features={max_features}")
                    
                    evaluate_random_forest(n_estimators=n_trees, max_depth=depth,
                                           min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                           max_features=max_features)


 Testing: n_estimators=50, max_depth=10, min_samples_split=2, min_samples_leaf=1, max_features=sqrt
train -  0.88507   |   valid -  0.86184
train -  0.87848   |   valid -  0.87500
train -  0.87335   |   valid -  0.78289
train -  0.89751   |   valid -  0.82237
train -  0.87701   |   valid -  0.77632
train -  0.88141   |   valid -  0.80263
train -  0.87262   |   valid -  0.78947
train -  0.85359   |   valid -  0.82237
train -  0.87783   |   valid -  0.84106
train -  0.88881   |   valid -  0.79470

Random Forest (n_estimators=50, max_depth=10): Average Accuracy: 0.81687 std: 0.03208

 Testing: n_estimators=50, max_depth=10, min_samples_split=2, min_samples_leaf=1, max_features=log2
train -  0.88507   |   valid -  0.86184
train -  0.87848   |   valid -  0.87500
train -  0.87335   |   valid -  0.78289
train -  0.89751   |   valid -  0.82237
train -  0.87701   |   valid -  0.77632
train -  0.88141   |   valid -  0.80263
train -  0.87262   |   valid -  0.78947
train -  0.85359   |   valid - 

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [12]:
best_params = {
    "n_estimators": 150,
    "max_depth": 18,
    "min_samples_split": 10,
    "min_samples_leaf": 10,
    "max_features": "sqrt"
}

In [13]:
best_model = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    max_features=best_params["max_features"],
    random_state=21
)
best_model.fit(X_train, y_train)

In [15]:
y_test_pred = best_model.predict(X_test)

accuracy_score(y_test, y_test_pred)

0.7248520710059172

In [17]:
test_results = pd.DataFrame({"actual": y_test, "predicted": y_test_pred})
errors = test_results[test_results["actual"] != test_results["predicted"]]


In [18]:
error_counts = errors["actual"].value_counts()
error_counts

actual
2    21
0    16
6    13
3    13
5    12
1     9
4     9
Name: count, dtype: int64

In [19]:
total_counts = y_test.value_counts()
total_counts

dayofweek
3    80
6    71
1    55
5    54
2    30
0    27
4    21
Name: count, dtype: int64

In [20]:
error_percentage = (error_counts / total_counts) * 100
error_percentage = error_percentage.sort_values(ascending=False)


error_percentage.max()

np.float64(70.0)

In [21]:
error_percentage.idxmax()

np.int64(2)

Wednesday, 70 %

In [22]:
import joblib

In [None]:
joblib.dump(best_model, "best_random_forest.joblib")

In [23]:
joblib.dump(best_model, "best_random_forest.joblib")

['best_random_forest.joblib']