# Day 09. Exercise 00
# Regularization

## 0. Imports

In [95]:
import time
from itertools import product

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [96]:
# Load the day-of-week dataset from the shared data directory and print a summary
# of its columns (non-null counts and dtypes).
df = pd.read_csv("../data/dayofweek.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1686 entries, 0 to 1685
Data columns (total 44 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   numTrials         1686 non-null   float64
 1   hour              1686 non-null   float64
 2   dayofweek         1686 non-null   int64  
 3   uid_user_0        1686 non-null   float64
 4   uid_user_1        1686 non-null   float64
 5   uid_user_10       1686 non-null   float64
 6   uid_user_11       1686 non-null   float64
 7   uid_user_12       1686 non-null   float64
 8   uid_user_13       1686 non-null   float64
 9   uid_user_14       1686 non-null   float64
 10  uid_user_15       1686 non-null   float64
 11  uid_user_16       1686 non-null   float64
 12  uid_user_17       1686 non-null   float64
 13  uid_user_18       1686 non-null   float64
 14  uid_user_19       1686 non-null   float64
 15  uid_user_2        1686 non-null   float64
 16  uid_user_20       1686 non-null   float64


In [97]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [98]:
# Separate the target column from the feature columns.
y = df['dayofweek']
X = df.drop('dayofweek', axis=1)

# Hold out 20% of the rows as the test set. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=21,
    test_size=0.2,
)

# Display the shapes of the four resulting objects.
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((1348, 43), (338, 43), (1348,), (338,))

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [99]:
# Baseline logistic regression: default regularization settings, no intercept
# term, fixed seed.
lr = LogisticRegression(random_state=21, fit_intercept=False)

In [100]:
# StratifiedKFold takes the distribution of the target y into account: each fold
# keeps (approximately) the same class proportions as the full dataset.
# For example, if 70% of the samples belong to class 0 and 30% to class 1,
# every fold preserves that 70/30 ratio.

def crossval(model, X, y, n_splits=10):
    """Evaluate ``model`` with stratified K-fold cross-validation and print the scores.

    For every fold the model is refit on the training part and its accuracy is
    measured on both the training and the validation part. One line per fold is
    printed, followed by the mean and standard deviation of the validation
    accuracies.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict`` methods
        Refit in place on every fold; after the call it holds the fit from the
        last fold.
    X : pandas.DataFrame
        Feature matrix (indexed positionally via ``.iloc``).
    y : pandas.Series
        Target labels aligned with ``X`` (indexed positionally via ``.iloc``).
    n_splits : int, default 10
        Number of folds.

    Returns
    -------
    None
        The results are only printed.
    """
    train_scores = []
    valid_scores = []
    # Honor the caller's n_splits rather than a hard-coded fold count.
    skf = StratifiedKFold(n_splits=n_splits)
    for train, valid in skf.split(X, y):
        X_train, X_valid = X.iloc[train], X.iloc[valid]
        y_train, y_valid = y.iloc[train], y.iloc[valid]
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_valid_pred = model.predict(X_valid)
        train_scores.append(accuracy_score(y_train, y_train_pred))
        valid_scores.append(accuracy_score(y_valid, y_valid_pred))

    for train_score, valid_score in zip(train_scores, valid_scores):
        print(f'train - {train_score:.5f} | valid - {valid_score:.5f}')

    print(f'Average accuracy on crossval is {np.mean(valid_scores):.5f}')
    print(f'Std is {np.std(valid_scores):.5f}')


In [101]:
%%time
# Cross-validate the baseline logistic regression on the training split
# (crossval uses 10 stratified folds by default).
crossval(lr, X_train, y_train)

train - 0.62819 | valid - 0.59259
train - 0.64716 | valid - 0.62963
train - 0.63479 | valid - 0.57037
train - 0.65540 | valid - 0.61481
train - 0.63314 | valid - 0.57778
train - 0.64056 | valid - 0.59259
train - 0.64221 | valid - 0.65926
train - 0.65952 | valid - 0.56296
train - 0.64333 | valid - 0.59701
train - 0.63591 | valid - 0.62687
Average accuracy on crossval is 0.60239
Std is 0.02852
CPU times: user 83.7 ms, sys: 2.98 ms, total: 86.7 ms
Wall time: 87.1 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [118]:
%%time
lr = LogisticRegression(penalty=None, max_iter=1000, random_state=21, fit_intercept=False)
crossval(lr, X_train, y_train)

train - 0.66612 | valid - 0.63704
train - 0.65705 | valid - 0.65926
train - 0.66694 | valid - 0.57778
train - 0.66529 | valid - 0.62963
train - 0.66777 | valid - 0.61481
train - 0.65870 | valid - 0.57778
train - 0.65045 | valid - 0.69630
train - 0.68425 | valid - 0.61481
train - 0.66392 | valid - 0.62687
train - 0.65733 | valid - 0.60448
Average accuracy on crossval is 0.62388
Std is 0.03392
CPU times: user 236 ms, sys: 14.9 ms, total: 251 ms
Wall time: 251 ms


In [103]:
%%time
lr = LogisticRegression(random_state=21, fit_intercept=False, solver='liblinear', penalty='l1')
crossval(lr, X_train, y_train)

train - 0.61830 | valid - 0.54815
train - 0.62737 | valid - 0.62222
train - 0.60511 | valid - 0.54074
train - 0.63644 | valid - 0.62222
train - 0.62407 | valid - 0.55556
train - 0.62325 | valid - 0.58519
train - 0.61253 | valid - 0.63704
train - 0.64716 | valid - 0.58519
train - 0.63015 | valid - 0.59701
train - 0.61367 | valid - 0.59701
Average accuracy on crossval is 0.58903
Std is 0.03129
CPU times: user 105 ms, sys: 3.11 ms, total: 109 ms
Wall time: 113 ms


In [104]:
%%time
lr = LogisticRegression(random_state=21, fit_intercept=False, solver='liblinear', penalty='l2')
crossval(lr, X_train, y_train)

train - 0.61006 | valid - 0.56296
train - 0.61665 | valid - 0.61481
train - 0.61336 | valid - 0.59259
train - 0.62902 | valid - 0.60741
train - 0.60923 | valid - 0.55556
train - 0.61500 | valid - 0.57778
train - 0.61665 | valid - 0.61481
train - 0.64056 | valid - 0.53333
train - 0.62109 | valid - 0.58209
train - 0.61120 | valid - 0.57463
Average accuracy on crossval is 0.58160
Std is 0.02532
CPU times: user 67.4 ms, sys: 2.46 ms, total: 69.9 ms
Wall time: 70.1 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [113]:
# Baseline linear SVM evaluated with 10-fold stratified cross-validation on the
# training split. Prints the train/validation accuracy of every fold, then the
# mean and standard deviation of the validation accuracy and the elapsed time.
svm_model = SVC(probability=True, kernel='linear', random_state=21)

# Shuffled folds with a fixed seed, so the split is reproducible between runs.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_scores, valid_scores = [], []

# perf_counter() is the clock intended for measuring elapsed time; time.time()
# follows the system clock and can jump if that clock is adjusted.
start_time = time.perf_counter()

for train_idx, valid_idx in kf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # The same estimator object is refit from scratch on every fold.
    svm_model.fit(X_tr, y_tr)

    train_acc = accuracy_score(y_tr, svm_model.predict(X_tr))
    valid_acc = accuracy_score(y_val, svm_model.predict(X_val))

    train_scores.append(train_acc)
    valid_scores.append(valid_acc)

    print(f"train -  {train_acc:.5f}   |   valid -  {valid_acc:.5f}")

avg_acc = np.mean(valid_scores)
std_acc = np.std(valid_scores)

print(f"Average accuracy on crossval is {avg_acc:.5f}")
print(f"Std is {std_acc:.5f}")

print(f"Execution time: {time.perf_counter() - start_time:.2f} seconds")

train -  0.70651   |   valid -  0.68148
train -  0.68920   |   valid -  0.64444
train -  0.69744   |   valid -  0.66667
train -  0.68920   |   valid -  0.65926
train -  0.69497   |   valid -  0.63704
train -  0.68673   |   valid -  0.68148
train -  0.69827   |   valid -  0.61481
train -  0.70486   |   valid -  0.57778
train -  0.68863   |   valid -  0.72388
train -  0.71005   |   valid -  0.64179
Average accuracy on crossval is 0.65286
Std is 0.03800
Execution time: 1.83 seconds


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [106]:
# Sweep the SVM regularization strength C (smaller C = stronger regularization)
# and report cross-validated accuracy for each value.
C_values = [0.01, 0.1, 1, 10, 100]

# Created once: with an integer random_state every split() call yields the same
# folds, so all C values are compared on identical splits.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

for C in C_values:
    print(f"\nTraining SVM with C = {C}")

    svm_model = SVC(probability=True, kernel='linear', random_state=21, C=C)

    train_scores, valid_scores = [], []
    # perf_counter() is the clock intended for elapsed-time measurement.
    start_time = time.perf_counter()

    for train_idx, valid_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        svm_model.fit(X_tr, y_tr)

        train_acc = accuracy_score(y_tr, svm_model.predict(X_tr))
        valid_acc = accuracy_score(y_val, svm_model.predict(X_val))

        train_scores.append(train_acc)
        valid_scores.append(valid_acc)

        print(f"train -  {train_acc:.5f}   |   valid -  {valid_acc:.5f}")

    avg_acc = np.mean(valid_scores)
    std_acc = np.std(valid_scores)

    print(f"Average accuracy on crossval is {avg_acc:.5f}")
    print(f"Std is {std_acc:.5f}")
    print(f"Execution time: {time.perf_counter() - start_time:.2f} seconds")



Training SVM with C = 0.01
train -  0.37923   |   valid -  0.40000
train -  0.38005   |   valid -  0.39259
train -  0.38252   |   valid -  0.37037
train -  0.37840   |   valid -  0.40741
train -  0.38170   |   valid -  0.37778
train -  0.37840   |   valid -  0.40741
train -  0.38335   |   valid -  0.36296
train -  0.37923   |   valid -  0.40000
train -  0.38386   |   valid -  0.35821
train -  0.38633   |   valid -  0.33582
Average accuracy on crossval is 0.38125
Std is 0.02293
Execution time: 2.21 seconds

Training SVM with C = 0.1
train -  0.57049   |   valid -  0.55556
train -  0.56884   |   valid -  0.59259
train -  0.57543   |   valid -  0.54074
train -  0.56142   |   valid -  0.60000
train -  0.59110   |   valid -  0.57037
train -  0.57873   |   valid -  0.53333
train -  0.59687   |   valid -  0.54074
train -  0.59439   |   valid -  0.52593
train -  0.56590   |   valid -  0.58209
train -  0.58731   |   valid -  0.53731
Average accuracy on crossval is 0.55787
Std is 0.02522
Execut

## 4. Tree

### a. Default regularization

1. Train a baseline model with only the parameters `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [107]:
# Baseline decision tree (depth limited to 10) evaluated with 10-fold stratified
# cross-validation on the training split. Prints per-fold train/validation
# accuracy, then the mean/std of the validation accuracy and the elapsed time.
tree_model = DecisionTreeClassifier(max_depth=10, random_state=21)

# Shuffled folds with a fixed seed, so the split is reproducible between runs.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_scores, valid_scores = [], []
# perf_counter() is the clock intended for measuring elapsed time; time.time()
# follows the system clock and can jump if that clock is adjusted.
start_time = time.perf_counter()

for train_idx, valid_idx in kf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # The same estimator object is refit from scratch on every fold.
    tree_model.fit(X_tr, y_tr)

    train_acc = accuracy_score(y_tr, tree_model.predict(X_tr))
    valid_acc = accuracy_score(y_val, tree_model.predict(X_val))

    train_scores.append(train_acc)
    valid_scores.append(valid_acc)

    print(f"train -  {train_acc:.5f}   |   valid -  {valid_acc:.5f}")

avg_acc = np.mean(valid_scores)
std_acc = np.std(valid_scores)

print(f"Average accuracy on crossval is {avg_acc:.5f}")
print(f"Std is {std_acc:.5f}")
print(f"Execution time: {time.perf_counter() - start_time:.2f} seconds")

train -  0.80874   |   valid -  0.77037
train -  0.79802   |   valid -  0.70370
train -  0.81286   |   valid -  0.72593
train -  0.80049   |   valid -  0.74815
train -  0.80956   |   valid -  0.68889
train -  0.78978   |   valid -  0.74074
train -  0.80627   |   valid -  0.60741
train -  0.82688   |   valid -  0.71111
train -  0.78995   |   valid -  0.79104
train -  0.80313   |   valid -  0.70896
Average accuracy on crossval is 0.71963
Std is 0.04791
Execution time: 0.04 seconds


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [108]:
# Exhaustive search over four decision-tree regularization parameters; every
# combination is scored by the mean validation accuracy of 10-fold stratified CV.
max_depth_values = [5, 10, 15, 20]
min_samples_split_values = [2, 5, 10]
min_samples_leaf_values = [1, 5, 10]
max_features_values = [None, 'sqrt', 'log2']

param_grid = list(product(max_depth_values, min_samples_split_values, min_samples_leaf_values, max_features_values))

# Created once: with an integer random_state every split() call yields the same
# folds, so all combinations are compared on identical splits.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

# Best mean validation accuracy seen so far and the parameters that produced it.
best_acc = 0
best_params = None

print(f"Testing {len(param_grid)} combinations...\n")

for max_depth, min_samples_split, min_samples_leaf, max_features in param_grid:
    print(f"Testing: max_depth={max_depth}, min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}, max_features={max_features}")

    tree_model = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=21
    )

    train_scores, valid_scores = [], []
    # perf_counter() is the clock intended for elapsed-time measurement.
    start_time = time.perf_counter()

    for train_idx, valid_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        tree_model.fit(X_tr, y_tr)

        train_acc = accuracy_score(y_tr, tree_model.predict(X_tr))
        valid_acc = accuracy_score(y_val, tree_model.predict(X_val))

        train_scores.append(train_acc)
        valid_scores.append(valid_acc)

    avg_acc = np.mean(valid_scores)
    std_acc = np.std(valid_scores)

    print(f"Average accuracy on crossval is {avg_acc:.5f}")
    print(f"Std is {std_acc:.5f}")
    print(f"Execution time: {time.perf_counter() - start_time:.2f} seconds\n")

    # Strict comparison: on a tie the earlier combination is kept.
    if avg_acc > best_acc:
        best_acc = avg_acc
        best_params = (max_depth, min_samples_split, min_samples_leaf, max_features)

print(f"Best accuracy: {best_acc:.5f}")
print(f"Best parameters: max_depth={best_params[0]}, min_samples_split={best_params[1]}, min_samples_leaf={best_params[2]}, max_features={best_params[3]}")


Testing 108 combinations...

Testing: max_depth=5, min_samples_split=2, min_samples_leaf=1, max_features=None
Average accuracy on crossval is 0.54448
Std is 0.04014
Execution time: 0.03 seconds

Testing: max_depth=5, min_samples_split=2, min_samples_leaf=1, max_features=sqrt
Average accuracy on crossval is 0.47999
Std is 0.02998
Execution time: 0.10 seconds

Testing: max_depth=5, min_samples_split=2, min_samples_leaf=1, max_features=log2
Average accuracy on crossval is 0.42281
Std is 0.05279
Execution time: 0.03 seconds

Testing: max_depth=5, min_samples_split=2, min_samples_leaf=5, max_features=None
Average accuracy on crossval is 0.54374
Std is 0.03751
Execution time: 0.03 seconds

Testing: max_depth=5, min_samples_split=2, min_samples_leaf=5, max_features=sqrt
Average accuracy on crossval is 0.48144
Std is 0.02429
Execution time: 0.02 seconds

Testing: max_depth=5, min_samples_split=2, min_samples_leaf=5, max_features=log2
Average accuracy on crossval is 0.41614
Std is 0.05643
Execu

## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [109]:
# Baseline random forest (50 trees, depth limited to 14) evaluated with 10-fold
# stratified cross-validation on the training split. Prints per-fold
# train/validation accuracy, then the mean/std of the validation accuracy and
# the elapsed time.
rf_model = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)

# Shuffled folds with a fixed seed, so the split is reproducible between runs.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_scores, valid_scores = [], []

# perf_counter() is the clock intended for measuring elapsed time; time.time()
# follows the system clock and can jump if that clock is adjusted.
start_time = time.perf_counter()
for train_idx, valid_idx in kf.split(X_train, y_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

    # The same estimator object is refit from scratch on every fold.
    rf_model.fit(X_tr, y_tr)

    train_acc = accuracy_score(y_tr, rf_model.predict(X_tr))
    valid_acc = accuracy_score(y_val, rf_model.predict(X_val))

    train_scores.append(train_acc)
    valid_scores.append(valid_acc)

    print(f"train -  {train_acc:.5f}   |   valid -  {valid_acc:.5f}")

avg_acc = np.mean(valid_scores)
std_acc = np.std(valid_scores)

print(f"Average accuracy on crossval is {avg_acc:.5f}")
print(f"Std is {std_acc:.5f}")
print(f"Execution time: {time.perf_counter() - start_time:.2f} seconds")


train -  0.97939   |   valid -  0.85185
train -  0.96620   |   valid -  0.85926
train -  0.96208   |   valid -  0.91852
train -  0.97115   |   valid -  0.91852
train -  0.97197   |   valid -  0.88148
train -  0.96538   |   valid -  0.86667
train -  0.96455   |   valid -  0.88889
train -  0.96867   |   valid -  0.87407
train -  0.96458   |   valid -  0.93284
train -  0.96787   |   valid -  0.86567
Average accuracy on crossval is 0.88578
Std is 0.02673
Execution time: 0.50 seconds


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [114]:
# Exhaustive search over random-forest regularization parameters; every
# combination is scored by the mean validation accuracy of 10-fold stratified CV.
max_depth_values = [10, 20, None]
n_estimators_values = [50, 100, 200]
min_samples_split_values = [2, 5]
min_samples_leaf_values = [1, 2]

# Define the splitter in this cell instead of relying on a `kf` left over from
# an earlier cell. With an integer random_state every split() call yields the
# same folds, so all combinations are compared on identical splits.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

# One tuple per combination:
# (max_depth, n_estimators, min_samples_split, min_samples_leaf, avg_acc, std_acc)
results = []

for max_depth, n_estimators, min_samples_split, min_samples_leaf in product(
    max_depth_values, n_estimators_values, min_samples_split_values, min_samples_leaf_values
):
    print(f"Testing: max_depth={max_depth}, n_estimators={n_estimators}, "
          f"min_samples_split={min_samples_split}, min_samples_leaf={min_samples_leaf}")

    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=21
    )

    train_scores, valid_scores = [], []
    for train_idx, valid_idx in kf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        rf_model.fit(X_tr, y_tr)

        train_acc = accuracy_score(y_tr, rf_model.predict(X_tr))
        valid_acc = accuracy_score(y_val, rf_model.predict(X_val))

        train_scores.append(train_acc)
        valid_scores.append(valid_acc)

    avg_acc = np.mean(valid_scores)
    std_acc = np.std(valid_scores)

    results.append((max_depth, n_estimators, min_samples_split, min_samples_leaf, avg_acc, std_acc))
    print(f"Avg Acc: {avg_acc:.5f} | Std: {std_acc:.5f}\n")

# Pick the combination with the highest mean validation accuracy (index 4);
# max() keeps the first one on a tie.
best_params = max(results, key=lambda x: x[4])
print(f"Best Parameters: max_depth={best_params[0]}, n_estimators={best_params[1]}, "
      f"min_samples_split={best_params[2]}, min_samples_leaf={best_params[3]}")
print(f"Best Cross-Val Accuracy: {best_params[4]:.5f}")


Testing: max_depth=10, n_estimators=50, min_samples_split=2, min_samples_leaf=1
Avg Acc: 0.79674 | Std: 0.04656

Testing: max_depth=10, n_estimators=50, min_samples_split=2, min_samples_leaf=2
Avg Acc: 0.74627 | Std: 0.03668

Testing: max_depth=10, n_estimators=50, min_samples_split=5, min_samples_leaf=1
Avg Acc: 0.78117 | Std: 0.04422

Testing: max_depth=10, n_estimators=50, min_samples_split=5, min_samples_leaf=2
Avg Acc: 0.74180 | Std: 0.04365

Testing: max_depth=10, n_estimators=100, min_samples_split=2, min_samples_leaf=1
Avg Acc: 0.80713 | Std: 0.03365

Testing: max_depth=10, n_estimators=100, min_samples_split=2, min_samples_leaf=2
Avg Acc: 0.74258 | Std: 0.03455

Testing: max_depth=10, n_estimators=100, min_samples_split=5, min_samples_leaf=1
Avg Acc: 0.78709 | Std: 0.03450

Testing: max_depth=10, n_estimators=100, min_samples_split=5, min_samples_leaf=2
Avg Acc: 0.74848 | Std: 0.03531

Testing: max_depth=10, n_estimators=200, min_samples_split=2, min_samples_leaf=1
Avg Acc: 0.

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [115]:
# Fit the chosen random forest on the whole training split and score it once on
# the held-out test split.
# NOTE(review): the hyper-parameters are presumably the winners of the grid
# search above; its printed result is not visible here - confirm.
best_rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=21,
).fit(X_train, y_train)  # fit() returns the estimator itself

y_pred = best_rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test accuracy: {test_accuracy:.5f}")

Test accuracy: 0.93787


In [116]:
# Persist the fitted model to the shared data directory with joblib.
joblib.dump(best_rf_model, '../data/best_rf_model.joblib')

['../data/best_rf_model.joblib']

In [117]:
# Per-weekday error rate on the test set: misclassified samples of a class as a
# percentage of all test samples of that class.
total_per_day = y_test.value_counts()

# True labels of the samples the model got wrong, counted per weekday.
errors = y_test.loc[y_test.ne(y_pred)]
errors_per_day = errors.value_counts()

# Division aligns on the weekday label; a weekday with no errors is missing from
# errors_per_day and becomes NaN, which fillna(0) turns into a 0% error rate.
error_rate_per_day = errors_per_day.div(total_per_day).fillna(0).mul(100)

print("Ошибка по дням недели (% от общего количества выборок для этого дня):")
print(error_rate_per_day.sort_values(ascending=False))

Ошибка по дням недели (% от общего количества выборок для этого дня):
dayofweek
0    25.925926
4    14.285714
2     6.666667
5     5.555556
1     5.454545
3     2.500000
6     1.408451
Name: count, dtype: float64


The model makes the most errors when predicting Monday (`dayofweek = 0`): about 25.9% of the Monday samples in the test set are misclassified.