# Day 09. Exercise 00
# Regularization

## 0. Imports

In [4]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [5]:
df = pd.read_csv('../data/dayofweek.csv')
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1686 entries, 0 to 1685
Data columns (total 42 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   uid_user_1        1686 non-null   float64
 1   uid_user_10       1686 non-null   float64
 2   uid_user_11       1686 non-null   float64
 3   uid_user_12       1686 non-null   float64
 4   uid_user_13       1686 non-null   float64
 5   uid_user_14       1686 non-null   float64
 6   uid_user_15       1686 non-null   float64
 7   uid_user_16       1686 non-null   float64
 8   uid_user_17       1686 non-null   float64
 9   uid_user_18       1686 non-null   float64
 10  uid_user_19       1686 non-null   float64
 11  uid_user_2        1686 non-null   float64
 12  uid_user_20       1686 non-null   float64
 13  uid_user_21       1686 non-null   float64
 14  uid_user_22       1686 non-null   float64
 15  uid_user_23       1686 non-null   float64
 16  uid_user_24       1686 non-null   float64


In [6]:
X = df.drop(columns=['dayofweek'])
y = df['dayofweek']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

In [8]:
y_train.value_counts()

dayofweek
3    316
6    285
1    219
5    217
2    119
0    109
4     83
Name: count, dtype: int64

In [9]:
y_test.value_counts()

dayofweek
3    80
6    71
1    55
5    54
2    30
0    27
4    21
Name: count, dtype: int64

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [10]:
def crosval(n_splits, X, y, model):

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=21)

    scores = []
    result = []

    for test_index, train_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)

        train_score = accuracy_score(y_train, pred_train)
        test_score = accuracy_score(y_test, pred_test)
        
        result.append(f"train -  {train_score}   |   valid -  {test_score}")
        
        scores.append(test_score)

    average_accuracy = np.mean(scores)
    std = np.std(scores)
    result.append(f"Average accuracy on crossval is {average_accuracy}")
    result.append(f"Std is {std}")
    
    return result

In [11]:
regr = LogisticRegression(random_state=21, fit_intercept=True)
res = crosval(10, X, y, regr)
for item in res:
    print(item)

train -  0.6745562130177515   |   valid -  0.5201054713249835
train -  0.6568047337278107   |   valid -  0.5023071852340145
train -  0.6627218934911243   |   valid -  0.5023071852340145
train -  0.6390532544378699   |   valid -  0.5168094924192486
train -  0.7218934911242604   |   valid -  0.5082399472643375
train -  0.6449704142011834   |   valid -  0.5563612392880686
train -  0.6547619047619048   |   valid -  0.5164690382081687
train -  0.6785714285714286   |   valid -  0.5191040843214756
train -  0.7023809523809523   |   valid -  0.5125164690382081
train -  0.6964285714285714   |   valid -  0.5421607378129117
Average accuracy on crossval is 0.5196380850145432
Std is 0.016296510307424864


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [12]:
regr = LogisticRegression(random_state=21, fit_intercept=True, solver='newton-cholesky', penalty='l2')
res = crosval(10, X, y, regr)
for item in res:
    print(item)

train -  0.6745562130177515   |   valid -  0.5201054713249835
train -  0.6627218934911243   |   valid -  0.5029663810151616
train -  0.6627218934911243   |   valid -  0.5023071852340145
train -  0.6390532544378699   |   valid -  0.5168094924192486
train -  0.7218934911242604   |   valid -  0.5082399472643375
train -  0.6449704142011834   |   valid -  0.5570204350692155
train -  0.6547619047619048   |   valid -  0.5164690382081687
train -  0.6785714285714286   |   valid -  0.5191040843214756
train -  0.7023809523809523   |   valid -  0.5118577075098815
train -  0.6964285714285714   |   valid -  0.5421607378129117
Average accuracy on crossval is 0.5197040480179399
Std is 0.016407228570589596


In [13]:
regr = LogisticRegression(random_state=21, fit_intercept=True, solver='liblinear', penalty='l1')
res = crosval(10, X, y, regr)
for item in res:
    print(item)

train -  0.6390532544378699   |   valid -  0.4798945286750165
train -  0.5798816568047337   |   valid -  0.5016479894528675
train -  0.6094674556213018   |   valid -  0.47659854976928145
train -  0.5384615384615384   |   valid -  0.4634146341463415
train -  0.6390532544378699   |   valid -  0.47857613711272246
train -  0.5680473372781065   |   valid -  0.48450889914304546
train -  0.6011904761904762   |   valid -  0.49077733860342554
train -  0.6071428571428571   |   valid -  0.5006587615283268
train -  0.6130952380952381   |   valid -  0.48023715415019763
train -  0.6369047619047619   |   valid -  0.5250329380764164
Average accuracy on crossval is 0.48813469306576407
Std is 0.016391684627054343




In [14]:
regr = LogisticRegression(random_state=21, fit_intercept=True, solver='newton-cholesky', penalty=None)
res = crosval(10, X, y, regr)
for item in res:
    print(item)

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=99).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res, max_iter=max_iter)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The o

train -  0.8106508875739645   |   valid -  0.5306526038233356
train -  0.8461538461538461   |   valid -  0.5438365194462755
train -  0.8224852071005917   |   valid -  0.5339485827290705
train -  0.7810650887573964   |   valid -  0.4891232696110745
train -  0.8520710059171598   |   valid -  0.5517468688200395
train -  0.8047337278106509   |   valid -  0.5570204350692155
train -  0.8035714285714286   |   valid -  0.5500658761528326
train -  0.7976190476190477   |   valid -  0.5816864295125165
train -  0.8690476190476191   |   valid -  0.5869565217391305
train -  0.7797619047619048   |   valid -  0.541501976284585
Average accuracy on crossval is 0.5466539083188076
Std is 0.025996815873279906


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=99).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res, max_iter=max_iter)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Matrix is singular.
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=99).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.iteration += _check_optimize_result("lbfgs", opt_res, max_iter=max_iter)
Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing col

In [15]:
regr = LogisticRegression(random_state=21, fit_intercept=True, solver='lbfgs', penalty=None)
res = crosval(10, X, y, regr)
for item in res:
    print(item)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to sca

train -  0.8106508875739645   |   valid -  0.5306526038233356
train -  0.8461538461538461   |   valid -  0.5444957152274226
train -  0.8224852071005917   |   valid -  0.5326301911667766
train -  0.7810650887573964   |   valid -  0.4884640738299275
train -  0.8520710059171598   |   valid -  0.5524060646011866
train -  0.7988165680473372   |   valid -  0.5570204350692155
train -  0.8035714285714286   |   valid -  0.549407114624506
train -  0.7976190476190477   |   valid -  0.580368906455863
train -  0.8690476190476191   |   valid -  0.5869565217391305
train -  0.7857142857142857   |   valid -  0.5434782608695652
Average accuracy on crossval is 0.5465879887406929
Std is 0.02600499255529455


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
regr = LogisticRegression(random_state=21, fit_intercept=True, solver='saga', penalty='elasticnet', l1_ratio=0.0)
res = crosval(10, X, y, regr)
for item in res:
    print(item)

train -  0.6745562130177515   |   valid -  0.5201054713249835
train -  0.6627218934911243   |   valid -  0.5023071852340145
train -  0.6627218934911243   |   valid -  0.5023071852340145
train -  0.6390532544378699   |   valid -  0.5168094924192486
train -  0.7218934911242604   |   valid -  0.5082399472643375
train -  0.6449704142011834   |   valid -  0.5570204350692155
train -  0.6547619047619048   |   valid -  0.5164690382081687
train -  0.6785714285714286   |   valid -  0.5197628458498024
train -  0.7023809523809523   |   valid -  0.5118577075098815
train -  0.6964285714285714   |   valid -  0.5421607378129117
Average accuracy on crossval is 0.519704004592658
Std is 0.016474575358508173


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [17]:
svm = SVC(probability=True, kernel='linear', random_state=21)
res = crosval(10, X, y, svm)
for item in res:
    print(item)

train -  0.7514792899408284   |   valid -  0.5708635464733026
train -  0.6982248520710059   |   valid -  0.5629531970995386
train -  0.7810650887573964   |   valid -  0.5392221489782465
train -  0.6804733727810651   |   valid -  0.5524060646011866
train -  0.7751479289940828   |   valid -  0.5570204350692155
train -  0.7514792899408284   |   valid -  0.5893210283454186
train -  0.7023809523809523   |   valid -  0.5171277997364954
train -  0.7142857142857143   |   valid -  0.5204216073781291
train -  0.7440476190476191   |   valid -  0.5553359683794467
train -  0.7440476190476191   |   valid -  0.5704874835309618
Average accuracy on crossval is 0.5535159279591941
Std is 0.021448796681517085


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [18]:
svm = SVC(probability=True, kernel='linear', random_state=21, C=3)
res = crosval(10, X, y, svm)
for item in res:
    print(item)

train -  0.834319526627219   |   valid -  0.5965721819380356
train -  0.8520710059171598   |   valid -  0.5682267633487146
train -  0.8579881656804734   |   valid -  0.5715227422544495
train -  0.7396449704142012   |   valid -  0.5510876730388925
train -  0.8461538461538461   |   valid -  0.5504284772577456
train -  0.7633136094674556   |   valid -  0.5741595253790376
train -  0.8214285714285714   |   valid -  0.5974967061923584
train -  0.7738095238095238   |   valid -  0.5533596837944664
train -  0.7976190476190477   |   valid -  0.5830039525691699
train -  0.7916666666666666   |   valid -  0.5830039525691699
Average accuracy on crossval is 0.5728861658342039
Std is 0.01660554497004687


In [19]:
svm = SVC(probability=True, kernel='linear', random_state=21, C=10)
res = crosval(10, X, y, svm)
for item in res:
    print(item)

train -  0.8520710059171598   |   valid -  0.6018457481872116
train -  0.9112426035502958   |   valid -  0.5616348055372445
train -  0.8875739644970414   |   valid -  0.5919578114700066
train -  0.8461538461538461   |   valid -  0.5702043506921556
train -  0.9053254437869822   |   valid -  0.6334871456822676
train -  0.9171597633136095   |   valid -  0.6578773895847067
train -  0.8630952380952381   |   valid -  0.6001317523056654
train -  0.8392857142857143   |   valid -  0.5862977602108037
train -  0.8928571428571429   |   valid -  0.6653491436100132
train -  0.8571428571428571   |   valid -  0.5988142292490118
Average accuracy on crossval is 0.6067600136529088
Std is 0.03301763333208569


In [20]:
svm = SVC(probability=True, kernel='linear', random_state=21, C=15)
res = crosval(10, X, y, svm)
for item in res:
    print(item)

train -  0.863905325443787   |   valid -  0.6077785102175346
train -  0.9171597633136095   |   valid -  0.5741595253790376
train -  0.893491124260355   |   valid -  0.5919578114700066
train -  0.863905325443787   |   valid -  0.5721819380355966
train -  0.9289940828402367   |   valid -  0.6446934739617667
train -  0.9289940828402367   |   valid -  0.6756756756756757
train -  0.8869047619047619   |   valid -  0.5981554677206851
train -  0.8452380952380952   |   valid -  0.5816864295125165
train -  0.9226190476190477   |   valid -  0.6574440052700923
train -  0.8571428571428571   |   valid -  0.6067193675889329
Average accuracy on crossval is 0.6110452204831843
Std is 0.034285354120619616


In [21]:
svm = SVC(probability=True, kernel='linear', random_state=21, C=50)
res = crosval(10, X, y, svm)
for item in res:
    print(item)

train -  0.9053254437869822   |   valid -  0.6222808174027686
train -  0.9289940828402367   |   valid -  0.5893210283454186
train -  0.9408284023668639   |   valid -  0.5965721819380356
train -  0.8757396449704142   |   valid -  0.5820698747528016
train -  0.9349112426035503   |   valid -  0.6361239288068556
train -  0.9349112426035503   |   valid -  0.6756756756756757
train -  0.8869047619047619   |   valid -  0.5948616600790514
train -  0.875   |   valid -  0.6007905138339921
train -  0.9345238095238095   |   valid -  0.6567852437417655
train -  0.8809523809523809   |   valid -  0.613965744400527
Average accuracy on crossval is 0.6168446668976892
Std is 0.02929960077030114


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [22]:
dtree = DecisionTreeClassifier(max_depth=10, random_state=21)
res = crosval(10, X, y, dtree)
for item in res:
    print(item)

train -  0.9408284023668639   |   valid -  0.6255767963085036
train -  0.9467455621301775   |   valid -  0.5965721819380356
train -  0.863905325443787   |   valid -  0.5655899802241265
train -  0.863905325443787   |   valid -  0.47198417930125247
train -  0.9230769230769231   |   valid -  0.5510876730388925
train -  0.9526627218934911   |   valid -  0.5919578114700066
train -  0.9166666666666666   |   valid -  0.530961791831357
train -  0.9107142857142857   |   valid -  0.5559947299077734
train -  0.9285714285714286   |   valid -  0.5961791831357048
train -  0.9226190476190477   |   valid -  0.5770750988142292
Average accuracy on crossval is 0.5662979425969883
Std is 0.04071819712153305


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [23]:
dtree = DecisionTreeClassifier(max_depth=50, random_state=21)
res = crosval(10, X, y, dtree)
for item in res:
    print(item)

train -  1.0   |   valid -  0.6216216216216216
train -  1.0   |   valid -  0.6077785102175346
train -  1.0   |   valid -  0.6288727752142387
train -  1.0   |   valid -  0.5398813447593935
train -  1.0   |   valid -  0.6130520764667106
train -  1.0   |   valid -  0.6229400131839157
train -  1.0   |   valid -  0.6067193675889329
train -  1.0   |   valid -  0.6119894598155468
train -  1.0   |   valid -  0.6205533596837944
train -  1.0   |   valid -  0.6429512516469038
Average accuracy on crossval is 0.6116359780198592
Std is 0.026028212230653155


In [24]:
dtree = DecisionTreeClassifier(max_depth=15, random_state=21)
res = crosval(10, X, y, dtree)
for item in res:
    print(item)

train -  1.0   |   valid -  0.6216216216216216
train -  1.0   |   valid -  0.6077785102175346
train -  0.9881656804733728   |   valid -  0.6203032300593276
train -  0.9940828402366864   |   valid -  0.5227422544495716
train -  0.9881656804733728   |   valid -  0.6071193144363876
train -  1.0   |   valid -  0.6229400131839157
train -  0.9940476190476191   |   valid -  0.5994729907773386
train -  0.9940476190476191   |   valid -  0.6080368906455863
train -  0.9880952380952381   |   valid -  0.616600790513834
train -  0.9940476190476191   |   valid -  0.6258234519104084
Average accuracy on crossval is 0.6052439067815525
Std is 0.028667454831113107


In [25]:
dtree = DecisionTreeClassifier(max_depth=15, random_state=21, min_samples_leaf=2)
res = crosval(10, X, y, dtree)
for item in res:
    print(item)

train -  0.8461538461538461   |   valid -  0.5411997363216875
train -  0.834319526627219   |   valid -  0.5082399472643375
train -  0.8757396449704142   |   valid -  0.5425181278839816
train -  0.8402366863905325   |   valid -  0.47659854976928145
train -  0.834319526627219   |   valid -  0.48450889914304546
train -  0.8224852071005917   |   valid -  0.5306526038233356
train -  0.8095238095238095   |   valid -  0.4670619235836627
train -  0.8154761904761905   |   valid -  0.5355731225296443
train -  0.8273809523809523   |   valid -  0.5118577075098815
train -  0.8392857142857143   |   valid -  0.5480895915678524
Average accuracy on crossval is 0.514630020939671
Std is 0.02821966201885479


In [26]:
dtree = DecisionTreeClassifier(max_depth=20, random_state=21, min_samples_leaf=1, max_leaf_nodes=7, min_samples_split=4)
res = crosval(10, X, y, dtree)
for item in res:
    print(item)

train -  0.621301775147929   |   valid -  0.4614370468029005
train -  0.5384615384615384   |   valid -  0.4752801582069875
train -  0.4970414201183432   |   valid -  0.41924851680949243
train -  0.46153846153846156   |   valid -  0.40540540540540543
train -  0.5207100591715976   |   valid -  0.38760711931443637
train -  0.4970414201183432   |   valid -  0.3981542518127884
train -  0.44642857142857145   |   valid -  0.42226613965744403
train -  0.5416666666666666   |   valid -  0.4795783926218709
train -  0.5119047619047619   |   valid -  0.46179183135704877
train -  0.5178571428571429   |   valid -  0.4137022397891963
Average accuracy on crossval is 0.43244711017775705
Std is 0.0321008734786129


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [27]:
rfor = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
res = crosval(10, X, y, rfor)
for item in res:
    print(item)

train -  1.0   |   valid -  0.6532630191166776
train -  1.0   |   valid -  0.6776532630191167
train -  0.9940828402366864   |   valid -  0.7000659195781147
train -  0.9881656804733728   |   valid -  0.6288727752142387
train -  1.0   |   valid -  0.6466710613052077
train -  0.9940828402366864   |   valid -  0.6855636123928807
train -  1.0   |   valid -  0.691699604743083
train -  1.0   |   valid -  0.6910408432147562
train -  1.0   |   valid -  0.6877470355731226
train -  0.9880952380952381   |   valid -  0.650197628458498
Average accuracy on crossval is 0.6712774762615696
Std is 0.0230758188964823


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [28]:
rfor = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=21)
res = crosval(10, X, y, rfor)
for item in res:
    print(item)

train -  1.0   |   valid -  0.6453526697429136
train -  1.0   |   valid -  0.6558998022412657
train -  1.0   |   valid -  0.7112722478576137
train -  1.0   |   valid -  0.6486486486486487
train -  1.0   |   valid -  0.6671061305207646
train -  1.0   |   valid -  0.6875411997363217
train -  1.0   |   valid -  0.6851119894598156
train -  1.0   |   valid -  0.686429512516469
train -  1.0   |   valid -  0.6949934123847167
train -  1.0   |   valid -  0.6620553359683794
Average accuracy on crossval is 0.6744410949076908
Std is 0.02066521110107003


In [29]:
rfor = RandomForestClassifier(n_estimators=10, max_depth=20, random_state=21)
res = crosval(10, X, y, rfor)
for item in res:
    print(item)

train -  0.9881656804733728   |   valid -  0.6348055372445617
train -  1.0   |   valid -  0.6400791034937376
train -  0.9940828402366864   |   valid -  0.6914963744232037
train -  0.9822485207100592   |   valid -  0.5741595253790376
train -  0.9822485207100592   |   valid -  0.6334871456822676
train -  0.9822485207100592   |   valid -  0.6466710613052077
train -  0.9880952380952381   |   valid -  0.6594202898550725
train -  0.9761904761904762   |   valid -  0.6291172595520421
train -  0.9880952380952381   |   valid -  0.6357048748353096
train -  0.9761904761904762   |   valid -  0.6712779973649539
Average accuracy on crossval is 0.6416219169135394
Std is 0.02926908006812206


In [30]:
rfor = RandomForestClassifier(n_estimators=80, max_depth=25, random_state=21)
res = crosval(10, X, y, rfor)
for item in res:
    print(item)

train -  1.0   |   valid -  0.6453526697429136
train -  1.0   |   valid -  0.6519446275543836
train -  1.0   |   valid -  0.7033618984838497
train -  1.0   |   valid -  0.6433750823994726
train -  1.0   |   valid -  0.6624917600527357
train -  1.0   |   valid -  0.6862228081740277
train -  1.0   |   valid -  0.6752305665349143
train -  1.0   |   valid -  0.6798418972332015
train -  1.0   |   valid -  0.6989459815546772
train -  1.0   |   valid -  0.6673254281949934
Average accuracy on crossval is 0.6714092719925169
Std is 0.020060726600076975


In [31]:
rfor = RandomForestClassifier(n_estimators=50, max_depth=7, random_state=21)
res = crosval(10, X, y, rfor)
for item in res:
    print(item)

train -  0.8402366863905325   |   valid -  0.5886618325642716
train -  0.8520710059171598   |   valid -  0.6058009228740936
train -  0.8402366863905325   |   valid -  0.5919578114700066
train -  0.8520710059171598   |   valid -  0.5721819380355966
train -  0.9112426035502958   |   valid -  0.6011865524060646
train -  0.8579881656804734   |   valid -  0.5985497692814766
train -  0.8333333333333334   |   valid -  0.6146245059288538
train -  0.8095238095238095   |   valid -  0.6258234519104084
train -  0.8392857142857143   |   valid -  0.5882740447957839
train -  0.8095238095238095   |   valid -  0.5764163372859025
Average accuracy on crossval is 0.5963477166552459
Std is 0.015637325439283378


In [32]:
rfor = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=21)
res = crosval(10, X, y, rfor)
for item in res:
    print(item)

train -  0.8579881656804734   |   valid -  0.5945945945945946
train -  0.8579881656804734   |   valid -  0.6090969017798286
train -  0.8698224852071006   |   valid -  0.6203032300593276
train -  0.863905325443787   |   valid -  0.5662491760052736
train -  0.8994082840236687   |   valid -  0.6011865524060646
train -  0.8875739644970414   |   valid -  0.6189848384970337
train -  0.8452380952380952   |   valid -  0.6238471673254282
train -  0.8452380952380952   |   valid -  0.6172595520421608
train -  0.8809523809523809   |   valid -  0.5988142292490118
train -  0.8333333333333334   |   valid -  0.5849802371541502
Average accuracy on crossval is 0.6035316479112873
Std is 0.017303169607889482


In [33]:
rfor = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=21)
res = crosval(10, X, y, rfor)
for item in res:
    print(item)

train -  0.8994082840236687   |   valid -  0.6110744891232696
train -  0.9230769230769231   |   valid -  0.6394199077125906
train -  0.893491124260355   |   valid -  0.6361239288068556
train -  0.9171597633136095   |   valid -  0.6025049439683586
train -  0.9349112426035503   |   valid -  0.5992089650626236
train -  0.9289940828402367   |   valid -  0.6354647330257086
train -  0.9345238095238095   |   valid -  0.6469038208168643
train -  0.8690476190476191   |   valid -  0.6304347826086957
train -  0.9226190476190477   |   valid -  0.6324110671936759
train -  0.9107142857142857   |   valid -  0.6080368906455863
Average accuracy on crossval is 0.6241583528964229
Std is 0.01628254456728134


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [34]:
rfor = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=21)

In [35]:
rfor.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,25
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [36]:
pred = rfor.predict(X_test)

In [37]:
score = accuracy_score(y_test, pred)
score

0.9378698224852071

In [38]:
day_names = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday"
}

In [39]:
classes = np.unique(y_test)

errors_per_class = {}

for cls in classes:
    # Индексы объектов этого класса
    indices = np.where(y_test == cls)[0]
    total_in_class = len(indices)
    
    # Находим предсказания для этих объектов
    pred_in_class = pred[indices]
    
    # Подсчитываем ошибки в этом классе
    incorrect = np.sum(pred_in_class != cls)
    
    # Процент ошибок
    error_percent = (incorrect / total_in_class) * 100 if total_in_class > 0 else 0
    
    errors_per_class[cls] = error_percent

# Находим класс с максимальной ошибкой
most_error_pro_class = max(errors_per_class, key=errors_per_class.get)
max_error_percentage = errors_per_class[most_error_pro_class]

print(f"Most mistakes: {day_names[most_error_pro_class]} ({max_error_percentage:.2f}%)")

Most mistakes: Monday (25.93%)


In [40]:
joblib.dump(rfor, 'random_forest.pkl')

['random_forest.pkl']