In [31]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../../datasets/day-of-week-not-scaled.csv')
df1 = pd.read_csv('../../datasets/dayofweek.csv')
df['dayofweek'] = df1['dayofweek']
df

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3


In [4]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
XX = X_train.copy()
yy = y_train.copy()

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(XX, yy, test_size=0.2, random_state=21, stratify=yy)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [7]:
svm = SVC(C=10, gamma='auto', probability=True, random_state=21, kernel='rbf')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")

accuracy is 0.87870
precision is 0.88001
recall is 0.87870


In [8]:
dtr = DecisionTreeClassifier(max_depth=22, class_weight='balanced', random_state=21, criterion='gini')
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)
print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")

accuracy is 0.87574
precision is 0.87889
recall is 0.87574


In [9]:
rfr = RandomForestClassifier(n_estimators=50, max_depth=28, random_state=21, criterion='gini')
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")

accuracy is 0.90828
precision is 0.90957
recall is 0.90828


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [11]:
vc = VotingClassifier(estimators=[('svm', svm), ('tree', dtr), ('forest', rfr)], voting='hard', verbose=True)

In [12]:
vc.fit(X_train, y_train)

[Voting] ...................... (1 of 3) Processing svm, total=   2.6s
[Voting] ..................... (2 of 3) Processing tree, total=   0.0s
[Voting] ................... (3 of 3) Processing forest, total=   0.4s


In [13]:
y_pred = vc.predict(X_test)

In [14]:
print(f"Accuracy: {accuracy_score(y_test, y_pred):.5f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.5f}")

Accuracy: 0.91124
Precision: 0.91206
Recall: 0.91124


In [15]:
vc = VotingClassifier(estimators=[('svm', svm), ('tree', dtr), ('forest', rfr)], voting='soft', verbose=True)
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.5f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.5f}")

[Voting] ...................... (1 of 3) Processing svm, total=   3.6s
[Voting] ..................... (2 of 3) Processing tree, total=   0.0s
[Voting] ................... (3 of 3) Processing forest, total=   0.8s
Accuracy: 0.89645
Precision: 0.89940
Recall: 0.89645


In [16]:
vc = VotingClassifier(estimators=[('svm', svm), ('tree', dtr), ('forest', rfr)], voting='hard', verbose=True, weights=[1,2,2])
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.5f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.5f}")

[Voting] ...................... (1 of 3) Processing svm, total=   4.1s
[Voting] ..................... (2 of 3) Processing tree, total=   0.0s
[Voting] ................... (3 of 3) Processing forest, total=   1.5s
Accuracy: 0.91716
Precision: 0.91824
Recall: 0.91716


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [17]:
svc = SVC(C=10, gamma='auto', probability=True, random_state=21, kernel='rbf')
bc = BaggingClassifier(estimator=svc, n_estimators=10, random_state=21)
bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.5f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.5f}")

Accuracy: 0.86391
Precision: 0.86966
Recall: 0.86391


In [18]:
param_grid = {'n_estimators': [10, 30, 50],
             'warm_start': [True, False],
             'bootstrap': [True, False]}

gs = GridSearchCV(bc, param_grid, scoring='accuracy', n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)
print(f'лучшие параметры: {gs.best_params_}')
print(f'лучший score: {gs.best_score_}')
y_pred = gs.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.5f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"Recall: {recall_score(y_test, y_pred, average='weighted'):.5f}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ...bootstrap=True, n_estimators=10, warm_start=True; total time=   7.5s
[CV] END ...bootstrap=True, n_estimators=10, warm_start=True; total time=   7.0s
[CV] END ...bootstrap=True, n_estimators=10, warm_start=True; total time=   8.5s
[CV] END ...bootstrap=True, n_estimators=10, warm_start=True; total time=   6.4s
[CV] END ...bootstrap=True, n_estimators=10, warm_start=True; total time=   4.7s
[CV] END ..bootstrap=True, n_estimators=10, warm_start=False; total time=   4.3s
[CV] END ..bootstrap=True, n_estimators=10, warm_start=False; total time=   5.5s
[CV] END ..bootstrap=True, n_estimators=10, warm_start=False; total time=   6.6s
[CV] END ..bootstrap=True, n_estimators=10, warm_start=False; total time=   7.0s
[CV] END ..bootstrap=True, n_estimators=10, warm_start=False; total time=   4.8s
[CV] END ...bootstrap=True, n_estimators=30, warm_start=True; total time=  12.6s
[CV] END ...bootstrap=True, n_estimators=30, war

## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [27]:
def evaluate_stacking_classifier(n_splits, passthrough):
    cv_generator = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
    
    stacking_clf = StackingClassifier(
        estimators=[('svm', svm), ('tree', dtr), ('forest', rfr)],
        final_estimator=LogisticRegression(solver='liblinear'),
        cv=cv_generator,
        passthrough=passthrough
    )
    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='macro')
    recall = recall_score(y_valid, y_pred, average='macro')
    
    return accuracy, precision, recall

results = []
for n_splits in [2, 3, 4, 5, 6, 7]:
    for passthrough in [True, False]:
        accuracy, precision, recall = evaluate_stacking_classifier(n_splits, passthrough)
        results.append((n_splits, passthrough, accuracy, precision, recall))
        
best_result = max(results, key=lambda x: x[2])
print(f'Best params: n_splits={best_result[0]}, passthrough={best_result[1]}')
print(f'Accuracy: {best_result[2]:.4f}, Precision: {best_result[3]:.4f}, Recall: {best_result[4]:.4f}')

Best params: n_splits=5, passthrough=False
Accuracy: 0.9148, Precision: 0.9221, Recall: 0.9123


In [29]:
stacking_clf_best = StackingClassifier(
    estimators=[('svm', svm), ('tree', dtr), ('forest', rfr)],
    final_estimator=LogisticRegression(solver='liblinear'),
    cv=StratifiedKFold(n_splits=best_result[0], shuffle=True, random_state=21),
    passthrough=best_result[1]
)

stacking_clf_best.fit(pd.concat([X_train, X_valid]), pd.concat([y_train, y_valid]))
y_pred_test = stacking_clf_best.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_test)
test_precision = precision_score(y_test, y_pred_test, average='macro')
test_recall = recall_score(y_test, y_pred_test, average='macro')
print(f'Test Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}')

Test Accuracy: 0.9320, Test Precision: 0.9335, Test Recall: 0.9148


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [32]:
y_pred = stacking_clf_best.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[21,  0,  1,  0,  1,  1,  3],
       [ 0, 51,  1,  2,  0,  1,  0],
       [ 0,  0, 28,  2,  0,  0,  0],
       [ 1,  0,  0, 77,  0,  1,  1],
       [ 0,  0,  0,  0, 19,  2,  0],
       [ 0,  0,  0,  2,  1, 50,  1],
       [ 0,  0,  0,  1,  0,  1, 69]])

In [33]:
df_cm = pd.DataFrame(cm, index=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
                     columns=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])

total_samples_per_class = cm.sum(axis=1)
error_rates = {}
for i, day in enumerate(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']):
    correct_predictions = cm[i][i]
    error_rate = (total_samples_per_class[i] - correct_predictions) / total_samples_per_class[i]
    error_rates[day] = error_rate * 100

sorted_error_rates = sorted(error_rates.items(), key=lambda x: -x[1])
print("Days with highest error rates:")
for day, rate in sorted_error_rates:
    print(f"{day}: {rate:.2f}%")

Days with highest error rates:
Mon: 22.22%
Fri: 9.52%
Sat: 7.41%
Tue: 7.27%
Wed: 6.67%
Thu: 3.75%
Sun: 2.82%


In [35]:
joblib.dump(stacking_clf_best, 'best_model.joblib')

['best_model.joblib']