# Day 09. Exercise 02
# Metrics

## 0. Imports

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [3]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
X = df.drop(columns=['dayofweek'])
y = df['dayofweek']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=y
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1348, 43), (338, 43), (1348,), (338,))

## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [5]:
svm = SVC(kernel='rbf', C=10, gamma='auto', class_weight=None, random_state=21, probability=True)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
y_score = svm.predict_proba(X_test)

print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")
print(f"roc_auc is {roc_auc_score(y_test, y_score, multi_class='ovo', average='weighted'):.5f}")

accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878


## 3. Decision tree

1. The same task for decision tree

In [6]:
tree = DecisionTreeClassifier(criterion='gini', max_depth=21, class_weight='balanced', random_state=21)
tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)
y_score = tree.predict_proba(X_test)

print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")
print(f"roc_auc is {roc_auc_score(y_test, y_score, multi_class='ovo', average='weighted'):.5f}")

accuracy is 0.89053
precision is 0.89402
recall is 0.89053
roc_auc is 0.93793


## 4. Random forest

1. The same task for random forest.

In [7]:
forest = RandomForestClassifier(n_estimators=100, max_depth=24, criterion='entropy', class_weight='balanced', random_state=21)
forest.fit(X_train, y_train)

y_pred = forest.predict(X_test)
y_score = forest.predict_proba(X_test)

print(f"accuracy is {accuracy_score(y_test, y_pred):.5f}")
print(f"precision is {precision_score(y_test, y_pred, average='weighted'):.5f}")
print(f"recall is {recall_score(y_test, y_pred, average='weighted'):.5f}")
print(f"roc_auc is {roc_auc_score(y_test, y_score, multi_class='ovo', average='weighted'):.5f}")

accuracy is 0.93195
precision is 0.93402
recall is 0.93195
roc_auc is 0.98803


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

best model is RandomForestClassifier
with parameters
    class_weight=None, 
    criterion='gini', 
    max_depth=28, 
    n_estimators=50, 
    random_state=21

In [8]:
# Вычисление ошибок по дням недели
errors = y_test[y_test != y_pred]
total_per_day = y_test.value_counts()
errors_per_day = errors.value_counts()
error_rate_per_day = (errors_per_day / total_per_day).fillna(0) * 100

# Вычисление ошибок по лабораторным работам
lab_columns = [col for col in df.columns if col.startswith("labname_")]
errors_labs = X_test.loc[errors.index, lab_columns].sum()
error_rate_per_lab = (errors_labs / len(X_test)) * 100

# Вычисление ошибок по пользователям
user_columns = [col for col in df.columns if col.startswith("uid_user_")]
errors_users = X_test.loc[errors.index, user_columns].sum()
error_rate_per_user = (errors_users / len(X_test)) * 100

print("Ошибка по дням недели (% от общего количества выборок для этого дня):")
print(error_rate_per_day.sort_values(ascending=False))

print("\nОшибка по лабораторным работам (% от общего количества тестовых данных):")
print(error_rate_per_lab.sort_values(ascending=False))

print("\nОшибка по пользователям (% от общего количества тестовых данных):")
print(error_rate_per_user.sort_values(ascending=False))


Ошибка по дням недели (% от общего количества выборок для этого дня):
dayofweek
0    22.222222
4    14.285714
5     9.259259
2     6.666667
1     5.454545
6     2.816901
3     2.500000
Name: count, dtype: float64

Ошибка по лабораторным работам (% от общего количества тестовых данных):
labname_project1    2.662722
labname_laba04      1.775148
labname_laba04s     0.591716
labname_code_rvw    0.295858
labname_lab03       0.295858
labname_lab05s      0.295858
labname_laba05      0.295858
labname_laba06      0.295858
labname_laba06s     0.295858
labname_lab02       0.000000
labname_lab03s      0.000000
dtype: float64

Ошибка по пользователям (% от общего количества тестовых данных):
uid_user_2     0.887574
uid_user_25    0.591716
uid_user_19    0.591716
uid_user_6     0.591716
uid_user_4     0.591716
uid_user_31    0.591716
uid_user_3     0.591716
uid_user_24    0.295858
uid_user_27    0.295858
uid_user_29    0.295858
uid_user_30    0.295858
uid_user_18    0.295858
uid_user_16    0.295858


In [None]:
forest = RandomForestClassifier(n_estimators=100, max_depth=24, criterion='entropy', class_weight='balanced', random_state=21)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

df = pd.merge(X_test, y_test, left_index=True, right_index=True)
df['prediction'] = y_pred
df.head()

joblib.dump(forest, '../data/model_02.pkl')

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [9]:
def show_metrics(model):
    data = {}
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)
    data['accuracy'] = accuracy_score(y_test, y_pred)
    data['precision'] = precision_score(y_test, y_pred, average='weighted')
    data['recall'] = recall_score(y_test, y_pred, average='weighted')
    data['roc_auc'] = roc_auc_score(y_test, y_score, multi_class='ovo', average='weighted')
    return data

In [11]:
show_metrics(svm)

{'accuracy': 0.8875739644970414,
 'precision': 0.8926729169690374,
 'recall': 0.8875739644970414,
 'roc_auc': np.float64(0.9787793228216216)}

In [12]:
show_metrics(tree)

{'accuracy': 0.8905325443786982,
 'precision': 0.8940158937843722,
 'recall': 0.8905325443786982,
 'roc_auc': np.float64(0.9379290651156622)}

In [13]:
show_metrics(forest)

{'accuracy': 0.9319526627218935,
 'precision': 0.9340183677910212,
 'recall': 0.9319526627218935,
 'roc_auc': np.float64(0.9880347330255915)}