# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
import numpy as np

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
dayofweek = pd.read_csv('../data/dayofweek.csv')['dayofweek']
df['dayofweek'] = dayofweek
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [3]:
X = df.drop(columns=['dayofweek'])
y = df['dayofweek']

In [4]:
X_train1, X_test, y_train1, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train1, y_train1, test_size=0.2, random_state=21, stratify=y_train1)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [6]:
def compute_metrics(y_test, y_pred, mode='print'):
    
    # 1. Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # 2. Precision и Recall (среднее по весам)
    precision, recall, _, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    
    if mode == 'return':
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
        }
    elif mode == 'crosval':
        print(f"accuracy={accuracy:.5f} -- precision={precision:.5f} -- recall={recall:.5f}")
    else:
        # Вывод результатов
        print(f"accuracy is {accuracy:.5f}")
        print(f"precision is {precision:.5f}")
        print(f"recall is {recall:.5f}")

In [7]:
svm = SVC(random_state=21, kernel='linear', C=10, gamma='scale', probability=True)
svm.fit(X_train, y_train)
pred = svm.predict(X_valid)
compute_metrics(y_valid, pred)

accuracy is 0.69630
precision is 0.69761
recall is 0.69630


In [8]:
dtree = DecisionTreeClassifier(random_state=21, class_weight=None, criterion='entropy', max_depth=16)
dtree.fit(X_train, y_train)
pred = dtree.predict(X_valid)
compute_metrics(y_valid, pred)

accuracy is 0.87407
precision is 0.87393
recall is 0.87407


In [9]:
rfor = RandomForestClassifier(random_state=21, class_weight='balanced', criterion='entropy', max_depth=16, n_estimators=100)
rfor.fit(X_train, y_train)
pred = rfor.predict(X_valid)
compute_metrics(y_valid, pred)

accuracy is 0.89630
precision is 0.89660
recall is 0.89630


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [10]:
vc1 = VotingClassifier(
    estimators=[
        ('svc', svm),
        ('dt', dtree), 
        ('rfor', rfor)
    ],
    voting='soft'
)

vc1.fit(X_train, y_train)
pred = vc1.predict(X_valid)
compute_metrics(y_valid, pred)

accuracy is 0.88889
precision is 0.88785
recall is 0.88889


In [11]:
vc2 = VotingClassifier(
    estimators=[
        ('svc', svm),
        ('dt', dtree), 
        ('rfor', rfor)
    ],
    voting='hard'
)

vc2.fit(X_train, y_train)
pred = vc2.predict(X_valid)
compute_metrics(y_valid, pred)

accuracy is 0.87037
precision is 0.87055
recall is 0.87037


In [12]:
vc3 = VotingClassifier(
    estimators=[
        ('svc', svm),
        ('dt', dtree), 
        ('rfor', rfor)
    ],
    voting='soft',
    weights=[1,2,6],
    flatten_transform=True
)

vc3.fit(X_train, y_train)
pred = vc3.predict(X_valid)
compute_metrics(y_valid, pred)

accuracy is 0.90370
precision is 0.90333
recall is 0.90370


In [13]:
vc4 = VotingClassifier(
    estimators=[
        ('svc', svm),
        ('dt', dtree), 
        ('rfor', rfor)
    ],
    voting='hard',
    weights=[1,2,6],
    flatten_transform=True
)

vc4.fit(X_train, y_train)
pred = vc4.predict(X_valid)
compute_metrics(y_valid, pred)

accuracy is 0.89630
precision is 0.89660
recall is 0.89630


In [14]:
vc5 = VotingClassifier(
    estimators=[
        ('svc', svm),
        ('dt', dtree), 
        ('rfor', rfor)
    ],
    voting='soft',
    weights=[1,2,6],
    flatten_transform=False
)

vc5.fit(X_train, y_train)
pred = vc5.predict(X_valid)
compute_metrics(y_valid, pred)

accuracy is 0.90370
precision is 0.90333
recall is 0.90370


In [15]:
vc6 = VotingClassifier(
    estimators=[
        ('svc', svm),
        ('dt', dtree), 
        ('rfor', rfor)
    ],
    voting='hard',
    weights=[1,2,6],
    flatten_transform=False
)

vc6.fit(X_train, y_train)
pred = vc6.predict(X_valid)
compute_metrics(y_valid, pred)

accuracy is 0.89630
precision is 0.89660
recall is 0.89630


In [16]:
vc7 = VotingClassifier(
    estimators=[
        ('svc', svm),
        ('dt', dtree), 
        ('rfor', rfor)
    ],
    voting='soft',
    weights=[1,2,6],
    flatten_transform=True,
    n_jobs=-1
)

vc7.fit(X_train, y_train)
pred = vc7.predict(X_valid)
compute_metrics(y_valid, pred)

accuracy is 0.90370
precision is 0.90333
recall is 0.90370


### Best 

In [17]:
vc = VotingClassifier(
    estimators=[
        ('svc', svm),
        ('dt', dtree), 
        ('rfor', rfor)
    ],
    voting='soft',
    weights=[1,2,6],
)

vc.fit(X_train, y_train)
pred = vc.predict(X_test)
compute_metrics(y_test, pred)

accuracy is 0.89349
precision is 0.89501
recall is 0.89349


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [18]:
for n_estimators in [10, 20, 30]:
    bagging_clf = BaggingClassifier(
        estimator=svm,
        n_estimators=n_estimators, 
        random_state=21,
        # n_jobs=-1
    )
    
    bagging_clf.fit(X_train, y_train)
    pred = bagging_clf.predict(X_valid)

    print(f"n_estimators={n_estimators}:")
    compute_metrics(y_valid, pred)


n_estimators=10:
accuracy is 0.70000
precision is 0.72051
recall is 0.70000
n_estimators=20:
accuracy is 0.68889
precision is 0.71733
recall is 0.68889
n_estimators=30:
accuracy is 0.66296
precision is 0.67952
recall is 0.66296


In [19]:
for n_estimators in [7, 8, 9, 11, 12]:
    bagging_clf = BaggingClassifier(
        estimator=svm,
        n_estimators=n_estimators, 
        random_state=21,
        # n_jobs=-1
    )
    
    bagging_clf.fit(X_train, y_train)
    pred = bagging_clf.predict(X_valid)

    print(f"n_estimators={n_estimators}:")
    compute_metrics(y_valid, pred)

n_estimators=7:
accuracy is 0.69259
precision is 0.71620
recall is 0.69259
n_estimators=8:
accuracy is 0.68889
precision is 0.71042
recall is 0.68889
n_estimators=9:
accuracy is 0.68519
precision is 0.70878
recall is 0.68519
n_estimators=11:
accuracy is 0.69259
precision is 0.71382
recall is 0.69259
n_estimators=12:
accuracy is 0.69259
precision is 0.71710
recall is 0.69259


In [20]:
bc = BaggingClassifier(
    estimator=svm,
    n_estimators=10, 
    random_state=21,
    n_jobs=-1
)

bc.fit(X_train, y_train)
pred = bc.predict(X_test)

compute_metrics(y_test, pred)

accuracy is 0.72781
precision is 0.75332
recall is 0.72781


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [21]:
for n in [2, 3, 4, 5, 6, 7]:
    print(f'splits={n}: ')
    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=21)
    stack = StackingClassifier(
        estimators=[
            ('svc', svm),
            ('dt', dtree), 
            ('rfor', rfor)
        ],
        cv=skf,
        passthrough=False #default
    )

    stack.fit(X_train, y_train)
    pred = stack.predict(X_valid)
    compute_metrics(y_valid, pred)
    print()

splits=2: 
accuracy is 0.89259
precision is 0.89363
recall is 0.89259

splits=3: 
accuracy is 0.89259
precision is 0.89283
recall is 0.89259

splits=4: 
accuracy is 0.90000
precision is 0.89960
recall is 0.90000

splits=5: 
accuracy is 0.90000
precision is 0.89943
recall is 0.90000

splits=6: 
accuracy is 0.90000
precision is 0.90044
recall is 0.90000

splits=7: 
accuracy is 0.90000
precision is 0.89943
recall is 0.90000



In [22]:
for n in [2, 3, 4, 5, 6, 7]:
    print(f'splits={n}: ')
    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=21)
    stack = StackingClassifier(
        estimators=[
            ('svc', svm),
            ('dt', dtree), 
            ('rfor', rfor)
        ],
        passthrough=True,
        cv=skf,
    )

    stack.fit(X_train, y_train)
    pred = stack.predict(X_valid)
    compute_metrics(y_valid, pred)
    print()

splits=2: 


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy is 0.89259
precision is 0.89396
recall is 0.89259

splits=3: 


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy is 0.89630
precision is 0.89721
recall is 0.89630

splits=4: 


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy is 0.89259
precision is 0.89262
recall is 0.89259

splits=5: 


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy is 0.90741
precision is 0.90746
recall is 0.90741

splits=6: 


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy is 0.89259
precision is 0.89281
recall is 0.89259

splits=7: 
accuracy is 0.90370
precision is 0.90370
recall is 0.90370



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Best

In [23]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)
stack = StackingClassifier(
    estimators=[
        ('svc', svm),
        ('dt', dtree), 
        ('rfor', rfor)
    ],
    passthrough=True,
    cv=skf,
    final_estimator=LogisticRegression(solver='liblinear'),
)

stack.fit(X_train, y_train)
pred = stack.predict(X_test)
compute_metrics(y_test, pred)

accuracy is 0.89941
precision is 0.90067
recall is 0.89941




## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [24]:
day_names = {
    0: "Monday",
    1: "Tuesday",
    2: "Wednesday",
    3: "Thursday",
    4: "Friday",
    5: "Saturday",
    6: "Sunday"
}

In [25]:
import numpy as np 

classes = np.unique(y_test)
users = [col[4:] for col in X_test.columns if col.startswith('uid')]
labs = [col[8:] for col in X_test.columns if col.startswith('labname')]

errors_per_class = {}
errors_per_user = {}
errors_per_lab = {}

for cls in classes:
    indices = np.where(y_test == cls)[0]
    total_in_class = len(indices)
    pred_in_class = pred[indices]
    incorrect = np.sum(pred_in_class != cls)
    error_percent = (incorrect / total_in_class) * 100 if total_in_class > 0 else 0
    errors_per_class[cls] = error_percent

for cls in classes:
    # Создаём маску для объектов этого класса
    class_mask = (y_test == cls)
    total_in_class = np.sum(class_mask)

    if total_in_class == 0:
        error_percent = 0
    else:
        incorrect = np.sum((pred != cls) & class_mask)
        error_percent = (incorrect / total_in_class) * 100

    errors_per_class[cls] = error_percent

# Анализ по пользователям
for user in users:
    col_name = 'uid_' + user
    user_mask = X_test[col_name]
    # Булева маска - где пользователь активен
    boolean_mask = user_mask.astype(bool)
    total = np.sum(boolean_mask)
    if total > 0:
        incorrect = np.sum((pred != y_test) & boolean_mask)
        errors_per_user[user] = (incorrect / total) * 100
    else:
        errors_per_user[user] = 0

# Анализ по лабораториям
for lab in labs:
    col_name = 'labname_' + lab
    lab_mask = X_test[col_name]
    boolean_mask = lab_mask.astype(bool)
    total = np.sum(boolean_mask)
    if total > 0:
        incorrect = np.sum((pred != y_test) & boolean_mask)
        errors_per_lab[lab] = (incorrect / total) * 100
    else:
        errors_per_lab[lab] = 0

# Находим максимум
most_error_pro_class = max(errors_per_class, key=errors_per_class.get)
max_error_percentage = errors_per_class[most_error_pro_class]

most_error_pro_user = max(errors_per_user, key=errors_per_user.get)
max_error_user = errors_per_user[most_error_pro_user]

most_error_pro_lab = max(errors_per_lab, key=errors_per_lab.get)
max_error_lab = errors_per_lab[most_error_pro_lab]


print(f"Most mistakes in class: {day_names[most_error_pro_class]} ({max_error_percentage:.2f}%)")
print(f"User with most errors: {most_error_pro_user} ({max_error_user:.2f}%)")
print(f"Lab with most errors: {most_error_pro_lab} ({max_error_lab:.2f}%)")

Most mistakes in class: Monday (25.93%)
User with most errors: user_6 (25.00%)
Lab with most errors: lab03 (100.00%)


In [26]:
import joblib 

joblib.dump(stack, 'StackingKFold.pkl')

['StackingKFold.pkl']