# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [31]:
import joblib
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
# Feature matrix: the not-scaled day-of-week CSV, used as-is.
data = pd.read_csv("../data/day-of-week-not-scaled.csv")
# Target: the `dayofweek` column of a separate CSV.
# NOTE(review): assumes both files list the same rows in the same order — confirm.
df = pd.read_csv('../data/dayofweek.csv')

X, y = data, df["dayofweek"]

In [3]:
# Stage 1: hold out 20% of all rows as the test split, stratified by the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=21,
)

# Stage 2: take 20% of the remaining training rows as the validation split,
# stratified by the training labels. X_train / y_train are rebound to what is left.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.2,
    random_state=21,
)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [5]:
# Hyperparameters carried over from the earlier exercise, one dict per model.
best_svm_params = dict(C=10, class_weight=None, gamma='auto', kernel='rbf')
best_tree_params = dict(class_weight='balanced', criterion='gini', max_depth=22)
best_rf_params = dict(class_weight=None, criterion='gini', max_depth=28, n_estimators=50)

In [6]:
# Build the three tuned classifiers, all seeded with random_state=21.
svm_model = SVC(random_state=21, **best_svm_params)
tree_model = DecisionTreeClassifier(random_state=21, **best_tree_params)
rf_model = RandomForestClassifier(random_state=21, **best_rf_params)

# Fit each model on the training split and predict the validation split.
# fit() returns the estimator, so the two calls are chained.
y_pred_svm = svm_model.fit(X_train, y_train).predict(X_valid)
y_pred_tree = tree_model.fit(X_train, y_train).predict(X_valid)
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_valid)



In [9]:
# Report validation accuracy / precision / recall for each individual model.
# NOTE(review): the sample output in the task text shows recall equal to accuracy,
# which is what average='weighted' yields; 'macro' is kept here — confirm which
# averaging is intended.
validation_predictions = [
    ("SVM:", y_pred_svm),
    ("Decision Tree:", y_pred_tree),
    ("Random Forest:", y_pred_rf),
]
for position, (title, predicted) in enumerate(validation_predictions):
    if position:
        print()  # blank separator between consecutive model reports
    print(title)
    print(f"accuracy is {accuracy_score(y_valid, predicted):.5f}")
    print(f"precision is {precision_score(y_valid, predicted, average='macro'):.5f}")
    print(f"recall is {recall_score(y_valid, predicted, average='macro'):.5f}")


SVM:
accuracy is 0.87778
precision is 0.90081
recall is 0.86328

Decision Tree:
accuracy is 0.86667
precision is 0.87231
recall is 0.86481

Random Forest:
accuracy is 0.89259
precision is 0.90566
recall is 0.88055


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [11]:
# Hard-voting ensemble over the SVM, decision tree and random forest.
voting_members = [('svm', svm_model), ('tree', tree_model), ('rf', rf_model)]
voting_clf = VotingClassifier(estimators=voting_members, voting='hard')

In [12]:
# Train the unweighted voting ensemble on the training split.
voting_clf.fit(X_train, y_train)

In [13]:
# Predictions of the unweighted voting ensemble on the validation split.
y_pred_voting = voting_clf.predict(X_valid)

In [14]:
# Validation metrics of the unweighted voting ensemble (macro-averaged
# precision and recall).
print("Voting Classifier (Validation Set):")
valid_accuracy = accuracy_score(y_valid, y_pred_voting)
print(f"accuracy is {valid_accuracy:.5f}")
valid_precision = precision_score(y_valid, y_pred_voting, average='macro')
print(f"precision is {valid_precision:.5f}")
valid_recall = recall_score(y_valid, y_pred_voting, average='macro')
print(f"recall is {valid_recall:.5f}")


Voting Classifier (Validation Set):
accuracy is 0.89630
precision is 0.89902
recall is 0.88754


In [16]:
# Same three estimators as the plain voting ensemble, but with per-estimator
# vote weights; the weights list is given in the same order as the estimators
# (svm=2, tree=1, rf=3).
voting_clf_weighted = VotingClassifier(
    estimators=[('svm', svm_model), ('tree', tree_model), ('rf', rf_model)],
    voting='hard',
    weights=[2, 1, 3],
)

# Train on the training split, then predict the validation split.
y_pred_voting_weighted = voting_clf_weighted.fit(X_train, y_train).predict(X_valid)



In [17]:
# Validation metrics of the weighted voting ensemble.
print("Voting Classifier with Weights (Validation Set):")
print(f"accuracy is {accuracy_score(y_valid, y_pred_voting_weighted):.5f}")
# Precision and recall share the same call shape, so they are reported in a loop.
for metric_label, macro_scorer in (("precision", precision_score), ("recall", recall_score)):
    print(f"{metric_label} is {macro_scorer(y_valid, y_pred_voting_weighted, average='macro'):.5f}")

Voting Classifier with Weights (Validation Set):
accuracy is 0.89630
precision is 0.90812
recall is 0.88380


In [18]:
# Evaluate the weighted voting ensemble on the test split. In the recorded
# validation output it ties the unweighted ensemble on accuracy and has the
# higher precision, which is the tie-break rule from the task.
y_pred_test = voting_clf_weighted.predict(X_test)

print("Best Voting Classifier (Test Set):")
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"accuracy is {test_accuracy:.5f}")
test_precision = precision_score(y_test, y_pred_test, average='macro')
print(f"precision is {test_precision:.5f}")
test_recall = recall_score(y_test, y_pred_test, average='macro')
print(f"recall is {test_recall:.5f}")

Best Voting Classifier (Test Set):
accuracy is 0.90533
precision is 0.91197
recall is 0.88254


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision).

In [20]:
# SVM hyperparameters restated for this section (same values as in section 2).
best_svm_params = dict(C=10, class_weight=None, gamma='auto', kernel='rbf')

# Unfitted SVM that serves as the base estimator of the bagging ensembles.
svm_base = SVC(random_state=21, **best_svm_params)


In [21]:
# Ensemble sizes to try, plus the running "best so far" state that the
# search loop updates.
n_estimators_list = [10, 50, 100]
best_model, best_params = None, {}
best_accuracy = best_precision = 0

In [23]:
# Try each ensemble size, report validation metrics, and keep the model with
# the highest accuracy (ties broken by higher precision).
for n_estimators in n_estimators_list:
    bagging_clf = BaggingClassifier(
        estimator=svm_base,
        n_estimators=n_estimators,
        random_state=21,
    )
    y_pred = bagging_clf.fit(X_train, y_train).predict(X_valid)

    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='macro')
    recall = recall_score(y_valid, y_pred, average='macro')

    print(f"BaggingClassifier (n_estimators={n_estimators}):")
    print(f"accuracy is {accuracy:.5f}")
    print(f"precision is {precision:.5f}")
    print(f"recall is {recall:.5f}")

    # Lexicographic comparison: accuracy first, precision only on an exact tie.
    if (accuracy, precision) > (best_accuracy, best_precision):
        best_model, best_params = bagging_clf, {'n_estimators': n_estimators}
        best_accuracy, best_precision = accuracy, precision

BaggingClassifier (n_estimators=10):
accuracy is 0.87778
precision is 0.91223
recall is 0.86418
BaggingClassifier (n_estimators=50):
accuracy is 0.88519
precision is 0.91667
recall is 0.86640
BaggingClassifier (n_estimators=100):
accuracy is 0.88148
precision is 0.91377
recall is 0.86653


In [24]:
# Test-split metrics for the bagging model selected on the validation split.
y_pred_test = best_model.predict(X_test)

print("Best BaggingClassifier (Test Set):")
bagging_test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"accuracy is {bagging_test_accuracy:.5f}")
bagging_test_precision = precision_score(y_test, y_pred_test, average='macro')
print(f"precision is {bagging_test_precision:.5f}")
bagging_test_recall = recall_score(y_test, y_pred_test, average='macro')
print(f"recall is {bagging_test_recall:.5f}")
print(f"Best parameters: {best_params}")



Best BaggingClassifier (Test Set):
accuracy is 0.87278
precision is 0.89938
recall is 0.84994
Best parameters: {'n_estimators': 50}


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set. Try different values of `n_splits` (`[2, 3, 4, 5, 6, 7]`) in the cross-validation generator and of the `passthrough` parameter in the classifier itself.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [26]:
# Parameter sets for the stacking base learners. The SVM set now also carries
# probability=True (presumably so the stacker can use class probabilities — confirm).
best_svm_params = dict(C=10, class_weight=None, gamma='auto', kernel='rbf', probability=True)
best_tree_params = dict(class_weight='balanced', criterion='gini', max_depth=22)
best_rf_params = dict(class_weight=None, criterion='gini', max_depth=28, n_estimators=50)

# Fresh, unfitted estimators; these rebind the names used for the models
# fitted in section 2.
svm_model = SVC(random_state=21, **best_svm_params)
tree_model = DecisionTreeClassifier(random_state=21, **best_tree_params)
rf_model = RandomForestClassifier(random_state=21, **best_rf_params)

In [27]:
# Fold counts to try, plus reset "best so far" state for the stacking search
# (the same names were used by the bagging search).
n_splits_list = [2, 3, 4, 5, 6, 7]
best_model, best_params = None, {}
best_accuracy = best_precision = 0

In [32]:
# Grid over the number of CV folds and the passthrough flag. Every candidate
# is fitted on the training split, scored on the validation split, and the
# best one (accuracy, then precision on ties) is remembered.
for n_splits in n_splits_list:
    for passthrough in (True, False):
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)

        stacking_clf = StackingClassifier(
            estimators=[
                ('svm', svm_model),
                ('tree', tree_model),
                ('rf', rf_model),
            ],
            final_estimator=LogisticRegression(solver='liblinear'),
            cv=cv,
            passthrough=passthrough,
        )
        y_pred = stacking_clf.fit(X_train, y_train).predict(X_valid)

        accuracy = accuracy_score(y_valid, y_pred)
        precision = precision_score(y_valid, y_pred, average='macro')
        recall = recall_score(y_valid, y_pred, average='macro')

        print(f"StackingClassifier (n_splits={n_splits}, passthrough={passthrough}):")
        print(f"accuracy is {accuracy:.5f}")
        print(f"precision is {precision:.5f}")
        print(f"recall is {recall:.5f}")
        print()

        strictly_more_accurate = accuracy > best_accuracy
        wins_tie_on_precision = accuracy == best_accuracy and precision > best_precision
        if strictly_more_accurate or wins_tie_on_precision:
            best_model = stacking_clf
            best_params = {'n_splits': n_splits, 'passthrough': passthrough}
            best_accuracy, best_precision = accuracy, precision

StackingClassifier (n_splits=2, passthrough=True):
accuracy is 0.90370
precision is 0.90464
recall is 0.89863

StackingClassifier (n_splits=2, passthrough=False):
accuracy is 0.89259
precision is 0.89974
recall is 0.88368

StackingClassifier (n_splits=3, passthrough=True):
accuracy is 0.90000
precision is 0.90956
recall is 0.89804

StackingClassifier (n_splits=3, passthrough=False):
accuracy is 0.90000
precision is 0.90080
recall is 0.89405

StackingClassifier (n_splits=4, passthrough=True):
accuracy is 0.90370
precision is 0.91784
recall is 0.89730

StackingClassifier (n_splits=4, passthrough=False):
accuracy is 0.90741
precision is 0.91358
recall is 0.90650

StackingClassifier (n_splits=5, passthrough=True):
accuracy is 0.91111
precision is 0.92478
recall is 0.90657

StackingClassifier (n_splits=5, passthrough=False):
accuracy is 0.91481
precision is 0.92211
recall is 0.91232

StackingClassifier (n_splits=6, passthrough=True):
accuracy is 0.90741
precision is 0.91557
recall is 0.9037

In [33]:
# Test-split predictions of the stacking model kept by the search loop.
y_pred_test = best_model.predict(X_test)

In [34]:
# Assemble the test-split report for the selected stacking model and emit it
# in one go; each entry becomes one output line.
report_lines = [
    "Best StackingClassifier (Test Set):",
    f"accuracy is {accuracy_score(y_test, y_pred_test):.5f}",
    f"precision is {precision_score(y_test, y_pred_test, average='macro'):.5f}",
    f"recall is {recall_score(y_test, y_pred_test, average='macro'):.5f}",
    f"Best parameters: {best_params}",
]
print("\n".join(report_lines))

Best StackingClassifier (Test Set):
accuracy is 0.90828
precision is 0.92517
recall is 0.89133
Best parameters: {'n_splits': 5, 'passthrough': False}


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [37]:
# Show which kind of estimator `best_model` currently refers to.
# NOTE(review): `best_model` is whatever the stacking search assigned last; this
# cell does not itself compare the voting, bagging and stacking results.
type(best_model)

sklearn.ensemble._stacking.StackingClassifier

In [50]:
# Predict the test split and flag every row where the prediction is wrong.
y_pred_test = best_model.predict(X_test)
errors = y_pred_test != y_test

# Copy of the test features extended with the true label, the predicted label
# and a 0/1 error indicator; X_test itself is left untouched.
test_data = X_test.assign(
    true_label=y_test,
    predicted_label=y_pred_test,
    error=errors.astype(int),
)



In [51]:
# Error rate per true weekday, highest first.
# NOTE(review): the denominator is the number of test-split rows of each weekday;
# the task text mentions "% of the total number of samples of that class in your
# full dataset" — confirm which denominator is wanted.
error_analysis = pd.DataFrame({"true_weekday": y_test, "error": errors.astype(int)})
errors_by_weekday = error_analysis.groupby("true_weekday")["error"]
weekday_errors = errors_by_weekday.sum()
weekday_total = errors_by_weekday.count()
weekday_error_percent = weekday_errors.div(weekday_total).mul(100).sort_values(ascending=False)

In [52]:
def _error_stats_by(frame, group_col):
    """Return (error count, row count, error percent sorted descending) per value of `group_col`.

    `frame` must contain a 0/1 `error` column and the column named by `group_col`.
    """
    grouped_errors = frame.groupby(group_col)["error"]
    error_count = grouped_errors.sum()
    row_count = grouped_errors.count()
    error_percent = error_count.div(row_count).mul(100).sort_values(ascending=False)
    return error_count, row_count, error_percent


# Collapse the one-hot user columns into a single label column.
# NOTE(review): idxmax assumes every row has exactly one active uid_user_* column;
# a row with none active would be labelled with the first such column — confirm.
user_columns = [name for name in test_data.columns if "uid_user_" in name]
test_data["user_id"] = test_data[user_columns].idxmax(axis=1)
user_errors, user_total, user_error_percent = _error_stats_by(test_data, "user_id")


# Same treatment for the one-hot lab columns (same single-active-column assumption).
lab_columns = [name for name in test_data.columns if "labname_" in name]
test_data["labname"] = test_data[lab_columns].idxmax(axis=1)
lab_errors, lab_total, lab_error_percent = _error_stats_by(test_data, "labname")

In [53]:
# Each series is sorted by error percentage in descending order, so head(1) is
# the weekday / lab / user with the highest error rate on the test split.
weekday_error_percent.head(1), lab_error_percent.head(1), user_error_percent.head(1)

(true_weekday
 0    25.925926
 Name: error, dtype: float64,
 labname
 labname_lab03    100.0
 Name: error, dtype: float64,
 user_id
 uid_user_22    100.0
 Name: error, dtype: float64)

In [54]:
# Persist the selected ensemble to `best_model.pkl` in the current working
# directory. `joblib` is imported in the notebook's import cell, so that all
# dependencies are declared in one place at the top.
joblib.dump(best_model, "best_model.pkl")

['best_model.pkl']