# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [4]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df['dayofweek'] = pd.read_csv('../data/dayofweek.csv')['weekday']
df

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1682,6,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1683,7,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3
1684,8,20,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('dayofweek', axis=1),
    df['dayofweek'],
    stratify=df['dayofweek'],
    test_size=0.2,
    random_state=21,
    shuffle=True,
)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.2,
    random_state=21,
    shuffle=True,
)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [7]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score
)

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [24]:
def calculate_metrics(y_test, preds):
    metrics = {    
    'accuracy' : accuracy_score(y_test, preds),
    'precision' : precision_score(y_test, preds, average='weighted'),
    'recall' : recall_score(y_test, preds, average='weighted')
    }
    
    return metrics

def print_metrics(metrics):
    for metric, v in metrics.items():
        print(f'{metric} is {v:.5f}')

In [11]:
svc_params = {
    'C': 10,
    'class_weight': None,
    'gamma': 'auto',
    'kernel': 'rbf',
    'probability': True,
    'random_state': 21
}

tree_params = {
    'class_weight': 'balanced',
    'criterion': 'gini',
    'max_depth': 22,
    'random_state': 21
}

forest_params = {
    'class_weight': None,
    'criterion': 'gini',
    'max_depth': 28,
    'n_estimators': 50,
    'random_state': 21
}

%store svc_params
%store forest_params
%store tree_params

Stored 'svc_params' (dict)
Stored 'forest_params' (dict)
Stored 'tree_params' (dict)


  db[ 'autorestore/' + arg ] = obj
  db[ 'autorestore/' + arg ] = obj
  db[ 'autorestore/' + arg ] = obj


In [32]:
svc = SVC(**svc_params)
svc = svc.fit(X_train, y_train)

predict = pd.DataFrame({
    'svc_pred' : svc.predict(X_valid),
    'y_valid' : y_valid
})

print_metrics(calculate_metrics(y_valid, predict['svc_pred']))

accuracy is 0.87778
precision is 0.88162
recall is 0.87778


In [None]:
tree = DecisionTreeClassifier(**tree_params)
tree = tree.fit(X_train, y_train)

predict['tree_pred'] = tree.predict(X_valid)

print_metrics(calculate_metrics(y_valid, predict['tree_pred']))

accuracy is 0.86667
precision is 0.86984
recall is 0.86667


In [35]:
forest = RandomForestClassifier(**forest_params)
forest = forest.fit(X_train, y_train)

predict['forest_pred'] = forest.predict(X_valid)

print_metrics(calculate_metrics(y_valid, predict['forest_pred']))

accuracy is 0.89259
precision is 0.89361
recall is 0.89259


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [65]:
from sklearn.ensemble import (
    VotingClassifier,
    BaggingClassifier,
    StackingClassifier
)

In [37]:
voting = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('tree', tree),
        ('forest', forest)
    ]
)

voting.fit(X_train, y_train)

0,1,2
,estimators,"[('svc', ...), ('tree', ...), ...]"
,voting,'hard'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,22
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,21
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,28
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [38]:
print_metrics(calculate_metrics(y_valid, voting.predict(X_valid)))

accuracy is 0.89630
precision is 0.89605
recall is 0.89630


In [None]:
from sklearn.model_selection import (
    ParameterGrid
)

from tqdm.notebook import tqdm

from itertools import product

In [62]:
def choose_params(estimator, param_dict, X_train, X_valid, y_train, y_valid):
    grid = ParameterGrid(param_dict)
    
    best_accuracy = 0
    best_params = None
    for params in tqdm(grid, desc='Choosing parameters'):
        estimator.set_params(**params)
        estimator.fit(X_train, y_train)
        
        accuracy = accuracy_score(y_valid, estimator.predict(X_valid))
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
    
    return best_params

In [48]:
param_grid = {
    'voting' : ['hard', 'soft'],
    'weights' : [list(w) for w in product(range(1, 5), repeat=3)]
}

In [64]:
voting_best_params = choose_params(
    voting,
    param_grid,
    X_train,
    X_valid,
    y_train,
    y_valid
)

Choosing parameters:   0%|          | 0/128 [00:00<?, ?it/s]

In [63]:
voting.get_params(deep=False)

{'estimators': [('svc',
   SVC(C=10, gamma='auto', probability=True, random_state=21)),
  ('tree',
   DecisionTreeClassifier(class_weight='balanced', max_depth=22, random_state=21)),
  ('forest',
   RandomForestClassifier(max_depth=28, n_estimators=50, random_state=21))],
 'flatten_transform': True,
 'n_jobs': None,
 'verbose': False,
 'voting': 'soft',
 'weights': [4, 4, 4]}

In [89]:
print_metrics(calculate_metrics(y_valid, voting.predict(X_valid)))

accuracy is 0.88148
precision is 0.88418
recall is 0.88148


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [66]:
bagging = BaggingClassifier(svc, random_state=21, n_estimators=100)
bagging.fit(X_train, y_train)

0,1,2
,estimator,"SVC(C=10, gam...ndom_state=21)"
,n_estimators,100
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,21

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [67]:
print_metrics(calculate_metrics(y_valid, bagging.predict(X_valid)))

accuracy is 0.88519
precision is 0.89396
recall is 0.88519


In [68]:
param_grid = {
    'n_estimators' : [1, 5, 10, 50, 100, 150, 200]
}

bagging_best_params = choose_params(
    bagging,
    param_grid,
    X_train,
    X_valid,
    y_train,
    y_valid
)

Choosing parameters:   0%|          | 0/7 [00:00<?, ?it/s]

In [69]:
bagging_best_params

{'n_estimators': 10}

In [71]:
bagging.set_params(**bagging_best_params)
bagging.fit(X_train, y_train)

0,1,2
,estimator,"SVC(C=10, gam...ndom_state=21)"
,n_estimators,10
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,21

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [72]:
print_metrics(calculate_metrics(y_valid, bagging.predict(X_valid)))

accuracy is 0.88519
precision is 0.89427
recall is 0.88519


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

In [78]:
stacking = StackingClassifier(
    estimators=[('svc', svc), ('tree', tree), ('forest', forest)],
    final_estimator=LogisticRegression(solver='liblinear')
)

stacking_params = {
    'passthrough' : [True, False],
    'cv' : [StratifiedKFold(n_splits=n, shuffle=True, random_state=21) for n in range(2, 8)]
}

In [80]:
best_stacking = choose_params(stacking, stacking_params, X_train, X_valid, y_train, y_valid)

Choosing parameters:   0%|          | 0/12 [00:00<?, ?it/s]



In [None]:
stacking.set_params(**best_stacking)

0,1,2
,estimators,"[('svc', ...), ('tree', ...), ...]"
,final_estimator,LogisticRegre...r='liblinear')
,cv,StratifiedKFo... shuffle=True)
,stack_method,'auto'
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,22
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,21
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,28
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [85]:
stacking.fit(X_train, y_train)



0,1,2
,estimators,"[('svc', ...), ('tree', ...), ...]"
,final_estimator,LogisticRegre...r='liblinear')
,cv,StratifiedKFo... shuffle=True)
,stack_method,'auto'
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,C,10
,kernel,'rbf'
,degree,3
,gamma,'auto'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,22
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,21
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,28
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [87]:
print_metrics(calculate_metrics(y_test, stacking.predict(X_test)))

accuracy is 0.90828
precision is 0.91189
recall is 0.90828


In [None]:
stacking.get_params(deep=False)

{'cv': StratifiedKFold(n_splits=5, random_state=21, shuffle=True),
 'estimators': [('svc',
   SVC(C=10, gamma='auto', probability=True, random_state=21)),
  ('tree',
   DecisionTreeClassifier(class_weight='balanced', max_depth=22, random_state=21)),
  ('forest',
   RandomForestClassifier(max_depth=28, n_estimators=50, random_state=21))],
 'final_estimator': LogisticRegression(solver='liblinear'),
 'n_jobs': None,
 'passthrough': False,
 'stack_method': 'auto',
 'verbose': 0}

In [91]:
best_stacking = stacking

## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [92]:
best_results = pd.DataFrame(
    {
        'dayofweek' : y_test,
        'best_predict' : best_stacking.predict(X_test),
        'errors' : y_test != best_stacking.predict(X_test)
    }
)

sums = best_results.groupby('dayofweek').sum()

sums['errors percentage'] = sums['errors'] / sums['best_predict'] * 100
sums['errors percentage'].sort_values(ascending=False)

dayofweek
0    21.875000
1     5.084746
2     4.411765
5     2.583026
4     2.325581
6     1.442308
3     1.204819
Name: errors percentage, dtype: float64

In [None]:
import joblib

joblib.dump(best_stacking, 'model.joblib')