# Voting classifier
___

This model aims to classify sleep stages based on two EEG channels. We will use the features extracted in the `pipeline.ipynb` notebook as the input to a voting classifier. As written in the docs, it "[...] combines conceptually different machine learning classifiers and uses a majority vote or the average predicted probabilities (soft vote) to predict the class labels."

In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Put the folder one level above the working directory on the import path
# (presumably where the `constants` and `model_utils` modules imported
# further down live -- TODO confirm).
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import (GridSearchCV,
                                     RandomizedSearchCV,
                                     GroupKFold,
                                     cross_validate)
from sklearn.metrics import (accuracy_score,
                             confusion_matrix,
                             classification_report,
                             f1_score,
                             cohen_kappa_score,
                             make_scorer)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.ensemble import (RandomForestClassifier,
                              VotingClassifier)
from sklearn.svm import LinearSVC
from constants import (SLEEP_STAGES_VALUES,
                       N_STAGES,
                       EPOCH_DURATION)
from model_utils import (print_hypnogram,
                         train_test_split_one_subject,
                         train_test_split_according_to_age,
                         evaluate_hyperparams_grid,
                         print_results_cv,
                         print_results_cv_scores,
                         print_hyperparam_tuning_results)

## Load the features
___

In [None]:
# Bookkeeping columns stored at the front of the X matrix.
SUBJECT_IDX = 0  # column holding the subject identifier
NIGHT_IDX = 1  # column holding the night identifier
USE_CONTINUOUS_AGE = False
DOWNSIZE_SET = False
TEST_SET_SUBJECTS = [0.0, 24.0, 49.0, 71.0]

# Select the feature/label files matching the age encoding in use.
X_file_name, y_file_name = (
    ("../data/x_features-age-continuous.npy", "../data/y_observations-age-continuous.npy")
    if USE_CONTINUOUS_AGE
    else ("../data/x_features.npy", "../data/y_observations.npy")
)

# Each file is loaded with pickling enabled and then stacked into one array:
# rows are stacked vertically for X, labels are concatenated for y.
# NOTE: allow_pickle=True can execute arbitrary code; only load trusted files.
X_init = np.vstack(np.load(X_file_name, allow_pickle=True))
y_init = np.hstack(np.load(y_file_name, allow_pickle=True))
print(X_init.shape, y_init.shape, sep="\n")

subject_ids = np.unique(X_init[:, SUBJECT_IDX])
# A night is identified by its (subject, night) pair.
night_ids = {
    f"{int(subject)}-{int(night)}"
    for subject, night in X_init[:, SUBJECT_IDX:NIGHT_IDX + 1]
}

# Some subject indexes are skipped, thus total number is below 83
# (as we can see in https://physionet.org/content/sleep-edfx/1.0.0/)
print("Number of subjects: ", subject_ids.shape[0])
print("Number of nights: ", len(night_ids))
print("Subjects available: ", subject_ids)

In [None]:
# Separate a held-out test set from the data used for training/validation.
# NOTE(review): presumably the subjects listed in TEST_SET_SUBJECTS end up in
# the test arrays -- confirm against `train_test_split_according_to_age`.
X_test, X_train_valid, y_test, y_train_valid = train_test_split_according_to_age(
    X_init,
    y_init,
    subjects_test=TEST_SET_SUBJECTS,
    use_continuous_age=USE_CONTINUOUS_AGE,
)

print(*(part.shape for part in (X_test, X_train_valid, y_test, y_train_valid)))

In [None]:
# Settings shared by the cells below.
RANDOM_STATE = 42  # seed handed to estimators that accept `random_state`
NB_KFOLDS = 5
# Feature layout once the subject/night columns are removed.
NB_FEATURES = 48
NB_CATEGORICAL_FEATURES = 2
# Initial value; reassigned in the hyperparameter section further down.
CLASSIFIER_PIPELINE_KEY = 'classifier'

def get_cv_iterator(n_splits=2):
    """Return subject-grouped (train, validation) index splits of `X_train_valid`.

    Rows are grouped by the value in the subject column, so `GroupKFold`
    never places the same subject on both sides of a split.

    Args:
        n_splits: number of folds.

    Returns:
        The iterator produced by `GroupKFold.split`; it yields pairs of
        index arrays and can be consumed only once.
    """
    subject_of_each_row = X_train_valid[:, SUBJECT_IDX]
    splitter = GroupKFold(n_splits=n_splits)
    return splitter.split(X_train_valid, groups=subject_of_each_row)
    
def cross_validate_with_confusion_matrix(pipeline, n_fold):
    """Run subject-grouped cross-validation and accumulate a confusion matrix.

    For every fold produced by `get_cv_iterator`, the pipeline is fitted on the
    training rows of `X_train_valid` (without the two leading subject/night
    columns) and evaluated on the validation rows. A per-fold report is
    printed, and a summary over all folds is printed with `print_results_cv`.

    Stage labels are expected to be the integers 0..N_STAGES-1, since they are
    used as row/column positions of the returned matrix.

    Args:
        pipeline: estimator exposing `fit` and `predict`. It is refitted in
            place on each fold, so on return it is fitted on the last fold.
        n_fold: number of folds requested from `get_cv_iterator`.

    Returns:
        Array of shape (N_STAGES, N_STAGES); entry [i, j] is the number of
        validation epochs, summed over all folds, whose true stage is i and
        predicted stage is j.
    """
    # Fixing the label set keeps every fold's confusion matrix at
    # (N_STAGES, N_STAGES), even if a stage is absent from a fold.
    stage_labels = np.arange(N_STAGES)

    accuracies = []
    macro_f1_scores = []
    weighted_f1_scores = []
    kappa_agreements = []
    emission_matrix = np.zeros((N_STAGES, N_STAGES))

    for train_index, valid_index in get_cv_iterator(n_splits=n_fold):
        # We drop the subject and night indexes
        X_train, X_valid = X_train_valid[train_index, 2:], X_train_valid[valid_index, 2:]
        y_train, y_valid = y_train_valid[train_index], y_train_valid[valid_index]

        # Scaling features and model training
        pipeline.fit(X_train, y_train)

        # Validation
        y_valid_pred = pipeline.predict(X_valid)
        current_kappa = cohen_kappa_score(y_valid, y_valid_pred)
        fold_confusion = confusion_matrix(y_valid, y_valid_pred, labels=stage_labels)

        print("----------------------------- FOLD RESULTS --------------------------------------\n")
        print("TRAIN:", train_index, "VALID:", valid_index, "\n\n")
        print(fold_confusion, "\n")
        print(classification_report(y_valid, y_valid_pred, target_names=SLEEP_STAGES_VALUES.keys()), "\n")
        print("Agreement score (Cohen Kappa): ", current_kappa, "\n")

        # Keep full precision: rounding here would distort the mean and
        # standard deviation reported over the folds.
        accuracies.append(accuracy_score(y_valid, y_valid_pred))
        macro_f1_scores.append(f1_score(y_valid, y_valid_pred, average="macro"))
        weighted_f1_scores.append(f1_score(y_valid, y_valid_pred, average="weighted"))
        kappa_agreements.append(current_kappa)

        # Rows are true stages, columns are predicted stages.
        emission_matrix += fold_confusion

    print_results_cv(accuracies, macro_f1_scores, weighted_f1_scores, kappa_agreements)
    return emission_matrix

In [None]:
def get_pipeline(with_svc=True, with_knn=True, with_rf=True):
    """Build the feature-scaling + voting-classifier pipeline.

    The pipeline expects an input with NB_FEATURES columns (subject and night
    columns already removed): the first NB_CATEGORICAL_FEATURES columns are
    passed through untouched, the remaining ones are standardized. The
    transformed features feed a `VotingClassifier` step named 'voting_clf',
    whose `voting` and `weights` parameters are left at their defaults so
    callers can tune them with `set_params`.

    Args:
        with_svc: include the PCA + LinearSVC estimator (named 'svc').
        with_knn: include the LDA + k-nearest-neighbours estimator (named 'knn').
        with_rf: include the random forest estimator (named 'random_forest').

    Returns:
        An unfitted `Pipeline` with the steps 'scaling' and 'voting_clf'.

    Raises:
        ValueError: if all three estimators are disabled.
    """
    # Fail early with a clear message instead of at fit time.
    if not (with_svc or with_knn or with_rf):
        raise ValueError(
            "At least one of with_svc, with_knn or with_rf must be True."
        )

    estimator_list = []

    if with_rf:
        rf_clf = RandomForestClassifier(
            random_state=RANDOM_STATE, # enables deterministic behaviour
            n_jobs=-1
        )
        estimator_list.append(('random_forest', rf_clf))

    if with_knn:
        knn_clf = Pipeline([
            ('knn_dim_red', LinearDiscriminantAnalysis()),
            ('knn_clf', KNeighborsClassifier(
                weights='uniform',
                n_neighbors=300,
                leaf_size=100,
                metric='chebyshev',
                n_jobs=-1
            ))
        ])
        estimator_list.append(('knn', knn_clf))

    if with_svc:
        svc_clf = Pipeline([
            ('svc_dim_red', PCA(n_components=35)),
            ('svc_clf', LinearSVC(
                dual=False,
                C=2.105,
                class_weight="balanced",
                random_state=RANDOM_STATE
            ))
        ])
        estimator_list.append(('svc', svc_clf))

    voting_clf = VotingClassifier(
        estimators=estimator_list,
        n_jobs=-1
    )

    # Column positions refer to the matrix without the subject/night columns.
    categorical_columns = list(range(NB_CATEGORICAL_FEATURES))
    continuous_columns = list(range(NB_CATEGORICAL_FEATURES, NB_FEATURES))

    return Pipeline([
            ('scaling', ColumnTransformer([
                ('pass-through-categorical', 'passthrough', categorical_columns),
                ('scaling-continuous', StandardScaler(copy=False), continuous_columns)
            ])),
            ('voting_clf', voting_clf)
        ])


In [None]:
%%time

# Baseline run: all three estimators with default voting settings, 5 folds.
# The returned matrix is written to disk in the last cell of the notebook.
emission_matrix = cross_validate_with_confusion_matrix(
    pipeline=get_pipeline(),
    n_fold=5,
)

```
Mean accuracy          : 0.72 ± 0.030
Mean macro F1-score    : 0.64 ± 0.027
Mean weighted F1-score : 0.70 ± 0.030
Mean Kappa's agreement : 0.61 ± 0.044
CPU times: user 1min 4s, sys: 7.75 s, total: 1min 12s
Wall time: 5min 52s
```
## Hyperparameter testing
___

The hyperparameters of a Voting classifier are:
- `estimators`: list of classifiers

- `voting`: {'hard', 'soft'}
    
    If ‘hard’, uses predicted class labels for majority rule voting. Else if ‘soft’, predicts the class label based on the argmax of the sums of the predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers

- `weights`: array-like of shape (n_classifiers,), default=None

    Sequence of weights (float or int) to weight the occurrences of predicted class labels (hard voting) or class probabilities before averaging (soft voting). Uses uniform weights if None.
    

In [None]:
N_CLASSIFIER = 3
# Name of the voting step in the pipeline returned by `get_pipeline`; used as
# the prefix of the hyperparameter names tuned below.
CLASSIFIER_PIPELINE_KEY = 'voting_clf'

#### 1. `estimators`
___

We will check all combinations of our three estimators that have at least two estimators.

In [None]:

# Cross-validate each estimator subset with subject-grouped folds and print
# the aggregated scores. Entries left commented out were only part of the
# first experiment reported below.
for title, pipeline in [
#     ('-- all estimators --', get_pipeline()),
    ('-- without LinearSVC --', get_pipeline(with_svc=False)),
    ('-- without KNN --', get_pipeline(with_knn=False)),
#     ('-- without RF --', get_pipeline(with_rf=False))
]:
    scores = cross_validate(
        estimator=pipeline,
        # The subject and night columns are dropped: the pipeline addresses
        # feature columns by position starting at 0, exactly as in
        # `cross_validate_with_confusion_matrix` and in the final fit.
        X=X_train_valid[:, 2:],
        y=y_train_valid,
        groups=X_train_valid[:,SUBJECT_IDX],
        scoring={
            "agreement": make_scorer(cohen_kappa_score),
            "accuracy": 'accuracy',
            "f1-score-macro": 'f1_macro',
            "f1-score-weighted": 'f1_weighted',
        },
        # The splits are row indices, so they stay valid for the sliced X.
        cv=get_cv_iterator(n_splits=NB_KFOLDS),
        verbose=1,
        n_jobs=-1
    )

    print(title)
    print_results_cv_scores(scores)

1st experiment: [all, without_svc, without_knn, without_rf]

|Rank| With             |Mean accuracy     |Mean macro F1-score       |Mean weighted F1-score    | Mean Kappa's agreement    |
|----|------------------|------------------|--------------------------|--------------------------|---------------------------|
|1   | RF,      SVC     | 0.72 ± 0.008     | 0.65 ± 0.008             | 0.71 ± 0.013             | 0.61 ± 0.009              |
|2   | RF, KNN          | 0.72 ± 0.010     | 0.63 ± 0.010             | 0.70 ± 0.015             | 0.60 ± 0.012              |
|3   | RF, KNN, SVC     | 0.71 ± 0.005     | 0.63 ± 0.006             | 0.69 ± 0.011             | 0.60 ± 0.006              |
|4   |     KNN, SVC     | 0.70 ± 0.010     | 0.62 ± 0.009             | 0.68 ± 0.015             | 0.58 ± 0.013              |

We will run only [RF, KNN] and [RF, SVC] with more splits to have a better view on their scores.

2nd experiment: [without_svc, without_knn]

|Rank| With             |Mean accuracy     |Mean macro F1-score       |Mean weighted F1-score    | Mean Kappa's agreement    | Time |
|----|------------------|------------------|--------------------------|--------------------------|---------------------------|------|
|1   | RF,      SVC     | 0.72 ± 0.030     | 0.65 ± 0.030             | 0.71 ± 0.031             | 0.6145 ± 0.043            | 6.0 m|
|2   | RF, KNN          | 0.72 ± 0.030     | 0.63 ± 0.029             | 0.69 ± 0.030             | 0.60   ± 0.043            | 6.2 m|

We can see we obtain better results without the KNN classifier.

#### 2. `voting` hyperparameter
___

Setting this hyperparameter to `soft` involves summing the `predict_proba` results of all estimators and choosing the class that maximizes this sum. We can therefore only include classifiers that implement `predict_proba`. `LinearSVC` does not define it, so it is excluded from this tuning.

In [None]:
%%time

# Compare hard and soft voting on the RF + KNN ensemble (LinearSVC is left out).
# NOTE(review): X_train_valid still contains the subject/night columns here;
# confirm that `evaluate_hyperparams_grid` removes them before fitting.
voting_param_grid = {
    f"{CLASSIFIER_PIPELINE_KEY}__voting": ['hard', 'soft'],
}

evaluate_hyperparams_grid(
    params=voting_param_grid,
    estimator=get_pipeline(with_svc=False),
    X=X_train_valid,
    y=y_train_valid,
    cv=get_cv_iterator(n_splits=2),
    use_randomized=True,
)

1st experiment: `voting` ['hard', 'soft']

|Rank| voting           | Test score     |
|----|------------------|----------------|
|1   | soft             | 0.6155 ± 0.001 |
|2   | hard             | 0.5984 ± 0.013 |

Wall time: 4min 21s

With the preceding results, we will then compare models with RF, KNN with soft voting, which seems promising vs RF, SVC with hard voting.

In [None]:
# RF, KNN with soft voting

pipeline = get_pipeline(with_svc=False)
pipeline.set_params(voting_clf__voting='soft')

scores = cross_validate(
    estimator=pipeline,
    # The subject and night columns are dropped: the pipeline addresses
    # feature columns by position starting at 0, exactly as in
    # `cross_validate_with_confusion_matrix` and in the final fit.
    X=X_train_valid[:, 2:],
    y=y_train_valid,
    groups=X_train_valid[:,SUBJECT_IDX],
    scoring={
        "agreement": make_scorer(cohen_kappa_score),
        "accuracy": 'accuracy',
        "f1-score-macro": 'f1_macro',
        "f1-score-weighted": 'f1_weighted',
    },
    # The splits are row indices, so they stay valid for the sliced X.
    cv=get_cv_iterator(n_splits=5),
    verbose=1,
    n_jobs=-1
)

print_results_cv_scores(scores)

In [None]:
# Mean Cohen's kappa over the folds ("test_" + the "agreement" scorer name).
np.mean(scores['test_agreement'])

|Rank| With             |Mean accuracy     |Mean macro F1-score       |Mean weighted F1-score    | Mean Kappa's agreement    | Time |
|----|------------------|------------------|--------------------------|--------------------------|---------------------------|------|
|1   | RF, SVC (hard)   | 0.72 ± 0.030     | 0.65 ± 0.030             | 0.71 ± 0.031             | 0.6145 ± 0.043            | 6.0 m|
|2   | RF, KNN (soft)   | 0.72 ± 0.031     | 0.63 ± 0.026             | 0.70 ± 0.030             | 0.6154 ± 0.043            | 5.4 m|

Both models are pretty much equivalent.

#### 3. `weights` hyperparam
___

In [None]:
%%time

pipeline = get_pipeline(with_svc=False)
pipeline.set_params(voting_clf__voting='soft')

# Draw 40 candidate weight pairs (one weight per estimator: RF, KNN) from a
# flat Dirichlet distribution, so each pair is non-negative and sums to 1.
# The generator is seeded so that the same candidates are evaluated on every
# run. The candidates are passed as a list of weight vectors rather than a 2-D
# array, because recent scikit-learn releases reject multi-dimensional
# parameter arrays.
weight_rng = np.random.RandomState(RANDOM_STATE)
candidate_weights = weight_rng.dirichlet(np.ones(2), size=40).tolist()

evaluate_hyperparams_grid(
    params={
        f"{CLASSIFIER_PIPELINE_KEY}__weights": candidate_weights,
    },
    estimator=pipeline,
    X=X_train_valid,
    y=y_train_valid,
    cv=get_cv_iterator(n_splits=2),
    use_randomized=True
)

We can see that the best results give RF a bigger weight than KNN.

|Rank| weights                   | Test score     |
|----|---------------------------|----------------|
|1   | [0.83756205, 0.16243795]  | 0.6237 ± 0.002 |
|2   | [0.84033876, 0.15966124]  | 0.6236 ± 0.002 |
|3   | [0.66680774, 0.33319226]  | 0.6233 ± 0.001 |
|4   | [0.90868334, 0.09131666]  | 0.6213 ± 0.002 |
|5   | [0.95630919, 0.04369081]  | 0.6185 ± 0.003 |
|6   | [0.47451369, 0.52548631]  | 0.6185 ± 0.003 |
|7   | [0.38562595, 0.61437405]  | 0.6140 ± 0.001 |
|8   | [0.1670429, 0.8329571]    | 0.5931 ± 0.001 |
|9   | [0.14224379, 0.85775621]  | 0.5912 ± 0.001 |
|10  | [0.10095773, 0.89904227]  | 0.5879 ± 0.000 |

```
CPU times: user 1.32 s, sys: 2.16 s, total: 3.48 s
Wall time: 19min 23s
```


## Testing
___


In [None]:
%%time

# Final model: RF + KNN with soft voting and the best weights found above,
# fitted on the whole train/validation set without the subject/night columns.
testing_pipeline = get_pipeline(with_svc=False).set_params(
    voting_clf__voting='soft',
    voting_clf__weights=[0.83756205, 0.16243795],
)

# The trailing semicolon keeps the notebook from echoing the fitted estimator.
testing_pipeline.fit(X_train_valid[:, 2:], y_train_valid);

In [None]:
# Score the fitted model on the held-out test subjects
# (subject/night columns removed, as for training).
y_test_pred = testing_pipeline.predict(X_test[:,2:])
test_kappa = cohen_kappa_score(y_test, y_test_pred)
stage_names = SLEEP_STAGES_VALUES.keys()

print(confusion_matrix(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred, target_names=stage_names))

print("Agreement score (Cohen Kappa): ", test_kappa)

### Test results
___

#### 1) With default parameters and three classifiers
____
```
              precision    recall  f1-score   support

           W       0.76      0.92      0.83      1624
          N1       0.45      0.18      0.26       983
          N2       0.84      0.87      0.86      3603
          N3       0.71      0.95      0.81       611
         REM       0.69      0.63      0.66      1302

    accuracy                           0.77      8123
   macro avg       0.69      0.71      0.69      8123
weighted avg       0.74      0.77      0.75      8123

Agreement score (Cohen Kappa):  0.6714418913120306
```

#### 2) With weights, soft voting, with RF & KNN
___

```
              precision    recall  f1-score   support

           W       0.82      0.94      0.87      1624
          N1       0.43      0.15      0.22       983
          N2       0.84      0.89      0.86      3603
          N3       0.75      0.94      0.83       611
         REM       0.68      0.69      0.68      1302

    accuracy                           0.78      8123
   macro avg       0.70      0.72      0.70      8123
weighted avg       0.75      0.78      0.76      8123

Agreement score (Cohen Kappa):  0.6913101923642638
```

## Saving trained model
___

In [None]:
SAVED_DIR = "../trained_model"

# Create the output folder the first time the notebook is run.
saved_dir_missing = not os.path.exists(SAVED_DIR)
if saved_dir_missing:
    os.mkdir(SAVED_DIR)

# The file name records which age encoding the model was trained with.
model_path = (
    f"{SAVED_DIR}/classifier_voting_age_continuous.joblib"
    if USE_CONTINUOUS_AGE
    else f"{SAVED_DIR}/classifier_voting.joblib"
)

# The trailing semicolon keeps the notebook from echoing the returned file list.
joblib.dump(testing_pipeline, model_path);

In [None]:
# Persist the confusion counts accumulated during cross-validation.
# NOTE(review): `emission_matrix` comes from the baseline run with all three
# estimators, not from the tuned RF + KNN model saved above -- confirm this
# is intended.
np.save(file=f"{SAVED_DIR}/HMM_emissionprob_voting.npy", arr=emission_matrix)
