# Prediction on anonymous by our classifier
___

This notebook trains our classifiers on the dataset with the anonymous subject held out, and then measures the classification accuracy obtained on that subject's recording.

The recording that was randomly selected is `SC4441EC-Hypnogram.edf`. It corresponds to a 74-year-old woman, and the subject index is therefore 44.

In [39]:
%matplotlib inline

import numpy as np
import json

from datetime import datetime, timedelta

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import (RandomForestClassifier,
                              VotingClassifier)
from sklearn.metrics import (confusion_matrix,
                             classification_report,
                             cohen_kappa_score)
from hmmlearn.hmm import MultinomialHMM

from models.model_utils import (train_test_split_according_to_age)
from constants import (SLEEP_STAGES_VALUES, EPOCH_DURATION,N_STAGES)
from models.model_utils import (print_hypnogram)

## Generate trained pipeline
____

In [28]:
# Column positions of the subject and night identifiers in the feature matrix.
SUBJECT_IDX = 0 
NIGHT_IDX = 1
# Selects which pair of feature/label files load_features() reads.
USE_CONTINUOUS_AGE = False
DOWNSIZE_SET = False  # not referenced in the visible part of this notebook
# Subjects held out of training; 44 is the anonymous subject evaluated below.
TEST_SET_SUBJECTS = [44]
# Directory holding the saved HMM parameter arrays.
MODELS_DIR = "trained_model"

# Night(s) of the held-out subject used for evaluation (only NIGHT[0] is read).
NIGHT = [1]

In [24]:
def load_features():
    """Load the feature matrix and the labels from disk and print a summary.

    The pair of ``.npy`` files that is read depends on ``USE_CONTINUOUS_AGE``.
    The loaded features are stacked vertically and the loaded labels
    horizontally, so each ends up as a single array.

    Returns:
        tuple: ``(X_init, y_init)``, the stacked features and labels.
    """
    if not USE_CONTINUOUS_AGE:
        features_path, labels_path = "data/x_features.npy", "data/y_observations.npy"
    else:
        features_path, labels_path = (
            "data/x_features-age-continuous.npy",
            "data/y_observations-age-continuous.npy",
        )

    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
    # only use it on files produced by this project.
    features = np.vstack(np.load(features_path, allow_pickle=True))
    labels = np.hstack(np.load(labels_path, allow_pickle=True))

    print(features.shape)
    print(labels.shape)

    # Some subject indexes are skipped, thus total number is below 83
    # (as we can see in https://physionet.org/content/sleep-edfx/1.0.0/)
    subject_ids = np.unique(features[:, SUBJECT_IDX])
    print("Number of subjects: ", subject_ids.shape[0])

    # A night is identified by its "<subject>-<night>" pair.
    night_keys = [
        f"{int(row[0])}-{int(row[1])}"
        for row in features[:, SUBJECT_IDX:NIGHT_IDX+1]
    ]
    print("Number of nights: ", len(np.unique(night_keys)))
    print("Subjects available: ", subject_ids)

    return features, labels

def split_data(X_init, y_init):
    """Split features and labels into a test part and a train/validation part.

    The split is delegated to ``train_test_split_according_to_age``, which is
    given ``TEST_SET_SUBJECTS`` as the subjects to hold out. The shapes of the
    four resulting arrays are printed.

    Args:
        X_init: full feature matrix.
        y_init: labels matching ``X_init``.

    Returns:
        tuple: ``(X_test, X_train_valid, y_test, y_train_valid)``.
    """
    partitions = train_test_split_according_to_age(
        X_init,
        y_init,
        use_continuous_age=USE_CONTINUOUS_AGE,
        subjects_test=TEST_SET_SUBJECTS,
    )
    X_test, X_train_valid, y_test, y_train_valid = partitions

    shapes = (X_test.shape, X_train_valid.shape, y_test.shape, y_train_valid.shape)
    print(*shapes)

    return X_test, X_train_valid, y_test, y_train_valid

# Load the full dataset, then hold out TEST_SET_SUBJECTS as the test set.
X_init, y_init = load_features()
X_test, X_train_valid, y_test, y_train_valid = split_data(X_init, y_init)

(168954, 50)
(168954,)
Number of subjects:  78
Number of nights:  153
Subjects available:  [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35.
 36. 37. 38. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52. 53. 54.
 55. 56. 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. 67. 70. 71. 72. 73. 74.
 75. 76. 77. 80. 81. 82.]
Selected subjects for the test set are:  [44]
(2223, 50) (166731, 50) (2223,) (166731,)


In [29]:
# HMM with N_STAGES hidden states (presumably one per sleep stage). It is not
# fitted in this notebook.
hmm_model = MultinomialHMM(n_components=N_STAGES)

# Its parameters are restored from arrays previously saved under MODELS_DIR.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load files produced by this project.
hmm_model.emissionprob_ = np.load(f"{MODELS_DIR}/HMM_emissionprob.npy", allow_pickle=True)
hmm_model.startprob_ = np.load(f"{MODELS_DIR}/HMM_startprob.npy", allow_pickle=True)
hmm_model.transmat_ = np.load(f"{MODELS_DIR}/HMM_transmat.npy", allow_pickle=True)

In [42]:
def get_voting_classifier_pipeline():
    """Build the unfitted soft-voting pipeline (random forest + LDA/KNN).

    The first step passes the first two columns through untouched and
    standardises the remaining ones (columns 2 to 47). The second step is a
    soft ``VotingClassifier`` combining a random forest with a KNN classifier
    that runs on an LDA projection, using fixed voting weights.

    Returns:
        sklearn.pipeline.Pipeline: the unfitted pipeline.
    """
    n_categorical = 2
    n_features = 48
    categorical_columns = list(range(n_categorical))
    continuous_columns = list(range(n_categorical, n_features))

    random_forest = RandomForestClassifier(
        random_state=42,  # enables deterministic behaviour
        n_jobs=-1,
    )
    knn_on_lda = Pipeline([
        ('knn_dim_red', LinearDiscriminantAnalysis()),
        ('knn_clf', KNeighborsClassifier(
            weights='uniform',
            n_neighbors=300,
            leaf_size=100,
            metric='chebyshev',
            n_jobs=-1,
        )),
    ])

    column_scaler = ColumnTransformer([
        ('pass-through-categorical', 'passthrough', categorical_columns),
        ('scaling-continuous', StandardScaler(copy=False), continuous_columns),
    ])
    soft_voter = VotingClassifier(
        estimators=[
            ('random_forest', random_forest),
            ('knn', knn_on_lda),
        ],
        voting='soft',
        weights=np.array([0.83756205, 0.16243795]),
        flatten_transform=False,
        n_jobs=-1,
    )

    return Pipeline([
        ('scaling', column_scaler),
        ('voting_clf', soft_voter),
    ])

# Fit the voting pipeline on the training subjects. The first two columns
# (subject and night identifiers, see SUBJECT_IDX / NIGHT_IDX) are dropped.
vtg_testing_pipeline = get_voting_classifier_pipeline()
vtg_testing_pipeline.fit(X_train_valid[:, 2:], y_train_valid)

Pipeline(memory=None,
         steps=[('scaling',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pass-through-categorical',
                                                  'passthrough', [0, 1]),
                                                 ('scaling-continuous',
                                                  StandardScaler(copy=False,
                                                                 with_mean=True,
                                                                 with_std=True),
                                                  [2, 3, 4, 5, 6, 7, 8, 9, 10,
                                                   11, 12, 13, 14, 15, 16, 17,
                                                   18, 19, 20, 21, 22, 23, 24,
                                                   25, 26, 27, 28,

In [43]:
# Keep only the epochs of the evaluated night of the held-out subject.
# The mask is computed once and uses NIGHT_IDX rather than a hard-coded column.
night_mask = X_test[:, NIGHT_IDX] == NIGHT[0]
X_test_selected_night = X_test[night_mask]
y_test_selected_night = y_test[night_mask]

# Per-epoch predictions from the voting pipeline (identifier columns dropped),
# then passed through the HMM as an observation sequence.
voting_y_test_pred = vtg_testing_pipeline.predict(X_test_selected_night[:,2:])
voting_y_test_pred = hmm_model.predict(voting_y_test_pred.reshape(-1, 1))

# Pass labels and names explicitly so rows stay aligned with the stage names
# even if a stage is absent from both the ground truth and the predictions.
stage_names = list(SLEEP_STAGES_VALUES.keys())
stage_labels = list(SLEEP_STAGES_VALUES.values())

print(confusion_matrix(y_test_selected_night, voting_y_test_pred, labels=stage_labels))
print(classification_report(
    y_test_selected_night,
    voting_y_test_pred,
    labels=stage_labels,
    target_names=stage_names,
))
print("Agreement score (Cohen Kappa): ", cohen_kappa_score(y_test_selected_night, voting_y_test_pred))

[[394  11   2   0   2]
 [ 37  66  52   0  28]
 [  5  24 321   1  54]
 [  0   0  52  39   0]
 [  0   0   0   0 100]]
              precision    recall  f1-score   support

           W       0.90      0.96      0.93       409
          N1       0.65      0.36      0.46       183
          N2       0.75      0.79      0.77       405
          N3       0.97      0.43      0.60        91
         REM       0.54      1.00      0.70       100

    accuracy                           0.77      1188
   macro avg       0.77      0.71      0.69      1188
weighted avg       0.79      0.77      0.76      1188

Agreement score (Cohen Kappa):  0.687724728854155


## Generate RF trained pipeline
___

In [38]:
%%time

def get_random_forest_model():
    """Build the unfitted random-forest pipeline.

    The first step passes the first two columns through untouched and
    standardises the remaining ones (columns 2 to 47). The second step is a
    random forest of 100 trees with a maximum depth of 24.

    Returns:
        sklearn.pipeline.Pipeline: the unfitted pipeline.
    """
    n_categorical = 2
    n_features = 48
    categorical_columns = list(range(n_categorical))
    continuous_columns = list(range(n_categorical, n_features))

    column_scaler = ColumnTransformer([
        ('pass-through-categorical', 'passthrough', categorical_columns),
        ('scaling-continuous', StandardScaler(copy=False), continuous_columns),
    ])
    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=24,
        random_state=42,  # enables deterministic behaviour
        n_jobs=-1,
    )

    return Pipeline([
        ('scaling', column_scaler),
        ('classifier', forest),
    ])

# Fit the random-forest pipeline on the training subjects. The first two
# columns (subject and night identifiers, see SUBJECT_IDX / NIGHT_IDX) are dropped.
rf_testing_pipeline = get_random_forest_model()
rf_testing_pipeline.fit(X_train_valid[:, 2:], y_train_valid)

CPU times: user 3min 51s, sys: 2.08 s, total: 3min 53s
Wall time: 1min 25s


In [44]:
# Keep only the epochs of the evaluated night of the held-out subject.
# The mask is computed once and uses NIGHT_IDX rather than a hard-coded column.
night_mask = X_test[:, NIGHT_IDX] == NIGHT[0]
X_test_selected_night = X_test[night_mask]
y_test_selected_night = y_test[night_mask]

# Per-epoch predictions from the random-forest pipeline (identifier columns
# dropped), then passed through the HMM as an observation sequence.
rf_y_test_pred = rf_testing_pipeline.predict(X_test_selected_night[:,2:])
rf_y_test_pred = hmm_model.predict(rf_y_test_pred.reshape(-1, 1))

# Pass labels and names explicitly so rows stay aligned with the stage names
# even if a stage is absent from both the ground truth and the predictions.
stage_names = list(SLEEP_STAGES_VALUES.keys())
stage_labels = list(SLEEP_STAGES_VALUES.values())

print(confusion_matrix(y_test_selected_night, rf_y_test_pred, labels=stage_labels))
print(classification_report(
    y_test_selected_night,
    rf_y_test_pred,
    labels=stage_labels,
    target_names=stage_names,
))
print("Agreement score (Cohen Kappa): ", cohen_kappa_score(y_test_selected_night, rf_y_test_pred))

[[398   8   1   0   2]
 [ 48  64  45   0  26]
 [  4  26 313   0  62]
 [  0   1  59  31   0]
 [  0   0   0   0 100]]
              precision    recall  f1-score   support

           W       0.88      0.97      0.93       409
          N1       0.65      0.35      0.45       183
          N2       0.75      0.77      0.76       405
          N3       1.00      0.34      0.51        91
         REM       0.53      1.00      0.69       100

    accuracy                           0.76      1188
   macro avg       0.76      0.69      0.67      1188
weighted avg       0.78      0.76      0.75      1188

Agreement score (Cohen Kappa):  0.6709289967448082


## Saving to display in performance page
___

She went to bed at 1989-07-04 00:18:00; interpreted as UTC, this corresponds to the Unix timestamp 615514680.

In [48]:
# Unix timestamp (seconds) of the first epoch: 1989-07-04 00:18:00 UTC.
recording_start = 615514680

# Reverse lookup (numeric stage value -> stage name), built once instead of
# rebuilding and scanning two lists for every epoch. Iterating in reverse keeps
# the first name if two names ever share a value, matching list.index().
stage_name_by_value = {
    value: name
    for name, value in reversed(list(SLEEP_STAGES_VALUES.items()))
}

# Epoch i starts i * EPOCH_DURATION seconds after the recording start. Plain
# arithmetic on the Unix timestamp avoids a round trip through naive local-time
# datetimes, which is sensitive to the machine's timezone / DST rules.
rf_hypnogram = {
    'timestamps': [
        int(recording_start + index * EPOCH_DURATION)
        for index in range(len(rf_y_test_pred))
    ],
    'stages': [
        str(stage_name_by_value[stage])
        for stage in rf_y_test_pred
    ]
}

# NOTE(review): the file name says "78yo" while the notebook introduction says
# the subject is 74 years old. Path left unchanged; confirm which one is right.
with open('data/predicted_woman78yo_sleepedf.json', 'w') as fp:
    json.dump(dict(epochs=rf_hypnogram), fp)

In [20]:
# Unix timestamp (seconds) of the first epoch: 1989-07-04 00:18:00 UTC.
recording_start = 615514680

# Reverse lookup (numeric stage value -> stage name), built once instead of
# rebuilding and scanning two lists for every epoch. Iterating in reverse keeps
# the first name if two names ever share a value, matching list.index().
stage_name_by_value = {
    value: name
    for name, value in reversed(list(SLEEP_STAGES_VALUES.items()))
}

# Epoch i starts i * EPOCH_DURATION seconds after the recording start. Plain
# arithmetic on the Unix timestamp avoids a round trip through naive local-time
# datetimes, which is sensitive to the machine's timezone / DST rules.
voting_hypnogram = {
    'timestamps': [
        int(recording_start + index * EPOCH_DURATION)
        for index in range(len(voting_y_test_pred))
    ],
    'stages': [
        str(stage_name_by_value[stage])
        for stage in voting_y_test_pred
    ]
}

