# Sleep stages classification pipeline
___

This notebook builds our feature matrix from the [Sleep-EDF](https://physionet.org/content/sleep-edfx/1.0.0/) sleep recordings hosted on _PhysioNet_.

We will reuse the chosen features from our exploration notebook.

Since the whole dataset cannot be loaded in memory at once, we implement a pipeline in which each step can be run on a single recording at a time. At the end of this notebook, we concatenate all the resulting features into a single matrix.

In [21]:
%matplotlib inline

from multiprocessing import Pool, cpu_count

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import (train_test_split, KFold)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report, f1_score)

import mne
from mne.time_frequency import psd_welch
from scipy.stats import (skew, kurtosis)

from utils import fetch_data

We define a few constants.

In [22]:
# Number of extra epochs kept after the end of the night when cropping a recording
NB_EPOCHS_AWAKE_MORNING = 60

SAMPLING_FREQ = 100
EPOCH_DURATION = 30.  # in seconds
# tmax is included in an epoch, so an epoch stops one sample before the next one starts
MAX_TIME = EPOCH_DURATION - 1. / SAMPLING_FREQ

EEG_CHANNELS = ['EEG Fpz-Cz', 'EEG Pz-Oz']

# Integer label of each sleep stage
W, N1, N2, N3, REM = range(5)

# Annotation description found in the hypnogram files -> sleep stage label.
# Stages 3 and 4 are both mapped to N3.
ANNOTATIONS_EVENT_ID = {
    'Sleep stage W': W,
    'Sleep stage 1': N1,
    'Sleep stage 2': N2,
    'Sleep stage 3': N3,
    'Sleep stage 4': N3,
    'Sleep stage R': REM,
}

# Sleep stage name -> sleep stage label
EVENT_ID = {"W": W, "N1": N1, "N2": N2, "N3": N3, "REM": REM}

## Preprocessing
___

Our dataset consists of 39 recordings, each containing about 20 hours of EEG, EOG, EMG, and other signals, sampled at either 100 Hz or 1 Hz.

### 1. Loading recording information
___

This file contains, for each recording, the time at which the recording was started, the time at which the subject turned the lights off, and the duration of the night.

In [23]:
# One row per night recording; read later by position (`iloc`) when extracting features.
df_records = pd.read_csv("data/recordings-info.csv")
df_records.head(5)

Unnamed: 0.1,Unnamed: 0,subject,night,age,sex,LightsOff,StartRecord,LightsOffSecond,NightDuration
0,0,0,1,33,1,1989-04-25 00:38:00,1989-04-24 16:13:00,30300.0,21810.0
1,1,0,2,33,1,1989-04-25 21:57:00,1989-04-25 14:50:00,25620.0,30330.0
2,2,1,1,33,1,1989-03-29 22:44:00,1989-03-29 16:49:00,21300.0,29250.0
3,3,1,2,33,1,1989-03-30 22:15:00,1989-03-30 17:00:00,18900.0,31260.0
4,4,2,1,26,1,1989-04-05 22:50:00,1989-04-05 16:48:00,21720.0,27150.0




### 2. Retrieve data from file and filter to only one channel of EEG
___

This step includes:
- Retrieving edf file
- Excluding channels that are not EEG signals

In [24]:
def fetch_signal(psg_file_name, hypno_file_name):
    """
    Loads one recording and attaches its hypnogram annotations to it.

    psg_file_name: path of the edf file holding the signals
    hypno_file_name: path of the file holding the matching annotations

    returns: mne.Raw of the whole night recording
    """
    recording = mne.io.read_raw_edf(psg_file_name, preload=True, stim_channel=None, verbose=False)
    recording.set_annotations(mne.read_annotations(hypno_file_name), emit_warning=False)

    return recording

def drop_other_channels(raw_data, channel_to_keep):
    """
    Drops every channel of the recording except `channel_to_keep`.

    raw_data: mne.Raw of the whole night recording (its channels are dropped on this instance)
    channel_to_keep: str, name of the only channel to keep

    returns: the same mne.Raw instance, after the other channels were dropped
    """
    unwanted_channels = []
    for channel_name in raw_data.info['ch_names']:
        if channel_name != channel_to_keep:
            unwanted_channels.append(channel_name)

    raw_data.drop_channels(unwanted_channels)

    return raw_data


### 3. Convert Raw signal to epochs or matrices
___

In [25]:
def convert_to_epochs(raw_data, info):
    """
    Crops the recording to the night and cuts it into labelled epochs.

    raw_data: mne.Raw with its annotations set; `crop` is called on this instance
    info: row of the recordings information, read for its
        'LightsOffSecond' and 'NightDuration' fields

    returns:
        mne.Epochs, where the epochs are only chosen if the subject was in bed.
        y: np array of shape (nb_epochs,), which contains each epoch label.
    """
    # Both bounds are expressed in seconds since the file began
    lights_off = info['LightsOffSecond']
    wake_up = lights_off + info['NightDuration'] + NB_EPOCHS_AWAKE_MORNING*EPOCH_DURATION
    latest_allowed = raw_data.n_times*(1/SAMPLING_FREQ)-1

    raw_data.crop(tmin=lights_off, tmax=min(wake_up, latest_allowed))

    events, annot_event_id = mne.events_from_annotations(
        raw_data,
        event_id=ANNOTATIONS_EVENT_ID,
        chunk_duration=EPOCH_DURATION,
        verbose=False)

    # A few files have no N3 sleep (i.e. SC4202EC-Hypnogram), so only the
    # stages that were actually found in the annotations are kept.
    stages_found = annot_event_id.values()
    event_id = {}
    for stage_name, stage_value in EVENT_ID.items():
        if stage_value in stages_found:
            event_id[stage_name] = stage_value

    epochs = mne.Epochs(
        raw=raw_data,
        events=events,
        event_id=event_id,
        tmin=0.,
        tmax=MAX_TIME,
        preload=True,
        baseline=None,
        verbose=False)

    # The last column of each event row holds the stage label
    y = np.array([event_row[-1] for event_row in epochs.events])

    return epochs, y


def convert_to_matrices(data):
    """
    Converts single-channel epochs to a plain input matrix and a label vector.

    data: mne.Epochs with only one EEG channel
    
    returns
        - X: Matrix of input values, of size (nb_epochs, sampling_rate*epoch_length=3000)
        - y: Vector of observation labels, of size (nb_epochs,)
    """
    df = data.to_data_frame(picks="eeg", long_format=True)
    df = df.drop(columns=['ch_type', 'channel'])
    # Sorting puts the samples in chronological order inside each epoch,
    # and the epochs themselves in increasing order.
    df = df.sort_values(by=['epoch', 'time'])

    y = df[['epoch', 'condition']].drop_duplicates(keep="first")['condition'].to_numpy()

    # Group the frame once rather than filtering all of its rows for every
    # epoch. `sort=False` keeps the groups in order of appearance, which is
    # the sorted epoch order here.
    # np.matrix is kept so that callers receive the same type as before.
    X = np.matrix(
        [samples.to_numpy() for _, samples in df.groupby('epoch', sort=False)['observation']]
    )

    return X, y

Complete function to apply in order to preprocess our data:

In [26]:
def preprocess(data, current_channel, df_info, convert_to_matrix=False):
    """
    Keeps one EEG channel of a recording and cuts it into labelled epochs.

    data: mne.Raw 
        Instance of all of the night recording and all channels.
        Channels are dropped and the signal is cropped on this instance,
        so pass a copy if the full recording is still needed.
    current_channel: str 
        Current EEG channel
    df_info:
        Recording information row, handed over to `convert_to_epochs`
    convert_to_matrix: bool
        Whether the epochs must be converted to plain matrices
        
    returns, when convert_to_matrix is False (default)
        - mne.Epochs of the current channel
        - y: Vector of observation labels, of size (nb_epochs,)
    returns, when convert_to_matrix is True
        - X: Matrix of input values, of size (nb_epochs, sampling_rate*epoch_length=3000)
        - y: Vector of observation labels, of size (nb_epochs,)
    """
    single_channel = drop_other_channels(data, current_channel)
    epochs, labels = convert_to_epochs(single_channel, df_info)

    if convert_to_matrix:
        return convert_to_matrices(epochs)

    return epochs, labels


## Feature extraction
___


```
        Frequency domain
        features                +----> Average delta band +-+
                +-------+       |                           |
      +-------->+  FFT  +-------+      ...                  |
      |         +-------+       |                           |
      |                         +----> Mean frequency  +----+
      |                                                     |
      |                                                     |
      |                                                     |
  X +-+                                                     +-> X'
Input |                                                     |   Features
mne.  |                         +-----> Variance +----------+   np.array
Epochs|                         |                           |   shape:
      |         +----------+    |                           |    (nb_epochs, nb_features)
      +-------->+ get_data +-------------> Mean +-----------+
                +----------+    |                           |
                                |       ...                 |
                                |                           |
                                +-----> Zero cross rate ----+
        Time
        domain features

```

We first have to define a transformer that will extract the values out of an epoch instance.

In [27]:
def get_data_from_epochs(epochs):
    """
    Extracts the signal values out of single-channel epochs.

    epochs: mne.Epochs
    
    returns np array of shape (nb_epochs, sampling_rate*epoch_length)
        when the epochs hold exactly one channel. For any other layout, the
        data is returned with all of its length-one axes removed.
    """
    data = epochs.get_data()

    # Only the channel axis must go: a bare squeeze() would also remove the
    # epoch axis when there is a single epoch, yielding a 1D array.
    if data.ndim == 3 and data.shape[1] == 1:
        return data[:, 0, :]

    return data.squeeze()

# First step of the time domain pipeline: turns the epochs into a plain array of values.
get_data_from_epochs_transformer = FunctionTransformer(get_data_from_epochs, validate=False)

We then define a skeleton function that receives a function which is called on every epoch.

In [28]:
def get_transformer(get_feature):
    """
    Builds a function which applies `get_feature` to every epoch.

    get_feature: callable
        generates one feature for each epoch

    returns a callable which takes X, the input matrix
        (nb_epochs, sampling_rate*epoch_length), and returns a nested list
        of shape (nb_epoch,1)
    """
    def transform(X):
        feature_column = []
        for epoch in X:
            # Each feature is wrapped in its own row so that the result is a column
            feature_column.append([get_feature(epoch)])
        return feature_column

    return transform

#### 1. Time domain features
___

##### a) Standard statistics
____

We extract features on the distribution of the time domain values of each epoch.

In [29]:
# One transformer per statistic; each one produces a single feature per epoch.
mean_transformer = FunctionTransformer(get_transformer(np.mean), validate=True)
std_transformer = FunctionTransformer(get_transformer(np.std), validate=True)
skew_transformer = FunctionTransformer(get_transformer(skew), validate=True)
kurtosis_transformer = FunctionTransformer(get_transformer(kurtosis), validate=True)

##### b) Mean crossing rate
____

In [30]:
def get_zero_crossing_rate(signal):
    """
    Counts the sign changes between consecutive non-zero samples.

    Samples that are exactly zero (or NaN) carry no sign and are skipped, so
    a signal going through an exact zero (e.g. [1, 0, -1]) counts as one
    crossing, whereas a signal only touching the axis (e.g. [1, 0, 1])
    counts as none.

    signal: 1D array of numeric samples
    
    Returns nb of time the signal crossed the horizontal axis
    """
    signs = np.sign(np.asarray(signal))
    signs = signs[(signs != 0) & ~np.isnan(signs)]

    return (signs[:-1] != signs[1:]).sum()

def get_mean_crossing_rate(signal):
    """
    Returns nb of time the signal crossed its own mean value
    """
    centered_signal = signal - np.mean(signal)
    return get_zero_crossing_rate(centered_signal)

# Produces one mean crossing count per epoch.
mean_crossing_rate_transformer = FunctionTransformer(get_transformer(get_mean_crossing_rate), validate=True)

##### Merging all time domain features with `FeatureUnion`
___

In [31]:
# Every time domain feature, computed side by side on the same input array.
time_domain_feature_union = FeatureUnion([
    ('mean', mean_transformer),
    ('std', std_transformer),
    ('skew', skew_transformer),
    ('kurtosis', kurtosis_transformer),
    ('mean-crossing-rate', mean_crossing_rate_transformer)
], n_jobs=1)

# The epochs are first converted to a plain array of values, on which the
# time domain features are then computed.
time_domain_pipeline = Pipeline([
    ('epochs_to_data', get_data_from_epochs_transformer),
    ('time_domain_features', time_domain_feature_union)
])

#### 2. Frequency domain features
___

In [32]:
def eeg_power_band(epochs):
    """EEG relative power band feature extraction.

    Computes, for every epoch of an ``mne.Epochs`` object, the mean relative
    power found in the delta, theta, alpha, sigma and beta frequency bands,
    in a layout that scikit-learn can consume.

    Parameters
    ----------
    epochs : Epochs
        The data.

    Returns
    -------
    X : numpy array of shape [n_samples, 5]
        Transformed data.
    """
    # Lower bound (included) and upper bound (excluded) of each band
    band_limits = {
        "delta": [0.5, 4.5],
        "theta": [4.5, 8.5],
        "alpha": [8.5, 11.5],
        "sigma": [11.5, 15.5],
        "beta": [15.5, 30],
    }

    psds, freqs = psd_welch(epochs, fmin=0.5, fmax=30.)
    # Make the powers relative: they are divided by their sum over the frequencies
    psds /= np.sum(psds, axis=-1, keepdims=True)

    nb_epochs = len(psds)
    band_features = []
    for low, high in band_limits.values():
        in_band = (freqs >= low) & (freqs < high)
        band_power = psds[:, :, in_band].mean(axis=-1)
        band_features.append(band_power.reshape(nb_epochs, -1))

    return np.concatenate(band_features, axis=1)

# Receives the epochs directly, as `eeg_power_band` computes the spectrum itself.
eeg_power_bands_transformer = FunctionTransformer(eeg_power_band, validate=False)

In [33]:
# Frequency domain features; the union currently holds a single transformer.
frequency_domain_feature_union = FeatureUnion([
    ('power_band', eeg_power_bands_transformer)
])
# Not executed, left commented out:
# frequency_domain_pipeline = make_pipeline()

#### 3. Complete feature extraction pipeline
___

In [34]:
# Complete feature extraction: time domain and frequency domain features side by side.
feature_union = FeatureUnion([
    ('time_domain', time_domain_pipeline),
    ('frequency_domain', frequency_domain_feature_union)
], n_jobs=1)

## Extraction
___

In [35]:
# Subjects and nights whose recordings are processed
SUBJECTS = range(20)
NIGHT_RECORDINGS = [1, 2]

subject_file_names = fetch_data(subjects=SUBJECTS, recording=NIGHT_RECORDINGS)

# Each entry is read as a pair: the PSG file first, then its hypnogram file
psg_file_names = [names[0] for names in subject_file_names]
stage_file_names = [names[1] for names in subject_file_names]

Using default location ~/mne_data for PHYSIONET_SLEEP...


In [36]:
%%time

def get_features(recording_index):
    """
    Extracts the features of one recording, for every channel of EEG_CHANNELS.

    recording_index: index starting at 0..nb_files-1.
        ** It does not correspond to the file indexes if we don't include the first files in the subjects range. **
    Returns
        - features X in a matrix of (nb_epochs, nb_features), where the
          features of the channels are placed side by side
        - labels y of the epochs
    """
    info_row = SUBJECTS[0] + recording_index
    print("Calculating for file ", info_row)
    df_info = df_records.iloc[info_row]
    recording = fetch_signal(psg_file_names[recording_index], stage_file_names[recording_index])

    per_channel_features = []
    for channel in EEG_CHANNELS:
        # A copy is handed over because preprocessing drops channels and crops its input
        channel_epochs, channel_labels = preprocess(recording.copy(), channel, df_info)
        X_features = feature_union.fit_transform(channel_epochs)
        per_channel_features.append(X_features)

        print(f"Done extracting {X_features.shape[1]} features on {X_features.shape[0]} epochs for {channel} for file {psg_file_names[recording_index][-16:]}\n")

        assert X_features.shape[0] == len(channel_labels), "Features and labels must have the same number of epochs"

    # The labels are returned only once, because both channels refer to the same epochs
    return np.hstack(per_channel_features), channel_labels

# Extract the features of all recordings in parallel, with one worker process per CPU.
with Pool(processes=cpu_count()) as pool:
    observations = pool.map(get_features, range(len(psg_file_names)))
    # Each observation is a (features, labels) pair: regroup features together and labels together
    X, y = zip(*observations)

Calculating for file  2
Calculating for file  0
Calculating for file  4
Calculating for file  8
Calculating for file  10
Calculating for file  6
Calculating for file  12
Calculating for file  14
Effective window size : 2.560 (s)
Effective window size : 2.560 (s)
Effective window size : 2.560 (s)
Effective window size : 2.560 (s)
Done extracting 10 features on 787 epochs for EEG Fpz-Cz for file SC4001E0-PSG.edf

Done extracting 10 features on 780 epochs for EEG Fpz-Cz for file SC4061E0-PSG.edf
Done extracting 10 features on 1035 epochs for EEG Fpz-Cz for file SC4011E0-PSG.edf

Done extracting 10 features on 897 epochs for EEG Fpz-Cz for file SC4031E0-PSG.edf


Effective window size : 2.560 (s)
Effective window size : 2.560 (s)
Done extracting 10 features on 825 epochs for EEG Fpz-Cz for file SC4051E0-PSG.edf

Done extracting 10 features on 940 epochs for EEG Fpz-Cz for file SC4071E0-PSG.edf

Effective window size : 2.560 (s)
Effective window size : 2.560 (s)
Done extracting 10 features 

In [37]:
# Stack the per-recording results into one feature matrix and one label vector.
X = np.vstack(X)
y = np.hstack(y)
print(X.shape)
print(y.shape)

(38991, 20)
(38991,)


In [39]:
NB_KFOLDS = 5

# Pin the classes and their order, so that every fold reports the same rows
# and columns, even when a sleep stage is missing from that fold. Without it,
# the report fails because the class names no longer match the classes found.
CLASS_NAMES = list(EVENT_ID.keys())
CLASS_LABELS = list(EVENT_ID.values())

accuracies = []
f1_scores = []

# The folds are not shuffled: each test fold is a contiguous range of epochs.
for train_index, test_index in KFold(n_splits=NB_KFOLDS).split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # A new classifier is trained from scratch for every fold
    classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    classifier.fit(X_train, y_train)
    y_test_pred = classifier.predict(X_test)
    accuracies.append(round(accuracy_score(y_test, y_test_pred),2))
    # NOTE(review): a micro-averaged F1 on single-label multiclass data
    # coincides with the accuracy; consider average="macro" if a
    # class-balanced score is wanted.
    f1_scores.append(f1_score(y_test, y_test_pred, average="micro"))
    
    print(confusion_matrix(y_test, y_test_pred, labels=CLASS_LABELS))
    
    print(classification_report(y_test, y_test_pred, labels=CLASS_LABELS, target_names=CLASS_NAMES))

print(f"\n\nAccuracies accross {NB_KFOLDS} folds: {accuracies}")
print(f"Mean F1-score: {np.mean(f1_scores):0.2f}")

TRAIN: [ 7799  7800  7801 ... 38988 38989 38990] TEST: [   0    1    2 ... 7796 7797 7798]
[[ 701   45    8    3   28]
 [ 177   82  161   10  312]
 [  32    6 3202  172  304]
 [   5    0   51 1064    0]
 [  41   42  112    2 1239]]
              precision    recall  f1-score   support

           W       0.73      0.89      0.81       785
          N1       0.47      0.11      0.18       742
          N2       0.91      0.86      0.88      3716
          N3       0.85      0.95      0.90      1120
         REM       0.66      0.86      0.75      1436

    accuracy                           0.81      7799
   macro avg       0.72      0.74      0.70      7799
weighted avg       0.79      0.81      0.79      7799

TRAIN: [    0     1     2 ... 38988 38989 38990] TEST: [ 7799  7800  7801 ... 15594 15595 15596]
[[1096   28   13    8   38]
 [ 178  126  109    0  316]
 [  32   40 3103   25  273]
 [   4    0  199  841    1]
 [  68  145  107    0 1048]]
              precision    recall  f1-sco