# Sleep stages classification pipeline
___

This notebook aims to construct our feature matrix from the [sleep-edf](https://physionet.org/content/sleep-edfx/1.0.0/) sleep records hosted on _physionet_.

We will reuse the chosen features from our exploration notebook.

Since all of the dataset cannot be loaded in memory at the same time, we will have to implement a pipeline, where each step can then be run with only one recording at a time. At the end of this notebook, we will be able to concatenate all resulting features in a single matrix.

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report)

import mne
from mne.datasets.sleep_physionet.age import fetch_data
from mne.time_frequency import psd_welch

In [None]:
# --- Recordings to process ---
SUBJECTS = range(20)
NIGHT_RECORDINGS = [1, 2]
# Number of epochs kept after the end of the night
NB_EPOCHS_AWAKE_MORNING = 60

# --- Signal and epoching parameters ---
SAMPLING_FREQ = 100
EPOCH_DURATION = 30.  # in seconds
# tmax is included in an epoch, so stop one sample before the next epoch
MAX_TIME = EPOCH_DURATION - 1. / SAMPLING_FREQ

EEG_CHANNELS = ['EEG Fpz-Cz', 'EEG Pz-Oz']

# --- Sleep stage labels ---
W, N1, N2, N3, REM = range(5)

# Label name -> integer label
EVENT_ID = {"W": W, "N1": N1, "N2": N2, "N3": N3, "REM": REM}

# Hypnogram annotation -> integer label (stages 3 and 4 are both mapped to N3)
ANNOTATIONS_EVENT_ID = {
    'Sleep stage W': W,
    'Sleep stage 1': N1,
    'Sleep stage 2': N2,
    'Sleep stage 3': N3,
    'Sleep stage 4': N3,
    'Sleep stage R': REM,
}

## Preprocessing
___

Our dataset consists of 39 recordings, each containing about 20 hours of EEG, EOG, EMG, and other signals, sampled at 100 or 1Hz.

### 1. Loading recording information
___

This file contains information about when a recording was started, at which time the subject went to bed and the amount of sleep he got.

In [None]:
# Per-recording metadata, one row per recording. convert_to_epochs reads the
# 'LightsOffSecond' and 'NightDuration' entries of the row it is given.
df_records = pd.read_csv("data/recordings-info.csv")
df_records.head(5)



### 2. Retrieve data from file and filter to only one channel of EEG
___

This step includes:
- Retrieving edf file
- Excluding non-EEG channels

In [None]:
def fetch_signal(psg_file_name, hypno_file_name):
    """
    Load one recording and attach its annotations to it.

    psg_file_name: path of the EDF file read with mne.io.read_raw_edf
    hypno_file_name: path of the annotations file read with mne.read_annotations

    returns: mne.Raw of the whole night recording, preloaded, with the
        annotations set on it
    """
    recording = mne.io.read_raw_edf(
        psg_file_name,
        preload=True,
        stim_channel=None,
        verbose=False,
    )

    stage_annotations = mne.read_annotations(hypno_file_name)
    recording.set_annotations(stage_annotations, emit_warning=False)

    return recording

def drop_other_channels(raw_data, channel_to_keep):
    """
    Drop every channel of the recording except `channel_to_keep`.

    raw_data: mne.Raw; `drop_channels` is called directly on this object
    channel_to_keep: str, name of the only channel that must remain

    returns: the `raw_data` object that was passed in
    """
    all_channels = raw_data.info['ch_names']
    unwanted_channels = [name for name in all_channels if name != channel_to_keep]

    raw_data.drop_channels(unwanted_channels)

    return raw_data


### 3. Convert Raw signal to matrices
___

In [None]:
def convert_to_epochs(raw_data, info):
    """
    Crop a night recording to the time spent in bed and cut it into
    labelled epochs of EPOCH_DURATION seconds.

    raw_data: mne.Raw with the sleep stage annotations set. `crop` is called
        directly on this object.
    info: row of the recordings information table. Its 'LightsOffSecond' and
        'NightDuration' entries are read; both are added to a duration in
        seconds, so they are presumably expressed in seconds too (confirm
        against the CSV).

    returns:
        mne.Epochs, where the epochs are only chosen if the subject was in bed.
        y: np array of shape (nb_epochs,), which contains each epoch label.
    """
    # Number of seconds since file began
    closed_lights_time = info['LightsOffSecond']
    woke_up_time = (
        closed_lights_time
        + info['NightDuration']
        + NB_EPOCHS_AWAKE_MORNING * EPOCH_DURATION
    )

    raw_data.crop(tmin=closed_lights_time, tmax=woke_up_time)

    events, _ = mne.events_from_annotations(
        raw_data,
        event_id=ANNOTATIONS_EVENT_ID,
        chunk_duration=EPOCH_DURATION,
        verbose=False)

    # A recording does not necessarily contain every sleep stage, and
    # mne.Epochs complains about event_id entries that match no event.
    # Only the stages actually found in this recording are requested.
    stages_found = set(events[:, -1].tolist())
    event_id = {
        name: code for name, code in EVENT_ID.items() if code in stages_found
    }

    epochs = mne.Epochs(
        raw=raw_data,
        events=events,
        event_id=event_id,
        tmin=0.,
        tmax=MAX_TIME,
        preload=True,
        baseline=None,
        verbose=False)

    # The last column of the events array holds the event code, i.e. the label.
    # Copy it so that y does not alias the array owned by `epochs`.
    y = epochs.events[:, -1].copy()

    return epochs, y


def convert_to_matrices(data):
    """
    Turn epochs into a matrix of samples and a vector of labels.

    data: mne.Epochs with only one EEG channel

    returns
        - X: np.matrix of input values, one row per epoch, of size
            (nb_epochs, sampling_rate*epoch_length=3000)
        - y: Vector of observation labels (the 'condition' column of the
            data frame), of size (nb_epochs,)
    """
    df = data.to_data_frame(picks="eeg", long_format=True)
    df = df.drop(columns=['ch_type', 'channel'])
    df = df.sort_values(by=['epoch', 'time'])

    y = df[['epoch', 'condition']].drop_duplicates(keep="first")['condition'].to_numpy()

    # Split the sorted frame into epochs in a single pass instead of building
    # one boolean mask over the whole frame per epoch. sort=False keeps the
    # groups in their order of appearance, which is the sorted epoch order.
    samples_by_epoch = df.groupby('epoch', sort=False)['observation']
    X = np.matrix([samples.to_numpy() for _, samples in samples_by_epoch])

    return X, y

Complete function to apply in order to preprocess our data:

In [None]:
def preprocess(data, current_channel, convert_to_matrices=False, record_info=None):
    """
    Keep one EEG channel of a night recording and cut it into epochs.

    data: mne.Raw
        Instance of all of the night recording and all channels
    current_channel: str
        Current EEG channel
    convert_to_matrices: bool
        When True, the epochs are further converted to matrices.
    record_info: row of the recordings information table, optional
        Information about the recording being processed. When omitted, the
        row `df_records.iloc[i]` is used, where `i` is the notebook-level
        loop index (previous behaviour).

    returns
        When convert_to_matrices is False (default):
        - mne.Epochs of the selected channel
        - y: Vector of observation labels, of size (nb_epochs,)
        When convert_to_matrices is True:
        - X: Matrix of input values, of size (nb_epochs, sampling_rate*epoch_length=3000)
        - y: Vector of observation labels, of size (nb_epochs,)
    """
    if record_info is None:
        # Backward-compatible fallback on the notebook-level state
        record_info = df_records.iloc[i]

    data = drop_other_channels(data, current_channel)
    data, y = convert_to_epochs(data, record_info)

    if not convert_to_matrices:
        return data, y

    # The boolean parameter shadows the module-level function of the same
    # name, so the function has to be looked up explicitly.
    to_matrices = globals()['convert_to_matrices']
    return to_matrices(data)


## Feature extraction
___


```
        Frequency domain
        features                +----> Average delta band +-+
                +-------+       |                           |
      +-------->+  FFT  +-------+      ...                  |
      |         +-------+       |                           |
      |                         +----> Mean frequency  +----+
      |                                                     |
      |                                                     |
      |                                                     |
  X +-+                                                     +-> X'
Input |                                                     |   Features
mne.  |                         +-----> Variance +----------+   np.array
Epochs|                         |                           |   shape: (nb_epochs, nb_features)
      |         +----------+    |                           |
      +-------->+ get_data +-------------> Mean +--------------+
                +----------+    |                           |
                                |       ...                 |
                                |                           |
                                +-----> Zero cross rate ----+
        Time
        domain features

```

In [None]:
def get_data_from_epochs(epochs):
    """
    Extract the samples of single-channel epochs as a 2D array.

    epochs: mne.Epochs

    returns np array of shape (nb_epochs, sampling_rate*epoch_length)
    """
    data = epochs.get_data()

    if data.ndim == 3 and data.shape[1] == 1:
        # Only remove the channel axis: a bare squeeze() would also remove
        # the epoch axis when the recording holds a single epoch.
        return data[:, 0, :]

    # Any other layout keeps the previous behaviour
    return data.squeeze()

# The transformer is fed mne.Epochs objects rather than arrays, hence validate=False
get_data_from_epochs_transformer = FunctionTransformer(
    func=get_data_from_epochs,
    validate=False,
)

In [None]:
def get_transformer(get_feature):
    """
    Wrap a per-epoch feature function into a function applied to all epochs.

    get_feature: callable
        generates one feature from the samples of a single epoch

    returns callable taking X, the input matrix
        (nb_epochs, sampling_rate*epoch_length), and returning a nested list
        of shape (nb_epochs, 1)
    """
    def apply_to_each_epoch(X):
        # Each feature value is wrapped in its own list to form one column
        return [[get_feature(epoch_samples)] for epoch_samples in X]

    return apply_to_each_epoch

#### 1. Time domain features
___

In [None]:
# One feature per epoch: mean and standard deviation of its samples
mean_transformer = FunctionTransformer(func=get_transformer(np.mean))
std_transformer = FunctionTransformer(func=get_transformer(np.std))

In [None]:
# Time domain features, computed side by side on the same sample matrix
time_domain_feature_union = FeatureUnion(
    transformer_list=[
        ('mean', mean_transformer),
        ('std', std_transformer),
    ],
    n_jobs=-1,
)

# mne.Epochs -> sample matrix -> time domain features
time_domain_pipeline = Pipeline(steps=[
    ('epochs_to_data', get_data_from_epochs_transformer),
    ('time_domain_features', time_domain_feature_union),
])

#### 2. Frequency domain features
___

In [None]:
def eeg_power_band(epochs):
    """Extract the relative power of EEG frequency bands.

    The power spectral density returned by ``psd_welch`` between 0.5 and 30
    is normalized so that each spectrum sums to one, then averaged inside
    each of the five frequency bands (delta, theta, alpha, sigma, beta).

    Parameters
    ----------
    epochs : Epochs
        The data.

    Returns
    -------
    X : numpy array with one row per epoch
        Relative band powers, grouped band by band. There are 5 columns
        when ``epochs`` holds a single channel.
    """
    # Lower bound (included) and upper bound (excluded) of each band
    band_limits = {
        "delta": [0.5, 4.5],
        "theta": [4.5, 8.5],
        "alpha": [8.5, 11.5],
        "sigma": [11.5, 15.5],
        "beta": [15.5, 30],
    }

    spectra, frequencies = psd_welch(epochs, fmin=0.5, fmax=30.)

    # Normalize the PSDs
    spectra /= np.sum(spectra, axis=-1, keepdims=True)

    nb_epochs = len(spectra)
    band_powers = [
        spectra[:, :, (frequencies >= low) & (frequencies < high)]
        .mean(axis=-1)
        .reshape(nb_epochs, -1)
        for low, high in band_limits.values()
    ]

    return np.concatenate(band_powers, axis=1)

# The transformer is fed mne.Epochs objects rather than arrays, hence validate=False
eeg_power_bands_transformer = FunctionTransformer(
    func=eeg_power_band,
    validate=False,
)

In [None]:
# Frequency domain features, computed directly from the mne.Epochs
frequency_domain_feature_union = FeatureUnion(
    transformer_list=[
        ('power_band', eeg_power_bands_transformer),
    ],
)
# NOTE: there is no dedicated frequency domain pipeline; the union itself is
# plugged into the complete feature extraction.

#### 3. Complete feature extraction pipeline
___

In [None]:
# Complete feature extraction: time domain and frequency domain side by side
feature_union = FeatureUnion(
    transformer_list=[
        ('time_domain', time_domain_pipeline),
        ('frequency_domain', frequency_domain_feature_union),
    ],
    n_jobs=-1,
)

## Extraction
___

In [None]:
# Each entry holds the PSG file name first and the hypnogram file name second
subject_file_names = fetch_data(subjects=SUBJECTS, recording=NIGHT_RECORDINGS)
psg_file_names = [psg_name for psg_name, *_ in subject_file_names]
stage_file_names = [stage_name for _, stage_name, *_ in subject_file_names]

In [None]:
# The number of observations (epochs) is not known beforehand, so features and
# labels are accumulated in lists and converted to numpy arrays at the end.
X = []
y = []

# NOTE: the index must stay named `i`, because preprocess looks the current
# recording up in df_records through this notebook-level variable.
for i, (psg_file_name, stage_file_name) in enumerate(zip(psg_file_names, stage_file_names)):
    raw_data = fetch_signal(psg_file_name, stage_file_name)
    features_file = []

    for channel in EEG_CHANNELS:
        # Work on a copy: preprocessing drops channels and crops the recording
        X_file_channel, y_file_channel = preprocess(raw_data.copy(), channel)
        X_features = feature_union.fit_transform(X_file_channel)
        features_file.append(X_features)

        nb_epochs, nb_features = X_features.shape
        print(f"Done extracting {nb_features} features on {nb_epochs} epochs for {channel} for file {psg_file_name}\n")

        assert nb_epochs == len(y_file_channel), "Features and labels must have the same number of epochs"

    # Labels are added once per file, because both channels refer to the same epochs
    y.extend(y_file_channel)
    # The features of all channels are placed side by side for each epoch
    X.extend(np.hstack(features_file))

X = np.asarray(X)
y = np.asarray(y)
print(X.shape)
print(y.shape)

del raw_data

In [None]:
# Dimensions of the feature matrix and of the labels vector.
# NOTE: a notebook cell only displays the value of its last expression.
X.shape
y.shape

In [None]:
# Hold out 33% of the epochs for testing, with a fixed seed.
# NOTE(review): X stacks the epochs of every recording, so epochs of the same
# subject can presumably end up in both sets — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Random forest of 100 trees, seeded so that runs are repeatable
classifier = RandomForestClassifier(random_state=42, n_estimators=100)

# fit returns the fitted estimator, so the prediction can be chained
y_test_pred = classifier.fit(X_train, y_train).predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Accuracy score: {test_accuracy}")

In [None]:
# Confusion matrix between the true test labels and the predicted ones
print(confusion_matrix(y_test, y_test_pred))

In [None]:
# Pin the labels so that every stage name stays aligned with its integer code,
# even when a stage is absent from the test labels and the predictions. Plain
# lists are passed instead of a dict view.
print(classification_report(
    y_test,
    y_test_pred,
    labels=list(EVENT_ID.values()),
    target_names=list(EVENT_ID),
))