In [1]:
import sys
import gc
import os
sys.path.append('../../')

from scoring.event_detection_matrix import competition_score
from models.mlp.mlp import PyTorchMLP, LightningModel
from models.mlp.data import CustomDataSet

import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Load Model

In [3]:
checkpoint_path = "./checkpoints/mlp-2zhn0l6i.ckpt"
model = LightningModel.load_from_checkpoint(checkpoint_path).to(device)

## Load Data

In [4]:
df_validation_series_overview = pd.read_parquet('../../data/processed/lag-features-v2/validation/overview.parquet')
df_validation_events = pd.read_csv('../../data/processed/validation_events_split.csv')

In [5]:
df_validation_series_overview

Unnamed: 0,num_series_id,step,awake,series_index
0,7,0,1,0
1,7,1,1,1
2,7,2,1,2
3,7,3,1,3
4,7,4,1,4
...,...,...,...,...
16484066,276,620635,1,603354
16484067,276,620636,1,603355
16484068,276,620637,1,603356
16484069,276,620638,1,603357


In [6]:
df_validation_events = df_validation_events[df_validation_events.step.notnull()]
df_validation_events

Unnamed: 0,series_id,night,event,step,timestamp,num_series_id
0,062dbd4c95e6,1,onset,7872.0,2018-08-22T23:11:00-0400,7
1,062dbd4c95e6,1,wakeup,14484.0,2018-08-23T08:22:00-0400,7
6,062dbd4c95e6,4,onset,60720.0,2018-08-26T00:35:00-0400,7
7,062dbd4c95e6,4,wakeup,68400.0,2018-08-26T11:15:00-0400,7
8,062dbd4c95e6,5,onset,77304.0,2018-08-26T23:37:00-0400,7
...,...,...,...,...,...,...
2955,fcca183903b7,33,wakeup,565824.0,2019-04-28T06:52:00-0400,276
2956,fcca183903b7,34,onset,577344.0,2019-04-28T22:52:00-0400,276
2957,fcca183903b7,34,wakeup,584052.0,2019-04-29T08:11:00-0400,276
2958,fcca183903b7,35,onset,595344.0,2019-04-29T23:52:00-0400,276


In [7]:
#df_validation_series_overview = df_validation_series_overview[df_validation_series_overview.series_id == '038441c925bb']
#df_validation_series_overview = df_validation_series_overview[df_validation_series_overview.series_id == '038441c925bb']
print(df_validation_series_overview.shape)

(16484071, 4)


## Data

In [8]:
dirname = os.path.dirname(os.path.abspath(''))
validation_root_dir = os.path.join(dirname, "../data/processed/lag-features-v2/validation")

validation_overview = pd.read_parquet(os.path.join(validation_root_dir, 'overview.parquet'), columns=['num_series_id', 'awake', 'series_index'])
validation_overview = validation_overview.astype('int64')

validation_dataset = CustomDataSet(torch.from_numpy(validation_overview.values), validation_root_dir)
validation_dataloader = DataLoader(validation_dataset, batch_size=10000)

## Predict

In [9]:
def predict(batch):
    X, y = batch
    with torch.no_grad():
        logits = model(X.to(device))
    label = torch.argmax(logits, dim=-1)
    confidence = torch.softmax(logits, dim=-1)
    confidence_0 = confidence[:, 0]
    confidence_1 = confidence[:, 1]
    return label, confidence_0, confidence_1

In [10]:
model.eval()
label_list = []
confidence_0_list = []
confidence_1_list = []

for index, batch in enumerate(tqdm(validation_dataloader)) :
    label, confidence_0, confidence_1 = predict(batch)
    
    label_list.append(label)
    confidence_0_list.append(confidence_0)
    confidence_1_list.append(confidence_1)      

100%|██████████| 1649/1649 [05:25<00:00,  5.06it/s]


In [11]:
label_list = torch.cat(label_list).to('cpu').numpy()
confidence_0_list = torch.cat(confidence_0_list).to('cpu').numpy()
confidence_1_list = torch.cat(confidence_1_list).to('cpu').numpy()

In [12]:
df_validation_series_overview['prediction_class'] = label_list
df_validation_series_overview['prediction_confidence_0'] = confidence_0_list
df_validation_series_overview['prediction_confidence_1'] = confidence_1_list

In [13]:
df_validation_series_overview = df_validation_series_overview[['num_series_id', 'step', 'awake', 'prediction_class', 'prediction_confidence_0', 'prediction_confidence_1']]
df_validation_series_overview

Unnamed: 0,num_series_id,step,awake,prediction_class,prediction_confidence_0,prediction_confidence_1
0,7,0,1,1,0.000661,0.999339
1,7,1,1,1,0.002068,0.997932
2,7,2,1,1,0.005746,0.994254
3,7,3,1,1,0.003405,0.996595
4,7,4,1,1,0.000627,0.999373
...,...,...,...,...,...,...
16484066,276,620635,1,0,0.595211,0.404789
16484067,276,620636,1,0,0.595211,0.404789
16484068,276,620637,1,0,0.595211,0.404789
16484069,276,620638,1,0,0.595211,0.404789


## Extract Events

In [14]:
def get_events_smoothed(test_series) :
    series_ids = test_series['num_series_id'].unique()
    events = []

    for idx in tqdm(series_ids):
        # Collecting sample and normalizing features
        X = test_series[test_series.num_series_id == idx]
                
        smoothing_length = 12 * 60 # 30 Minutes
        # We average the confidence, that the participant is awake
        X["confidence_awake"] = X["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
        X["asleep"] = X["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()

        # Binarize the asleep column
        X["asleep"] = X["asleep"].round()

        # Getting predicted onset and wakeup time steps
        pred_onsets = X[X['asleep'].diff() > 0]['step'].tolist() # diff is > 0 if it changes from 0 (awake) to 1 (asleep)
        pred_wakeups = X[X['asleep'].diff() < 0]['step'].tolist() # diff is < 0 if it changes from 1 (asleep) to 0 (awake)
     
        if len(pred_onsets) > 0:

            # Ensuring all predicted sleep periods begin and end
            if min(pred_wakeups) < min(pred_onsets):
                pred_wakeups = pred_wakeups[1:]

            if max(pred_onsets) > max(pred_wakeups):
                pred_onsets = pred_onsets[:-1]

            # Keeping sleep periods longer than 30 minutes
            sleep_periods = [(onset, wakeup) for onset, wakeup in zip(pred_onsets, pred_wakeups) if wakeup - onset >= 12 * 30]

            for onset, wakeup in sleep_periods:
                # We take the score
                score = 1 - X[(X['step'] >= onset) & (X['step'] < wakeup)]['confidence_awake'].mean()

                # Adding sleep event to dataframe
                onset_row = {'row_id': len(events), 'series_id': idx, 'step': onset, 'event': 'onset', 'score': score}                
                events.append(onset_row)

                wakeup_row = {'row_id': len(events), 'series_id': idx, 'step': wakeup, 'event': 'wakeup', 'score': score}
                events.append(wakeup_row)

    return pd.DataFrame(events)

In [15]:
predicted_validation_events = get_events_smoothed(df_validation_series_overview)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["confidence_awake"] = X["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["asleep"] = X["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["score"] = X["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["smooth"] = X["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["score"] = X["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["smooth"] = X["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["score"] = X["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["smooth"] = X["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["score"] = X["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["smooth"] = X["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["score"] = X["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["smooth"] = X["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

In [16]:
predicted_validation_events

Unnamed: 0,row_id,series_id,step,event,score
0,0,7,7907,onset,0.734813
1,1,7,14624,wakeup,0.734813
2,2,7,37136,onset,0.601768
3,3,7,37998,wakeup,0.601768
4,4,7,60658,onset,0.755124
...,...,...,...,...,...
2409,2409,276,565790,wakeup,0.844873
2410,2410,276,577418,onset,0.840131
2411,2411,276,583962,wakeup,0.840131
2412,2412,276,595420,onset,0.804650


## Evaluate

### Accuracy

In [17]:
accuracy_score(df_validation_series_overview['awake'].values, df_validation_series_overview['prediction_class'].values)

0.9051713014339722

In [18]:
df_validation_events.series_id = df_validation_events.num_series_id

### Competition Score

In [19]:
competition_score(df_validation_events, predicted_validation_events)

0.4246432915501572

In [21]:
df_validation_series_overview.to_parquet('./data/predicted_series_mlp_lag_features.parquet')