In [1]:
import sys
import os
import gc
sys.path.append('../../')

from scoring.event_detection_matrix import competition_score
from models.transformer.encoder import LightningModel

from sklearn.preprocessing import StandardScaler
from joblib import load

import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import time

In [2]:
checkpoint_path = "../../models/transformer/neural-nappers/613dj1tv/checkpoints/transformer.ckpt"
model = LightningModel.load_from_checkpoint(checkpoint_path)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
model.to(device)

LightningModel(
  (model): TransformerEncoderClassifier(
    (embedding): Linear(in_features=2, out_features=64, bias=True)
    (pos_encoder): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
          )
          (linear1): Linear(in_features=64, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=64, bias=True)
          (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (classifier): ClassificationHead(
      (norm):

In [5]:
validation_overview = pd.read_parquet('../../data/processed/transformer/validation/overview.parquet', columns=['num_series_id', 'step', 'awake', 'series_index'])
df_validation_events = pd.read_csv('../../data/processed/validation_events_split.csv')

In [6]:
df_validation_events = df_validation_events[df_validation_events.step.notnull()]
df_validation_events

Unnamed: 0,series_id,night,event,step,timestamp,num_series_id
0,062dbd4c95e6,1,onset,7872.0,2018-08-22T23:11:00-0400,7
1,062dbd4c95e6,1,wakeup,14484.0,2018-08-23T08:22:00-0400,7
6,062dbd4c95e6,4,onset,60720.0,2018-08-26T00:35:00-0400,7
7,062dbd4c95e6,4,wakeup,68400.0,2018-08-26T11:15:00-0400,7
8,062dbd4c95e6,5,onset,77304.0,2018-08-26T23:37:00-0400,7
...,...,...,...,...,...,...
2955,fcca183903b7,33,wakeup,565824.0,2019-04-28T06:52:00-0400,276
2956,fcca183903b7,34,onset,577344.0,2019-04-28T22:52:00-0400,276
2957,fcca183903b7,34,wakeup,584052.0,2019-04-29T08:11:00-0400,276
2958,fcca183903b7,35,onset,595344.0,2019-04-29T23:52:00-0400,276


In [7]:
def predict(batch):
    X = batch
    with torch.no_grad():
        logits = model(X[0])
    label = torch.argmax(logits, dim=-1)
    confidence = torch.softmax(logits, dim=-1)
    confidence_0 = confidence[:, 0]
    confidence_1 = confidence[:, 1]
    return label, confidence_0, confidence_1

In [8]:
def predict_series(validation_dataloader):
    model.eval()
    label_list = []
    confidence_0_list = []
    confidence_1_list = []
    
    for index, batch in enumerate(tqdm(validation_dataloader)) :
        label, confidence_0, confidence_1 = predict(batch)

        label_list.extend(label)
        confidence_0_list.extend(confidence_0)
        confidence_1_list.extend(confidence_1)
    return label_list, confidence_0_list, confidence_1_list

In [9]:
steps = []
num_series_ids = []
true_values = []
label_list = []
confidence_0_list = []
confidence_1_list = []

starttime = time.time()

for i, num_series_id in enumerate(validation_overview.num_series_id.unique()):
    print(f'Series {i + 1} of {validation_overview.num_series_id.nunique()}')
    series_X = torch.load('../../data/processed/transformer/validation/' + str(num_series_id) + '.pt').to(device)
    validation_dataset = TensorDataset(series_X)
    validation_dataloader = DataLoader(validation_dataset, batch_size=10000)
    series_label, series_confidence_0, series_confidence_1 = predict_series(validation_dataloader)

    steps.extend(validation_overview[validation_overview.num_series_id == num_series_id]['step'])
    num_series_ids.extend(validation_overview[validation_overview.num_series_id == num_series_id]['num_series_id'])
    true_values.extend(validation_overview[validation_overview.num_series_id == num_series_id]['awake'])
    label_list.extend([tensor.item() for tensor in series_label])
    confidence_0_list.extend([tensor.item() for tensor in series_confidence_0])
    confidence_1_list.extend([tensor.item() for tensor in series_confidence_1])

print(f'Prediction took {time.time() - starttime:.2f} seconds')

Series 1 of 54


100%|██████████| 75/75 [00:41<00:00,  1.81it/s]


Series 2 of 54


100%|██████████| 28/28 [00:11<00:00,  2.44it/s]


Series 3 of 54


100%|██████████| 37/37 [00:17<00:00,  2.16it/s]


Series 4 of 54


100%|██████████| 40/40 [00:19<00:00,  2.10it/s]


Series 5 of 54


100%|██████████| 13/13 [00:01<00:00,  6.62it/s]


Series 6 of 54


100%|██████████| 43/43 [00:20<00:00,  2.05it/s]


Series 7 of 54


100%|██████████| 45/45 [00:22<00:00,  1.97it/s]


Series 8 of 54


100%|██████████| 39/39 [00:18<00:00,  2.11it/s]


Series 9 of 54


100%|██████████| 38/38 [00:18<00:00,  2.06it/s]


Series 10 of 54


100%|██████████| 47/47 [00:23<00:00,  2.00it/s]


Series 11 of 54


100%|██████████| 35/35 [00:15<00:00,  2.20it/s]


Series 12 of 54


100%|██████████| 38/38 [00:18<00:00,  2.06it/s]


Series 13 of 54


100%|██████████| 41/41 [00:19<00:00,  2.08it/s]


Series 14 of 54


100%|██████████| 30/30 [00:12<00:00,  2.36it/s]


Series 15 of 54


100%|██████████| 54/54 [00:27<00:00,  1.93it/s]


Series 16 of 54


100%|██████████| 39/39 [00:18<00:00,  2.09it/s]


Series 17 of 54


100%|██████████| 29/29 [00:12<00:00,  2.33it/s]


Series 18 of 54


100%|██████████| 9/9 [00:00<00:00, 16.57it/s]


Series 19 of 54


100%|██████████| 40/40 [00:19<00:00,  2.06it/s]


Series 20 of 54


100%|██████████| 39/39 [00:19<00:00,  2.04it/s]


Series 21 of 54


100%|██████████| 68/68 [00:36<00:00,  1.85it/s]


Series 22 of 54


100%|██████████| 13/13 [00:01<00:00,  6.65it/s]


Series 23 of 54


100%|██████████| 56/56 [00:29<00:00,  1.92it/s]


Series 24 of 54


100%|██████████| 11/11 [00:01<00:00,  7.71it/s]


Series 25 of 54


100%|██████████| 41/41 [00:19<00:00,  2.08it/s]


Series 26 of 54


100%|██████████| 38/38 [00:17<00:00,  2.14it/s]


Series 27 of 54


100%|██████████| 39/39 [00:19<00:00,  2.05it/s]


Series 28 of 54


100%|██████████| 39/39 [00:19<00:00,  2.04it/s]


Series 29 of 54


100%|██████████| 62/62 [00:33<00:00,  1.88it/s]


Series 30 of 54


100%|██████████| 28/28 [00:11<00:00,  2.44it/s]


Series 31 of 54


100%|██████████| 42/42 [00:20<00:00,  2.05it/s]


Series 32 of 54


100%|██████████| 59/59 [00:31<00:00,  1.89it/s]


Series 33 of 54


100%|██████████| 16/16 [00:03<00:00,  4.13it/s]


Series 34 of 54


100%|██████████| 14/14 [00:03<00:00,  4.03it/s]


Series 35 of 54


100%|██████████| 47/47 [00:23<00:00,  2.00it/s]


Series 36 of 54


100%|██████████| 11/11 [00:01<00:00,  7.51it/s]


Series 37 of 54


100%|██████████| 42/42 [00:20<00:00,  2.04it/s]


Series 38 of 54


100%|██████████| 40/40 [00:19<00:00,  2.02it/s]


Series 39 of 54


100%|██████████| 39/39 [00:19<00:00,  2.02it/s]


Series 40 of 54


100%|██████████| 25/25 [00:09<00:00,  2.61it/s]


Series 41 of 54


100%|██████████| 59/59 [00:31<00:00,  1.86it/s]


Series 42 of 54


100%|██████████| 32/32 [00:13<00:00,  2.29it/s]


Series 43 of 54


100%|██████████| 41/41 [00:19<00:00,  2.08it/s]


Series 44 of 54


100%|██████████| 7/7 [00:01<00:00,  5.27it/s]


Series 45 of 54


100%|██████████| 30/30 [00:13<00:00,  2.24it/s]


Series 46 of 54


100%|██████████| 40/40 [00:19<00:00,  2.09it/s]


Series 47 of 54


100%|██████████| 49/49 [00:24<00:00,  1.98it/s]


Series 48 of 54


100%|██████████| 47/47 [00:23<00:00,  1.96it/s]


Series 49 of 54


100%|██████████| 16/16 [00:03<00:00,  4.12it/s]


Series 50 of 54


100%|██████████| 64/64 [00:34<00:00,  1.83it/s]


Series 51 of 54


100%|██████████| 28/28 [00:11<00:00,  2.44it/s]


Series 52 of 54


100%|██████████| 41/41 [00:20<00:00,  2.03it/s]


Series 53 of 54


100%|██████████| 21/21 [00:07<00:00,  2.94it/s]


Series 54 of 54


100%|██████████| 63/63 [00:33<00:00,  1.87it/s]


Prediction took 1942.96 seconds


In [10]:
df_validation = pd.DataFrame(
    {'step': steps,
     'num_series_id': num_series_ids,
     'awake': true_values,
     'prediction_class': label_list,
     'prediction_confidence_0': confidence_0_list,
     'prediction_confidence_1': confidence_1_list
    })

In [11]:
df_validation

Unnamed: 0,step,num_series_id,awake,prediction_class,prediction_confidence_0,prediction_confidence_1
0,0,7,1,1,0.005660,0.994340
1,1,7,1,1,0.003542,0.996458
2,2,7,1,1,0.001510,0.998490
3,3,7,1,1,0.001320,0.998680
4,4,7,1,1,0.000846,0.999154
...,...,...,...,...,...,...
20009335,620635,276,1,1,0.107081,0.892919
20009336,620636,276,1,1,0.107081,0.892919
20009337,620637,276,1,1,0.107081,0.892919
20009338,620638,276,1,1,0.107081,0.892919


In [12]:
def get_events_smoothed(test_series) :
    series_ids = test_series['num_series_id'].unique()
    events = []

    for idx in tqdm(series_ids):
        # Collecting sample and normalizing features
        X = test_series[test_series.num_series_id == idx]
                
        smoothing_length = 12 * 30 # 30 Minutes
        # We average the confidence, that the participant is awake
        X["confidence_awake"] = X["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
        X["asleep"] = X["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()

        # Binarize the asleep column
        X["asleep"] = X["asleep"].round()

        # Getting predicted onset and wakeup time steps
        pred_onsets = X[X['asleep'].diff() > 0]['step'].tolist() # diff is > 0 if it changes from 0 (awake) to 1 (asleep)
        pred_wakeups = X[X['asleep'].diff() < 0]['step'].tolist() # diff is < 0 if it changes from 1 (asleep) to 0 (awake)
     
        if len(pred_onsets) > 0:

            # Ensuring all predicted sleep periods begin and end
            if min(pred_wakeups) < min(pred_onsets):
                pred_wakeups = pred_wakeups[1:]

            if max(pred_onsets) > max(pred_wakeups):
                pred_onsets = pred_onsets[:-1]

            # Keeping sleep periods longer than 30 minutes
            sleep_periods = [(onset, wakeup) for onset, wakeup in zip(pred_onsets, pred_wakeups) if wakeup - onset >= 12 * 30]

            for onset, wakeup in sleep_periods:
                # We take the score
                score = 1 - X[(X['step'] >= onset) & (X['step'] < wakeup)]['confidence_awake'].mean()

                # Adding sleep event to dataframe
                onset_row = {'row_id': len(events), 'series_id': idx, 'step': onset, 'event': 'onset', 'score': score}                
                events.append(onset_row)

                wakeup_row = {'row_id': len(events), 'series_id': idx, 'step': wakeup, 'event': 'wakeup', 'score': score}
                events.append(wakeup_row)

    return pd.DataFrame(events)

In [13]:
predicted_validation_events = get_events_smoothed(df_validation)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["confidence_awake"] = X["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["asleep"] = X["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

## Evaluation

### Accuracy

In [14]:
accuracy_score(df_validation['awake'].values, df_validation['prediction_class'].values)

0.8707910405840472

### Competition Score

In [15]:
df_validation_events['series_id'] = df_validation_events['num_series_id']

In [16]:
competition_score(df_validation_events, predicted_validation_events)

0.415571627903395

## Competition Score with only closest events

In [17]:
def get_closest_events(predicted_events, true_events):
    predicted_events['closest'] = 0

    for i, row in tqdm(true_events.iterrows()):
        series = predicted_events[predicted_events.series_id == row.series_id]
        correct_step = row.step
        closest_index = (series['step'] - correct_step).abs().idxmin()
    
        # Set the 'closest' column to 1 for the row with the closest value
        predicted_events.loc[closest_index, 'closest'] = 1

    closest_events = predicted_events[predicted_events.closest == 1]
    return closest_events

In [18]:
closest_events = get_closest_events(predicted_validation_events, df_validation_events)

1730it [00:00, 2294.53it/s]


In [21]:
len(predicted_validation_events)

3976

In [22]:
len(closest_events)

1725

In [23]:
competition_score(df_validation_events, closest_events)

0.5071233965237003

In [24]:
df_validation.to_parquet('./data/predicted_series.parquet')