In [84]:
import pandas as pd
import gc
from joblib import load
import math
import numpy as np
from time import time

# Data
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset


# Modeling
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torchmetrics import MetricCollection
from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall, MulticlassF1Score
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR

# Disable warnings
pd.options.mode.chained_assignment = None

In [85]:
LAGS_FUTURE = [f"t_lag_{i}" for i in range(-1, -25, -1)]
LAGS_PAST = reversed([f"t_lag_{i}" for i in range(1, 25)])
FEATURES = [*LAGS_PAST, 't_0', *LAGS_FUTURE]

In [86]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


## Data

### Data Cleaning

In [87]:
### PARAMS
ANGLEZ_VARIANCE_SEQUENCE_LENGTH = 6 * 60 * 12 # 8h
ANGLEZ_REPETITION_SEQUENCE_LENGTH = 4 * 60 * 12 # 4h

CLEAN_BUFFER = 3 * 60 * 12 # 3h

def mark_clean_anglez_too_low_variance(series):
    last_step = series.iloc[-1]['step']       
    
    for current_start_step in range(0, len(series), ANGLEZ_VARIANCE_SEQUENCE_LENGTH):
        current_end_step = current_start_step + ANGLEZ_VARIANCE_SEQUENCE_LENGTH

        series_chunk = series[current_start_step:current_end_step]

        series_chunk_anglez = series_chunk['anglez'].abs()
        if not (series_chunk_anglez > 50).any():
            clean_from = max(0, current_start_step - CLEAN_BUFFER)
            clean_to = min(last_step, current_end_step + CLEAN_BUFFER)
            
            series.loc[clean_from:clean_to, 'clean'] = 1

    return series


def mark_clean_repetition(series):
    last_step = series.iloc[-1]['step']       

    for current_start_step in range(0, len(series), ANGLEZ_REPETITION_SEQUENCE_LENGTH):
        current_end_step = current_start_step + ANGLEZ_REPETITION_SEQUENCE_LENGTH

        series_chunk = series[current_start_step:current_end_step].reset_index(drop=True)

        for comparing_start_step in range(current_end_step, len(series), ANGLEZ_REPETITION_SEQUENCE_LENGTH):
            comparing_end_step = comparing_start_step + ANGLEZ_REPETITION_SEQUENCE_LENGTH
            comparing_series_chunk = series[comparing_start_step:comparing_end_step].reset_index(drop=True)

            if series_chunk['anglez'].equals(comparing_series_chunk['anglez']):
                clean_from = max(0, current_start_step - CLEAN_BUFFER)
                clean_to = min(last_step, current_end_step + CLEAN_BUFFER)
                series.loc[clean_from:clean_to, 'clean'] = 1
                
                clean_from = max(0, comparing_start_step - CLEAN_BUFFER)
                clean_to = min(last_step, comparing_end_step + CLEAN_BUFFER)
                series.loc[clean_from:clean_to, 'clean'] = 1

    return series


def data_cleaning(series):
    series['clean'] = 0

    series = mark_clean_anglez_too_low_variance(series)
    series = mark_clean_repetition(series)
    
    series = series[series.clean == 0]


    return series


### Data Normalization

In [88]:
#scaler = load('/kaggle/input/transformer-checkpoint/scaler.pkl')
scaler = load('../../data/processed/scaler.pkl')

def data_normalization(series_to_normalize):
    series_to_normalize[['enmo', 'anglez']] = scaler.transform(series_to_normalize[['enmo', 'anglez']])
    return series_to_normalize

### Feature Engineering

In [89]:
def data_feature_engineering(series):

    series['t_0'] = series[['anglez', 'enmo']].values.tolist()

    for i in range(1, 25):
        series[f'anglez_lag_{i}'] = series["anglez"].shift(i).bfill()
        series[f'enmo_lag_{i}'] = series["enmo"].shift(i).bfill()
        series[f't_lag_{i}'] = series[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
        series = series.drop(columns=[f'anglez_lag_{i}', f'enmo_lag_{i}'])

    for i in range(-1, -25, -1):
        series[f'anglez_lag_{i}'] = series["anglez"].shift(i).ffill()
        series[f'enmo_lag_{i}'] = series["enmo"].shift(i).ffill()
        series[f't_lag_{i}'] = series[[f'anglez_lag_{i}', f'enmo_lag_{i}']].values.tolist()
        series = series.drop(columns=[f'anglez_lag_{i}', f'enmo_lag_{i}'])
    
    return series.reset_index(drop=True)

### Data Pipeline

In [90]:
def data_pipeline(series_id):
    #series = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet', filters=[('series_id','=',series_id)])
    series = pd.read_parquet('../../data/processed/validation_series_split.parquet', filters=[('series_id','=',series_id)])
    series = data_cleaning(series)
    series = data_normalization(series)
    return data_feature_engineering(series)

## Model

### Model Definition

In [91]:
class ClassificationHead(nn.Module):
    def __init__(self, d_model, seq_len, n_classes: int = 2):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.seq = nn.Sequential(
            nn.Flatten(),
            nn.Linear(d_model * seq_len, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, n_classes)
        )

    def forward(self, x):
        x = self.norm(x)
        x = self.seq(x)
        return x


class TransformerEncoderClassifier(nn.Module):

    def __init__(self, num_features=2, encoder_layer_nhead=4, num_layers=2, dim_model=64, num_classes=2,
                 sequence_length=49, dropout: float = 0.1):
        super().__init__()

        self.model_type = 'Transformer'

        self.num_features = num_features
        self.encoder_layer_nhead = encoder_layer_nhead
        self.num_layers = num_layers
        self.dim_model = dim_model
        self.num_classes = num_classes
        self.sequence_length = sequence_length

        self.embedding = nn.Linear(self.num_features, self.dim_model)

        self.pos_encoder = PositionalEncoding(self.dim_model, dropout, self.sequence_length)

        encoder_layer = nn.TransformerEncoderLayer(d_model=self.dim_model,
                                                   nhead=self.encoder_layer_nhead,
                                                   batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=self.num_layers)

        self.classifier = ClassificationHead(seq_len=sequence_length, d_model=self.dim_model, n_classes=num_classes)

    def forward(self, src):
        output = self.embedding(src)
        output = self.pos_encoder(output)
        output = self.encoder(output)
        return self.classifier(output)


class LightningModel(pl.LightningModule):

    def __init__(self, model=None, encoder_layer_nhead=4, num_layers=2, dim_model=64, learning_rate=None):
        super().__init__()

        self.save_hyperparameters()

        self.sequence_length = 49
        self.num_features = 2
        self.num_classes = 2
        self.encoder_layer_nhead = encoder_layer_nhead
        self.num_layers = num_layers
        self.dim_model = dim_model
        self.learning_rate = learning_rate

        if model is None:
            self.model = TransformerEncoderClassifier(self.num_features,
                                                      self.encoder_layer_nhead,
                                                      self.num_layers,
                                                      self.dim_model,
                                                      self.num_classes,
                                                      self.sequence_length)
        else:
            self.model = model

    def forward(self, x):
        return self.model(x)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_length: int = 5000):
        """
        Args:
          d_model:      dimension of embeddings
          dropout:      randomly zeroes-out some of the input
          max_length:   max sequence length
        """
        # inherit from Module
        super().__init__()

        # initialize dropout
        self.dropout = nn.Dropout(p=dropout)

        # create tensor of 0s
        pe = torch.zeros(max_length, d_model)

        # create position column
        k = torch.arange(0, max_length).unsqueeze(1)

        # calc divisor for positional encoding
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )

        # calc sine on even indices
        pe[:, 0::2] = torch.sin(k * div_term)

        # calc cosine on odd indices
        pe[:, 1::2] = torch.cos(k * div_term)

        # add dimension
        pe = pe.unsqueeze(0)

        # buffers are saved in state_dict but not trained by the optimizer
        self.register_buffer("pe", pe)

    def forward(self, x: Tensor):
        """
        Args:
          x:        embeddings (batch_size, seq_length, d_model)

        Returns:
                    embeddings + positional encodings (batch_size, seq_length, d_model)
        """
        # add positional encoding to the embeddings
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)

        # perform dropout
        return self.dropout(x)


In [92]:
#model = LightningModel.load_from_checkpoint('/kaggle/input/transformer-checkpoint/transformer.ckpt')
model = LightningModel.load_from_checkpoint('../../models/transformer/neural-nappers/613dj1tv/checkpoints/transformer.ckpt')
model.eval()

C:\Users\Michael Jakober\anaconda3\envs\aich\lib\site-packages\pytorch_lightning\utilities\migration\utils.py:55: The loaded checkpoint was produced with Lightning v2.1.2, which is newer than your current Lightning version: v2.1.0


LightningModel(
  (model): TransformerEncoderClassifier(
    (embedding): Linear(in_features=2, out_features=64, bias=True)
    (pos_encoder): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
          )
          (linear1): Linear(in_features=64, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=64, bias=True)
          (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (classifier): ClassificationHead(
      (norm):

In [93]:
def prediction_batch(batch):
    X = batch
    with torch.no_grad():
        logits = model(X.to(device))
    label = torch.argmax(logits, dim=-1)
    confidence = torch.softmax(logits, dim=-1)
    confidence_0 = confidence[:, 0]
    confidence_1 = confidence[:, 1]
    return label, confidence_0, confidence_1

In [94]:
def prediction(series):
    predictions = series[['series_id', 'step']]
    
    label_list = []
    confidence_0_list = []
    confidence_1_list = []
    
    series_length, series_columns = series[FEATURES].values.shape
    start_time = time()
    dataset = TensorDataset(torch.from_numpy(np.array(np.ravel(series[FEATURES].values).tolist())
                                             .reshape(series_length, series_columns, 2)).to(torch.float32).to(device))
    
    dataloader = DataLoader(dataset, batch_size=10000)
    start_time = time()
    for index, batch in enumerate(dataloader):
        label, confidence_0, confidence_1 = prediction_batch(batch[0])
        
        label_list.append(label)
        confidence_0_list.append(confidence_0)
        confidence_1_list.append(confidence_1) 

    predictions['prediction_class'] = torch.cat(label_list).cpu().numpy()
    predictions['prediction_confidence_0'] = torch.cat(confidence_0_list).cpu().numpy()
    predictions['prediction_confidence_1'] = torch.cat(confidence_1_list).cpu().numpy()

    return predictions

## Event Extraction

In [95]:
smoothing_length = 12 * 60 # 60 Minutes

def event_extraction(series):
    events = []

    series_id = series["series_id"].values[0]
               
    series["confidence_awake"] = series["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
    series["asleep"] = series["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()

    # Binarize the smoothing column
    series["asleep"] = series["asleep"].round()

    # Getting predicted onset and wakeup time steps
    pred_onsets = series[series['asleep'].diff() > 0]['step'].tolist() # diff is > 0 if it changes from 0 (awake) to 1 (asleep)
    pred_wakeups = series[series['asleep'].diff() < 0]['step'].tolist() # diff is < 0 if it changes from 1 (asleep) to 0 (awake)
     
    if len(pred_onsets) > 0 and len(pred_wakeups) > 0:

        # Ensuring all predicted sleep periods begin and end
        if min(pred_wakeups) < min(pred_onsets):
            pred_wakeups = pred_wakeups[1:]

        if max(pred_onsets) > max(pred_wakeups):
            pred_onsets = pred_onsets[:-1]

        # Keeping sleep periods longer than 30 minutes
        sleep_periods = [(onset, wakeup) for onset, wakeup in zip(pred_onsets, pred_wakeups) if wakeup - onset >= 12 * 30]

        for onset, wakeup in sleep_periods :
            # Scoring using mean probability over period
            score = 1 - series[(series['step'] >= onset) & (series['step'] < wakeup)]['confidence_awake'].mean()

            # Adding sleep event to dataframe
            onset_row = {'row_id': len(events), 'series_id': series_id, 'step': onset, 'event': 'onset', 'score': score}                
            events.append(onset_row)

            wakeup_row = {'row_id': len(events), 'series_id': series_id, 'step': wakeup, 'event': 'wakeup', 'score': score}
            events.append(wakeup_row)

    return pd.DataFrame(events)


## Execution

In [97]:
#df_test = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet', columns=['series_id'])
df_test = pd.read_parquet('../../data/processed/validation_series_split.parquet', columns=['series_id'])
series_ids = df_test.series_id.unique()[1:2]
del df_test
gc.collect()

0

In [99]:
events_list = []

for i, series_id in enumerate(series_ids):
    print(f'Step {i+1} of {len(series_ids)}')
    series_prepared = data_pipeline(series_id)
    predictions = prediction(series_prepared)
    events = event_extraction(predictions)

    events_list.append(events)

    del series_prepared
    del predictions
    gc.collect()

events = pd.concat(events_list).reset_index(drop=True)

Step 1 of 1
tensor([0.9998, 0.9999, 0.9999,  ..., 0.9997, 0.9996, 0.9994])
tensor([0.9992, 0.9990, 0.9989,  ..., 0.9855, 0.9670, 0.9534])
tensor([0.9494, 0.9522, 0.9238,  ..., 0.9984, 0.9960, 0.9892])
tensor([0.9824, 0.9691, 0.9516,  ..., 0.0374, 0.0374, 0.0374])
tensor([0.0374, 0.0374, 0.0374,  ..., 0.9981, 0.9987, 0.9985])
tensor([0.9981, 0.9979, 0.9977,  ..., 0.0366, 0.0366, 0.0366])
tensor([0.0366, 0.0366, 0.0366,  ..., 1.0000, 1.0000, 1.0000])
tensor([1.0000, 1.0000, 0.9999,  ..., 0.2639, 0.2871, 0.3357])
tensor([0.4140, 0.5051, 0.5546,  ..., 0.9553, 0.9782, 0.9723])
tensor([0.9762, 0.9942, 0.9918,  ..., 0.9707, 0.9788, 0.9856])
tensor([0.9879, 0.9796, 0.9800,  ..., 0.5764, 0.5415, 0.4067])
tensor([0.4258, 0.4894, 0.5042,  ..., 0.9969, 0.9968, 0.9977])
tensor([0.9978, 0.9980, 0.9986,  ..., 0.5884, 0.6025, 0.5802])
tensor([0.5192, 0.5412, 0.5841,  ..., 0.9999, 0.9999, 0.9999])
tensor([0.9999, 0.9999, 0.9999,  ..., 0.9994, 0.9993, 0.9982])
tensor([0.9982, 0.9981, 0.9988,  ..., 0.999

In [100]:
submissions = events
submissions.reset_index(inplace=True)
submissions.rename(columns={"index": "row_id"}, inplace=True)
submissions.to_csv('submission.csv', sep=',', index=False)

In [101]:
submissions

Unnamed: 0,row_id,row_id.1,series_id,step,event,score
0,0,0,0cd1e3d0ed95,5497,onset,0.87139
1,1,1,0cd1e3d0ed95,9387,wakeup,0.87139
2,2,2,0cd1e3d0ed95,10341,onset,0.719725
3,3,3,0cd1e3d0ed95,11202,wakeup,0.719725
4,4,4,0cd1e3d0ed95,12233,onset,0.778423
5,5,5,0cd1e3d0ed95,13487,wakeup,0.778423
6,6,6,0cd1e3d0ed95,23465,onset,0.858795
7,7,7,0cd1e3d0ed95,29529,wakeup,0.858795
8,8,8,0cd1e3d0ed95,37880,onset,0.876353
9,9,9,0cd1e3d0ed95,45348,wakeup,0.876353
