In [None]:
import pandas as pd
import gc
from joblib import load
import math
import numpy as np

# Data
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset


# Modeling
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torchmetrics import MetricCollection
from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall, MulticlassF1Score
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR

# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the whole notebook.
pd.options.mode.chained_assignment = None

In [None]:
# Column names of the look-ahead steps, nearest first: t_lag_-1 ... t_lag_-24.
LAGS_FUTURE = [f"t_lag_{i}" for i in range(-1, -25, -1)]
# Column names of the look-back steps, oldest first: t_lag_24 ... t_lag_1.
# Built as a real list: reversed() returns a one-shot iterator that is empty after its first use.
LAGS_PAST = [f"t_lag_{i}" for i in range(24, 0, -1)]
# Model input order: 24 past steps, the current step, 24 future steps (49 columns).
FEATURES = [*LAGS_PAST, 't_0', *LAGS_FUTURE]

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

## Data

### Data Cleaning

In [None]:
sequence_length = 24 * 60 * 12 # 17280 Steps = 1 Day

def data_cleaning(series_to_clean, seq_len=None):
    """Drop windows whose 'anglez' signal exactly repeats the window before them.

    The frame is walked in consecutive windows of ``seq_len`` rows. Whenever the
    'anglez' values of a window are identical to those of the preceding window,
    the later window is removed. A shorter trailing window is removed when it
    matches the beginning of the window before it.

    Args:
        series_to_clean: DataFrame with an 'anglez' column.
        seq_len: Rows per window. Defaults to the module-level ``sequence_length``.

    Returns:
        The frame without the repeated windows; remaining rows keep their index
        labels. When nothing is removed the input object itself is returned.

    Raises:
        ValueError: If ``seq_len`` is not positive.
    """
    if seq_len is None:
        seq_len = sequence_length
    if seq_len <= 0:
        raise ValueError("seq_len must be a positive number of rows")

    total_rows = len(series_to_clean)
    # Nothing to compare in an empty frame. Without this guard the loop below would
    # keep comparing two empty (and therefore equal) slices forever.
    if total_rows == 0:
        return series_to_clean

    anglez = series_to_clean['anglez']
    # Positional mask of the rows to keep; filled in one pass and applied once.
    keep = np.ones(total_rows, dtype=bool)

    window = 0
    while True:
        start = window * seq_len
        # Current window (s1) and the window right after it (s2)
        s1 = anglez.iloc[start:start + seq_len].reset_index(drop=True)
        s2 = anglez.iloc[start + seq_len:start + 2 * seq_len].reset_index(drop=True)

        # Different lengths mean s2 is the (possibly empty) tail of the series
        if len(s1) != len(s2):
            # If the tail repeats the start of the previous window, remove it as well
            if s1[:len(s2)].equals(s2):
                keep[total_rows - len(s2):] = False
            break

        # If both windows match, remove the later one
        if s1.equals(s2):
            keep[start + seq_len:start + 2 * seq_len] = False

        window += 1

    if keep.all():
        return series_to_clean

    return series_to_clean.drop(index=series_to_clean.index[~keep])

### Data Normalization

In [None]:
#scaler = load('/kaggle/input/transformer-checkpoint/scaler.pkl')
# Scaler restored with joblib; data_normalization() calls its transform() on the
# 'enmo' and 'anglez' columns (presumably it was fitted on the training data — confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts from a trusted source.
scaler = load('local-path/scaler.pkl')

def data_normalization(series_to_normalize):
    """Scale the 'enmo' and 'anglez' columns with the module-level ``scaler``.

    The two columns are overwritten in the passed frame, which is also returned.
    """
    sensor_columns = ['enmo', 'anglez']
    scaled_values = scaler.transform(series_to_normalize[sensor_columns])
    series_to_normalize[sensor_columns] = scaled_values
    return series_to_normalize

### Feature Engineering

In [None]:
def _lagged_pairs(series, lag):
    """Return one [anglez, enmo] pair per row, taken ``lag`` rows away, edge gaps filled."""
    shifted = series[['anglez', 'enmo']].shift(lag)
    # A positive lag leaves NaNs in the first rows (no earlier data): fill them backwards.
    # A negative lag leaves NaNs in the last rows (no later data): fill them forwards.
    shifted = shifted.bfill() if lag > 0 else shifted.ffill()
    return shifted.values.tolist()


def data_feature_engineering(series):
    """Build the per-row window features consumed by the model.

    Adds a 't_0' column holding the row's own [anglez, enmo] pair, then
    't_lag_1' ... 't_lag_24' (pairs from 1..24 rows earlier) followed by
    't_lag_-1' ... 't_lag_-24' (pairs from 1..24 rows later). Rows too close to
    either end of the series reuse the nearest available value.

    Args:
        series: DataFrame with 'anglez' and 'enmo' columns. It is not modified.

    Returns:
        A new DataFrame with the original columns plus the 49 pair columns and a
        fresh 0..n-1 index.
    """
    # Work on a copy so no helper columns leak into the caller's frame.
    series = series.copy()

    series['t_0'] = series[['anglez', 'enmo']].values.tolist()

    # Past lags first, then future lags, so the column order stays t_0, 1..24, -1..-24.
    for lag in [*range(1, 25), *range(-1, -25, -1)]:
        series[f't_lag_{lag}'] = _lagged_pairs(series, lag)

    return series.reset_index(drop=True)

### Data Pipeline

In [None]:
def data_pipeline(series_id, series_path='../../data/processed/validation_series_split.parquet'):
    """Load one series and run it through cleaning, normalization and feature engineering.

    Args:
        series_id: Value of the 'series_id' column to load; used as a parquet
            read filter so only that series is read.
        series_path: Parquet file to read from. Defaults to the local validation
            split; on Kaggle pass
            '/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet'.

    Returns:
        The frame returned by ``data_feature_engineering`` for that series.
    """
    series = pd.read_parquet(series_path, filters=[('series_id', '=', series_id)])
    series = data_cleaning(series)
    series = data_normalization(series)
    return data_feature_engineering(series)

## Model

### Model Definition

In [None]:
class ClassificationHead(nn.Module):
    """Feed-forward head mapping an encoded window to class logits.

    The input is layer-normalized over its last dimension, flattened, and passed
    through linear layers of width 512, 256 and 128 (ReLU in between) before the
    final projection to ``n_classes`` logits.
    """

    def __init__(self, d_model, seq_len, n_classes: int = 2):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)

        # Input width followed by the hidden widths; consecutive pairs become Linear + ReLU.
        widths = (d_model * seq_len, 512, 256, 128)
        layers = [nn.Flatten()]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], n_classes))

        # Same module order as before, so the sub-module indices (seq.1, seq.3, ...) are unchanged.
        self.seq = nn.Sequential(*layers)

    def forward(self, x):
        return self.seq(self.norm(x))


class TransformerEncoderClassifier(nn.Module):
    """Transformer encoder over a fixed-length window, followed by a classification head.

    Forward pass: linear embedding of each step's features, positional encoding,
    a stack of batch-first ``nn.TransformerEncoderLayer`` blocks, then
    ``ClassificationHead``.

    Note: ``dropout`` is handed to the positional encoding only; the encoder
    layers are built with ``nn.TransformerEncoderLayer``'s own default.
    """

    def __init__(self, num_features=2, encoder_layer_nhead=4, num_layers=2, dim_model=64, num_classes=2,
                 sequence_length=49, dropout: float = 0.1):
        super().__init__()

        self.model_type = 'Transformer'

        # Keep the architecture hyperparameters on the instance.
        self.num_features = num_features
        self.encoder_layer_nhead = encoder_layer_nhead
        self.num_layers = num_layers
        self.dim_model = dim_model
        self.num_classes = num_classes
        self.sequence_length = sequence_length

        # Sub-modules are created in the same order and under the same attribute names
        # as before, so parameter names and initialization order do not change.
        self.embedding = nn.Linear(in_features=num_features, out_features=dim_model)

        self.pos_encoder = PositionalEncoding(d_model=dim_model, dropout=dropout, max_length=sequence_length)

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=dim_model, nhead=encoder_layer_nhead, batch_first=True),
            num_layers=num_layers,
        )

        self.classifier = ClassificationHead(d_model=dim_model, seq_len=sequence_length, n_classes=num_classes)

    def forward(self, src):
        embedded = self.embedding(src)
        encoded = self.encoder(self.pos_encoder(embedded))
        return self.classifier(encoded)


class LightningModel(pl.LightningModule):
    """Lightning wrapper around the transformer classifier.

    Provides the training/validation steps (cross-entropy loss plus accuracy,
    precision, recall and F1 from torchmetrics) and the optimizer setup
    (AdamW with a per-step LinearLR decay).
    """

    def __init__(self, model=None, encoder_layer_nhead=4, num_layers=2, dim_model=64, learning_rate=None):
        """
        Args:
          model:                ready-made network to wrap; when None a
                                TransformerEncoderClassifier is built from the other arguments
          encoder_layer_nhead:  attention heads per encoder layer
          num_layers:           number of encoder layers
          dim_model:            embedding width
          learning_rate:        learning rate passed to AdamW in configure_optimizers
                                NOTE(review): the default None is forwarded as-is — confirm
                                training code always sets it
        """
        super().__init__()

        # Stores the constructor arguments in self.hparams (read back in configure_optimizers).
        self.save_hyperparameters()

        # Window length, features per step and number of classes are fixed here.
        self.sequence_length = 49
        self.num_features = 2
        self.num_classes = 2
        self.encoder_layer_nhead = encoder_layer_nhead
        self.num_layers = num_layers
        self.dim_model = dim_model
        self.learning_rate = learning_rate

        if model is None:
            self.model = TransformerEncoderClassifier(self.num_features,
                                                      self.encoder_layer_nhead,
                                                      self.num_layers,
                                                      self.dim_model,
                                                      self.num_classes,
                                                      self.sequence_length)
        else:
            self.model = model

        # metrics
        metrics = MetricCollection([
            MulticlassAccuracy(num_classes=self.num_classes),
            MulticlassPrecision(num_classes=self.num_classes),
            MulticlassRecall(num_classes=self.num_classes),
            MulticlassF1Score(num_classes=self.num_classes)
        ])
        # Separate clones so train and validation keep independent state and log under their own prefix.
        self.train_metrics = metrics.clone(prefix='train_')
        self.val_metrics = metrics.clone(prefix='val_')

    def forward(self, x):
        """Return the wrapped model's output (class logits) for ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and metrics for one batch; returns the loss."""
        loss, true_labels, logits = self._shared_step(batch)

        self.log('train_loss', loss)
        self.train_metrics(logits, true_labels)
        # Training metrics are logged once per epoch, not per step.
        self.log_dict(self.train_metrics, on_epoch=True, on_step=False)

        return loss

    def validation_step(self, batch, batch_nb):
        """Compute and log the validation loss and metrics for one batch; returns the loss."""
        loss, true_labels, logits = self._shared_step(batch)

        self.log('val_loss', loss)
        self.val_metrics(logits, true_labels)
        self.log_dict(self.val_metrics)

        return loss

    def configure_optimizers(self):
        """Return AdamW plus a LinearLR schedule that is stepped after every optimizer step."""
        optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate)

        # Decay the learning-rate factor linearly from 1.0 to 0.001 over the estimated number of steps.
        scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.001, total_iters=self._num_steps())
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}

        return [optimizer], [scheduler]

    def _shared_step(self, batch):
        """Run the forward pass on a (features, labels) batch; returns (loss, labels, logits)."""
        features, true_labels = batch
        logits = self(features)

        loss = F.cross_entropy(logits, true_labels)
        return loss, true_labels, logits

    def _num_steps(self):
        """Estimate the total number of training steps as dataset_size * max_epochs // batch_size.

        Requires the trainer to have a datamodule that exposes ``train_dataloader()``
        and a ``batch_size`` attribute.
        """
        train_dataloader = self.trainer.datamodule.train_dataloader()
        dataset_size = len(train_dataloader.dataset)
        num_steps = dataset_size * self.trainer.max_epochs // self.trainer.datamodule.batch_size
        return num_steps


class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch of embeddings, then applies dropout."""

    def __init__(self, d_model: int, dropout: float = 0.1, max_length: int = 5000):
        """
        Args:
          d_model:      dimension of embeddings (even or odd)
          dropout:      randomly zeroes-out some of the input
          max_length:   max sequence length
        """
        # inherit from Module
        super().__init__()

        # initialize dropout
        self.dropout = nn.Dropout(p=dropout)

        # create tensor of 0s
        pe = torch.zeros(max_length, d_model)

        # create position column
        k = torch.arange(0, max_length).unsqueeze(1)

        # calc divisor for positional encoding (one frequency per even index)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )

        # position * frequency, shape (max_length, ceil(d_model / 2))
        angles = k * div_term

        # calc sine on even indices
        pe[:, 0::2] = torch.sin(angles)

        # calc cosine on odd indices; an odd d_model has one odd index fewer than
        # even indices, so the surplus frequency is dropped to keep the shapes aligned
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # add dimension
        pe = pe.unsqueeze(0)

        # buffers are saved in state_dict but not trained by the optimizer
        self.register_buffer("pe", pe)

    def forward(self, x: Tensor):
        """
        Args:
          x:        embeddings (batch_size, seq_length, d_model)

        Returns:
                    embeddings + positional encodings (batch_size, seq_length, d_model)
        """
        # add positional encoding to the embeddings (the buffer never requires grad)
        x = x + self.pe[:, : x.size(1)]

        # perform dropout
        return self.dropout(x)


In [None]:
#model = LightningModel.load_from_checkpoint('/kaggle/input/transformer-checkpoint/transformer.ckpt')
# Load the weights onto the selected device explicitly: prediction_batch() sends every
# batch to `device`, so the model has to live there too, whatever device the checkpoint
# was saved from.
model = LightningModel.load_from_checkpoint('/local-checkpoint/transformer.ckpt', map_location=device)
model.to(device)
# Inference only: switch dropout off.
model.eval()

In [None]:
def prediction_batch(batch):
    """Run the module-level ``model`` on one batch.

    Returns a tuple ``(label, confidence_0, confidence_1)``: the arg-max class per
    sample and the softmax probabilities of class 0 and class 1.
    """
    inputs = batch.to(device)
    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(inputs)
        probabilities = logits.softmax(dim=-1)
        predicted = logits.argmax(dim=-1)
    return predicted, probabilities[:, 0], probabilities[:, 1]

In [None]:
def prediction(series):
    """Predict a class and class probabilities for every row of a prepared series.

    Args:
        series: DataFrame with 'series_id', 'step' and every column listed in
            ``FEATURES`` (each cell holding a 2-element list).

    Returns:
        A new DataFrame with 'series_id', 'step', 'prediction_class',
        'prediction_confidence_0' and 'prediction_confidence_1'.
    """
    # Explicit copy: the prediction columns are added to this frame, not to a slice of `series`.
    predictions = series[['series_id', 'step']].copy()

    feature_values = series[FEATURES].values
    series_length, series_columns = feature_values.shape

    # Each cell is a 2-element list: stack all cells into (rows * columns, 2), then
    # restore the (rows, columns, 2) layout.
    features = torch.from_numpy(
        np.vstack(np.ravel(feature_values)).reshape(series_length, series_columns, 2)
    ).to(torch.float32)

    # The tensor stays on the CPU; prediction_batch() moves one batch at a time to the device.
    dataloader = DataLoader(TensorDataset(features), batch_size=1024)

    label_list = []
    confidence_0_list = []
    confidence_1_list = []

    for batch in dataloader:
        label, confidence_0, confidence_1 = prediction_batch(batch[0])

        label_list.append(label)
        confidence_0_list.append(confidence_0)
        confidence_1_list.append(confidence_1)

    predictions['prediction_class'] = torch.cat(label_list).cpu().numpy()
    predictions['prediction_confidence_0'] = torch.cat(confidence_0_list).cpu().numpy()
    predictions['prediction_confidence_1'] = torch.cat(confidence_1_list).cpu().numpy()

    return predictions

## Event Extraction

In [None]:
smoothing_length = 12 * 60 # 60 Minutes

def event_extraction(series):
    """Turn per-step predictions of one series into onset / wakeup events.

    Both confidence columns are smoothed with a centered rolling mean over
    ``smoothing_length`` steps. The smoothed class-0 confidence is rounded to a
    0/1 'asleep' flag; a 0 -> 1 change is an onset, a 1 -> 0 change a wakeup.
    Onsets and wakeups are paired in order and pairs shorter than 30 minutes
    (12 * 30 steps) are discarded.

    Side effect: the columns 'confidence_awake' and 'asleep' are added to ``series``.

    Args:
        series: DataFrame of a single series with 'series_id', 'step',
            'prediction_confidence_0' and 'prediction_confidence_1'.

    Returns:
        DataFrame with columns 'row_id', 'series_id', 'step', 'event', 'score',
        two rows (onset, wakeup) per kept sleep period. Empty, but with these
        columns, when no period is found.
    """
    event_columns = ['row_id', 'series_id', 'step', 'event', 'score']
    events = []

    if len(series) == 0:
        return pd.DataFrame(events, columns=event_columns)

    series_id = series["series_id"].values[0]

    series["confidence_awake"] = series["prediction_confidence_1"].rolling(smoothing_length, center=True).mean().bfill().ffill()
    series["asleep"] = series["prediction_confidence_0"].rolling(smoothing_length, center=True).mean().bfill().ffill()

    # Binarize the smoothing column
    series["asleep"] = series["asleep"].round()

    # Getting predicted onset and wakeup time steps
    asleep_change = series['asleep'].diff()
    pred_onsets = series.loc[asleep_change > 0, 'step'].tolist() # > 0: changes from 0 (awake) to 1 (asleep)
    pred_wakeups = series.loc[asleep_change < 0, 'step'].tolist() # < 0: changes from 1 (asleep) to 0 (awake)

    # Ensuring all predicted sleep periods begin and end. Each check is guarded on
    # both lists being non-empty: dropping the leading wakeup can empty the list.
    if pred_onsets and pred_wakeups and min(pred_wakeups) < min(pred_onsets):
        pred_wakeups = pred_wakeups[1:]

    if pred_onsets and pred_wakeups and max(pred_onsets) > max(pred_wakeups):
        pred_onsets = pred_onsets[:-1]

    for onset, wakeup in zip(pred_onsets, pred_wakeups):
        # Keeping sleep periods longer than 30 minutes
        if wakeup - onset < 12 * 30:
            continue

        # Scoring using mean probability over period: 1 - mean smoothed awake confidence
        in_period = (series['step'] >= onset) & (series['step'] < wakeup)
        score = 1 - series.loc[in_period, 'confidence_awake'].mean()

        # Adding sleep event to dataframe
        events.append({'row_id': len(events), 'series_id': series_id, 'step': onset, 'event': 'onset', 'score': score})
        events.append({'row_id': len(events), 'series_id': series_id, 'step': wakeup, 'event': 'wakeup', 'score': score})

    return pd.DataFrame(events, columns=event_columns)


## Execution

In [None]:
#df_test = pd.read_parquet('/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet', columns=['series_id'])
# Only the 'series_id' column is read; the frame is dropped again once the
# distinct ids have been extracted.
df_test = pd.read_parquet('../../data/processed/validation_series_split.parquet', columns=['series_id'])
series_ids = df_test['series_id'].unique()

# Release the id column before the per-series loop starts.
del df_test
gc.collect()

In [None]:
# One events frame per series; concatenated after the loop.
events_list = []

for i, series_id in enumerate(series_ids):
    print(f'Step {i+1} of {len(series_ids)}')

    # prepare -> predict -> extract events, one series at a time to bound memory use
    series_prepared = data_pipeline(series_id)
    predictions = prediction(series_prepared)
    events = event_extraction(predictions)
    events_list += [events]

    # Free the per-series frames before loading the next series.
    del series_prepared, predictions
    gc.collect()

events = pd.concat(events_list).reset_index(drop=True)

In [None]:
# Build the submission from a fresh frame instead of aliasing and mutating `events`,
# so re-running this cell always yields the same result.
# The per-series 'row_id' produced by event_extraction restarts at 0 for every series;
# it is dropped and replaced by a single global 0..n-1 'row_id' (otherwise the CSV would
# contain two 'row_id' columns).
submissions = (
    events
    .drop(columns='row_id', errors='ignore')
    .reset_index(drop=True)
    .rename_axis('row_id')
    .reset_index()
)
submissions.to_csv('/kaggle/working/submission.csv', sep=',', index=False)