# Child Mind Sleep States

This notebook demonstrates how to use KLDivLoss together with an architecture that mixes GRU cells with a UNET-style encoder/decoder.  There is plenty of room for improvement, and most improvements are easy to make.  The architecture is discussed below.

I have used the Kullback-Leibler divergence loss from torch.  

`L(y_pred, y_true) = y_true * (log y_true - log y_pred)`


My reasoning:

1. It is easily interpretable as a probability distribution
2. Ensembles are also easy to interpret, no matter how differently they are trained.

We are going to predict the critical points, i.e. where the onset and wakeup events happen.  As the target we take a Gaussian of tunable width centered on the actual event.


This, of course, draws heavily from @werus23: 

https://www.kaggle.com/code/werus23/sleep-critical-point-train/notebook

https://www.kaggle.com/code/werus23/sleep-critical-point-infer?scriptVersionId=147143158

Which is based on the wonderful discussion post of @tolgadincer: 

https://www.kaggle.com/competitions/child-mind-institute-detect-sleep-states/discussion/441470


There could be bugs and other issues in here.  No promises!

In [None]:
# Copy the metric script into the working directory so that it can be imported as `event_detection_ap` below.
!cp /kaggle/input/event-detection-ap-metric/event_detection_ap.py /kaggle/working/event_detection_ap.py

In [None]:

from plotly.offline import init_notebook_mode
init_notebook_mode()
from IPython.display import Markdown

import dateutil.relativedelta as rd
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.graph_objs import *
from plotly.tools import FigureFactory as FF

from scipy.interpolate import interp1d

from math import pi, sqrt, exp
import sklearn,sklearn.model_selection
import torch
from torch import nn,Tensor
import random
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler
from sklearn.metrics import average_precision_score
from timm.scheduler import CosineLRScheduler

import pandas as pd
import numpy as np
import gc
import math
import matplotlib.pyplot as plt
plt.style.use("ggplot")

from tqdm.auto import tqdm

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

import ctypes
import torch
from torch.utils.data import Dataset, DataLoader


import copy
import event_detection_ap as mapmetric

# Configure the imported metric module with the column names used by the
# truth and prediction frames built in this notebook.
mapmetric.series_id_column_name = 'series_id'
mapmetric.time_column_name = 'step'
mapmetric.event_column_name = 'event'
mapmetric.score_column_name = 'score'
# NOTE(review): presumably disables scoring-interval handling inside the metric — confirm against event_detection_ap.py
mapmetric.use_scoring_intervals = False
# The same tolerance list is shared by both event types (both dict values are the same list object).
tolerance_intervals = [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
tolerances = {'wakeup': tolerance_intervals, 'onset': tolerance_intervals}

class PATHS:
    """Filesystem locations of the input files used by this notebook."""

    MAIN_DIR = "/kaggle/input/child-mind-institute-detect-sleep-states/"
    SPLIT_DIR = "/kaggle/input/child-sleep-mind-split-train/"

    # CSV files, located directly under MAIN_DIR.
    SUBMISSION = f"{MAIN_DIR}sample_submission.csv"
    TRAIN_EVENTS = f"{MAIN_DIR}train_events.csv"

    # Parquet files, located directly under MAIN_DIR.
    TRAIN_SERIES = f"{MAIN_DIR}train_series.parquet"
    TEST_SERIES = f"{MAIN_DIR}test_series.parquet"

    @staticmethod
    def get_series_filename(series_id):
        """Return the path of the per-series parquet file stored under SPLIT_DIR."""
        return f"{PATHS.SPLIT_DIR}{series_id}_test_series.parquet"
    
class CFG:
    """Run-wide configuration switches."""

    # Seed handed to the random number generators.
    SEED = 42

    # Use the GPU whenever torch reports one, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        DEVICE = 'cuda'
    else:
        DEVICE = 'cpu'

    # When True, later cells restrict the run to a subset of the series.
    DEMO_MODE = False
    VERBOSE = True

In [None]:
def torch_fix_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    :param seed: integer seed applied to every generator.

    The cuDNN determinism/benchmark flags are intentionally left untouched.
    """
    seeders = (
        random.seed,             # Python random
        np.random.seed,          # Numpy
        torch.manual_seed,       # Pytorch
        torch.cuda.manual_seed,  # Pytorch, CUDA generator
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed every random number generator once, up front, with the configured seed.
torch_fix_seed(CFG.SEED)

In [None]:
def clean_memory():
    """Release as much host and GPU memory as possible (best effort).

    Runs the Python garbage collector, asks glibc to return freed heap pages
    to the operating system, and empties PyTorch's CUDA cache.
    """
    gc.collect()
    try:
        # malloc_trim is a glibc extension; on platforms without glibc the
        # library or the symbol is missing, and the trim is simply skipped.
        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (OSError, AttributeError):
        pass
    torch.cuda.empty_cache()

In [None]:
# Load the event annotations; the cell displays the number of rows.
train_events = pd.read_csv(PATHS.TRAIN_EVENTS)
len(train_events)

First step is to rid ourselves of sequences that have no events, or sequences that have a few events.

In [None]:
def get_longest_continuous(gdf):
    """Find the longest run of consecutive nights that each have both events.

    A night counts only if it has an ``onset`` row and a ``wakeup`` row with
    no missing values.

    :param gdf: events of a single series, with at least the columns
        ``event`` and ``night``.
    :return: ``(first_night, last_night)`` of the longest run covering at
        least two nights (the earliest run wins ties), or ``(0, 0)`` when no
        such run exists.
    """
    valid = gdf.dropna()
    onset_nights = set(valid.loc[valid['event'] == 'onset', 'night'].unique())
    wakeup_nights = set(valid.loc[valid['event'] == 'wakeup', 'night'].unique())
    nights = sorted(onset_nights & wakeup_nights)

    best = 0, 0
    best_span = 0
    # None means "no run opened yet"; a numeric sentinel would be mistaken
    # for a real night when the numbering starts at 0.
    run_start = run_end = None
    for night in nights:
        if run_end is not None and night == run_end + 1:
            run_end = night
            continue
        # The current run (if any) just ended; keep it if it is the longest.
        if run_start is not None and run_end - run_start > best_span:
            best = run_start, run_end
            best_span = run_end - run_start
        run_start = run_end = night

    # Close the run that was still open when the nights ran out.
    if run_start is not None and run_end - run_start > best_span:
        best = run_start, run_end

    return best

In [None]:
# Series without any usable event, or without a run of at least two
# consecutive fully annotated nights, are collected in drop_series; for every
# other series the (first, last) night of its longest run is remembered.
drop_series = []
continuous = {}
for series_id, gdf in train_events.groupby('series_id'):
    if gdf.dropna().empty:
        drop_series.append(series_id)
        continue

    start, end = get_longest_continuous(gdf)
    if end == start:
        drop_series.append(series_id)
        continue

    continuous[series_id] = start, end

print(f'Drop {len(drop_series)}')

In [None]:
# Every series id present in the events file (series marked for dropping are still included here).
series_ids = train_events.series_id.unique()
len(series_ids)

## Utility Functions

In [None]:
def compare_predictions(valid_ds, i, net):
    """Run the network on item ``i`` of a labelled dataset.

    The series is fed through ``net`` in consecutive chunks of
    ``max_chunk_size`` steps, carrying the hidden state from chunk to chunk.

    :param valid_ds: dataset whose items are ``(X, Y)`` pairs
    :param i: index of the item to evaluate
    :param net: model called as ``net(X_chunk, h)`` and returning ``(prediction, h)``
    :return: ``(res_df, act_df)``; ``res_df`` holds the predictions after a
        softmax along axis 0, ``act_df`` the targets, both with the columns
        ``wakeup_val`` and ``onset_val``
    """
    target_columns = ['wakeup_val', 'onset_val']

    net.eval()
    with torch.no_grad():
        X, Y = valid_ds[i]
        Y = Y.to(CFG.DEVICE, non_blocking=True)
        pred = torch.zeros(Y.shape).to(CFG.DEVICE, non_blocking=True)

        h = None
        for begin in range(0, X.shape[0], max_chunk_size):
            stop = begin + max_chunk_size
            chunk = X[begin:stop].float().to(CFG.DEVICE, non_blocking=True)
            chunk_pred, h = net(chunk, h)
            # Keep only the values of the hidden state for the next chunk.
            h = [state.detach() for state in h]
            pred[begin:stop, :] = chunk_pred

            del chunk, chunk_pred
        clean_memory()

    probabilities = torch.softmax(pred.cpu(), axis=0).numpy()
    res_df = pd.DataFrame(probabilities, columns=target_columns)
    act_df = pd.DataFrame(Y.cpu().numpy(), columns=target_columns)
    return res_df, act_df

In [None]:
def get_predictions(res_df, target, SIGMA, threshold_ratio=0.1, min_gap=25):
    """Turn a predicted distribution into a list of detected events.

    Samples above ``threshold_ratio`` times the global maximum are grouped
    into clusters; a jump of more than ``min_gap`` steps between two
    consecutive above-threshold samples starts a new cluster.  The series is
    cut at every cluster start, and the maximum of each resulting segment is
    reported as one event.

    :param res_df: frame with a ``step`` column and the ``target`` column
    :param target: name of the column holding the predicted values
    :param SIGMA: width of the target distribution; the score of an event is
        the sum of ``target`` strictly within ``3 * SIGMA`` steps of its peak
    :param threshold_ratio: fraction of the global maximum a sample must
        exceed to be considered
    :param min_gap: minimum step gap separating two clusters
    :return: list of ``[step, target, score]`` entries ordered by step
    """
    q = res_df[target].max() * threshold_ratio
    above = res_df.loc[res_df[target] > q, 'step']
    cluster_starts = above[above.diff() > min_gap].tolist()

    # Segments are half-open [start, end), so each sample (including every
    # cluster start, step 0 and the final step) belongs to exactly one.
    edges = [0] + cluster_starts + [res_df['step'].max() + 1]
    span = 3 * SIGMA

    res = []
    for start_i, end_i in zip(edges[:-1], edges[1:]):
        v = res_df.loc[(res_df['step'] >= start_i) & (res_df['step'] < end_i)]
        # An empty segment has a NaN maximum, which fails this comparison.
        if v[target].max() > q:
            idx = v[target].idxmax()
            step = v.loc[idx, 'step']
            # Plain Python int so the window bounds cannot wrap around should
            # the step column use an unsigned dtype.
            center = int(step)
            in_window = (res_df['step'] > center - span) & (res_df['step'] < center + span)
            score = res_df.loc[in_window, target].sum()
            res.append([step, target, score])

    return res

In [None]:
def create_predictions(test_ds, i, net):
    """Run the network on item ``i`` of an unlabelled dataset.

    The series is fed through ``net`` in consecutive chunks of
    ``max_chunk_size`` steps, carrying the hidden state from chunk to chunk.

    :param test_ds: dataset whose items are the input tensor ``X`` only
    :param i: index of the item to predict
    :param net: model called as ``net(X_chunk, h)`` and returning ``(prediction, h)``
    :return: frame with the columns ``wakeup_val`` and ``onset_val`` holding
        the predictions after a softmax along axis 0
    """
    net.eval()
    with torch.no_grad():
        X = test_ds[i]
        # The prediction buffer takes the shape of the input itself.
        pred = torch.zeros(X.shape).to(CFG.DEVICE, non_blocking=True)

        h = None
        for begin in range(0, X.shape[0], max_chunk_size):
            stop = begin + max_chunk_size
            chunk = X[begin:stop].float().to(CFG.DEVICE, non_blocking=True)
            chunk_pred, h = net(chunk, h)
            # Keep only the values of the hidden state for the next chunk.
            h = [state.detach() for state in h]
            pred[begin:stop, :] = chunk_pred

            del chunk, chunk_pred
        clean_memory()

    probabilities = torch.softmax(pred.cpu(), axis=0).numpy()
    res_df = pd.DataFrame(probabilities, columns=['wakeup_val', 'onset_val'])
    return res_df

In [None]:
class SleepDatasetTrain(Dataset):
    """
    Dataset for Child Mind Sleep States.  We have it output just the raw anglez and enmo variables.

    Each item is one whole series.  With events supplied, an item is the pair
    ``(X, Y)`` where ``Y`` holds one normalized sum of Gaussians per event
    type; without events an item is ``X`` alone.

    :param series_ids: list of series ids in this set
    :param events: The events dataframe, or None for unlabelled data
    :param len_mult: The total length of the sequence must be a multiple of this integer
    :param continuous: dictionary of series_id to start and end points, if we want to trim to continuous series
    :param sigma: The width of the distribution to use for output.

    """
    def __init__(
        self,
        series_ids,
        events,
        len_mult,
        continuous = None,
        sigma = None
    ):
        self.series_ids = series_ids
        self.continuous = continuous
        self.len_mult = len_mult
        self.events = events
        # Without events there is nothing to build targets from, so sigma is discarded.
        self.sigma = sigma if events is not None else None

    def load_data(self, series_id):
        """Read one series and, when events are available, attach its targets.

        :param series_id: id of the series to load
        :return: dataframe truncated so its length is a multiple of ``len_mult``
        """
        filename = PATHS.get_series_filename(series_id)
        data = pd.read_parquet(filename)
        if self.events is not None:
            data = self._join_events(data, series_id)
            data = self._add_gaussian_targets(data)
        return self._truncate(data)

    def _join_events(self, data, series_id):
        """Trim the series around its selected events and join the events onto it."""
        if self.continuous is not None:
            start, end = self.continuous[series_id]
        else:
            start, end = 0, 1000000
        # Margin of 4320 steps kept before the first and after the last selected event.
        gap = 6*60*12
        tmp = self.events[(self.events.series_id == series_id) & (self.events.night >= start) & (self.events.night <= end)]
        data = data[(data.step > (tmp.step.min() - gap)) & (data.step < (tmp.step.max() + gap))]

        keys = ['series_id', 'step']
        return data.set_index(keys).join(tmp.set_index(keys)[['event', 'night']]).reset_index()

    def _add_gaussian_targets(self, data):
        """Add the ``wakeup_val`` and ``onset_val`` target columns to ``data`` in place."""
        # The constant factor cancels in the normalization below.
        norm = 1/ np.sqrt(pi / self.sigma)
        steps_i64 = data.step.astype(np.int64)
        for evt in ['wakeup', 'onset']:
            steps = data[data.event == evt]['step'].values
            col = f'{evt}_val'
            data[col] = 0.0
            for i in steps:
                x = 0.5*((steps_i64 - i)/self.sigma)**2
                data[col] += np.exp(-x)*norm
            # Normalize so the column sums to one over the series.
            data[col] /= data[col].sum()
        return data

    def _truncate(self, data):
        """Drop trailing rows so the length is a multiple of ``self.len_mult``."""
        n = int((len(data) // self.len_mult) * self.len_mult)
        return data.iloc[:n]

    def __len__(self):
        return len(self.series_ids)

    def __getitem__(self, index):
        series_id = self.series_ids[index]
        data = self.load_data(series_id)
        X = data[['anglez','enmo']].values.astype(np.float32)
        X = torch.from_numpy(X)
        if self.sigma is not None:
            Y = data[['wakeup_val', 'onset_val']].values.astype(np.float32)
            Y = torch.from_numpy(Y)
            return X, Y
        else:
            return X

## Network

Since we are detecting edge points, much like in image segmentation, I decided that a UNET-like architecture would be helpful.  Instead of downsampling to get indicators, we let the network learn the convolution.  Since wakeup events always come after onset events, there is a time component, which is why the GRU cells sit in the bottleneck of the UNET.


Possible improvements:

1. The encoder layers and decoder layers could better include more convolutions
2. We can add skip connections.
3. We could just put more indicators in the input.  
4. We can add a time component to the input.
5. We can increase the size of the hidden layers.


The Residual GRU is from 

https://www.kaggle.com/competitions/tlvmc-parkinsons-freezing-gait-prediction/discussion/416410

In [None]:
# Architecture diagram of the network defined below.
# Made using Draw.io
from IPython.display import Image
Image('/kaggle/input/grunet-diagram/grunet.jpg')

In [None]:
class ResidualBiGRU(nn.Module):
    """GRU followed by a two-layer feed-forward head, wrapped in a skip connection.

    Input and output share the same feature size ``hidden_size``, which is
    what lets the input be added back onto the output.

    :param hidden_size: feature size of the input, the GRU state and the output
    :param n_layers: number of stacked GRU layers
    :param bidir: whether the GRU is bidirectional
    """

    def __init__(self, hidden_size, n_layers=1, bidir=True):
        super().__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # A bidirectional GRU concatenates both directions on the feature axis.
        gru_features = hidden_size * (2 if bidir else 1)
        wide_features = gru_features * 2

        self.gru = nn.GRU(
            hidden_size,
            hidden_size,
            n_layers,
            batch_first=True,
            bidirectional=bidir,
        )
        self.fc1 = nn.Linear(gru_features, wide_features)
        self.ln1 = nn.LayerNorm(wide_features)
        self.fc2 = nn.Linear(wide_features, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)

    def forward(self, x, h=None):
        """Return ``(output, new_hidden_state)`` for input ``x`` and optional state ``h``."""
        activation = nn.functional.relu

        gru_out, new_h = self.gru(x, h)
        widened = activation(self.ln1(self.fc1(gru_out)))
        narrowed = activation(self.ln2(self.fc2(widened)))

        # skip connection
        return narrowed + x, new_h

class MultiResidualBiGRU(nn.Module):
    """Stack of ResidualBiGRU blocks between an input and an output projection.

    :param input_size: feature size of the input
    :param hidden_size: feature size used inside the stack
    :param out_size: feature size of the output
    :param n_layers: number of ResidualBiGRU blocks
    :param bidir: whether the GRUs are bidirectional
    """

    def __init__(self, input_size, hidden_size, out_size, n_layers, bidir=True):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.out_size = out_size
        self.n_layers = n_layers

        self.fc_in = nn.Linear(input_size, hidden_size)
        self.ln = nn.LayerNorm(hidden_size)

        blocks = []
        for _ in range(n_layers):
            blocks.append(ResidualBiGRU(hidden_size, n_layers=1, bidir=bidir))
        self.res_bigrus = nn.ModuleList(blocks)

        self.fc_out = nn.Linear(hidden_size, out_size)

    def forward(self, x, h=None):
        """Return ``(output, hidden_states)``; ``h`` holds one state per block or is None."""
        # No hidden state means the start of a sequence: every block starts fresh.
        states = [None] * self.n_layers if h is None else h

        x = nn.functional.relu(self.ln(self.fc_in(x)))

        new_h = []
        for idx, block in enumerate(self.res_bigrus):
            x, block_state = block(x, states[idx])
            new_h.append(block_state)

        # The output of the final linear layer is returned as is.
        return self.fc_out(x), new_h


I have elected to downsample using stride, but you could instead do it via some pooling method.

In [None]:
class EncoderLayer(nn.Module):
    """1-D convolution, optional LayerNorm over the channels, then ReLU.

    The input is laid out as ``(..., length, channels)``; it is transposed
    for the convolution and transposed back afterwards.

    :param in_channels: number of input channels
    :param hidden_size: number of output channels
    :param kernel_size: convolution kernel size
    :param stride: convolution stride (values above 1 downsample the length)
    :param padding: convolution padding
    :param dilation: convolution dilation
    :param use_layernorm: whether to apply LayerNorm to the conv output
    :param print_shape: print intermediate shapes while running
    """
    def __init__(self, in_channels, hidden_size, kernel_size, stride, padding, dilation, use_layernorm, print_shape):
        super().__init__()
        # Forward the caller's dilation instead of silently forcing it to 1.
        self.conv = nn.Conv1d(in_channels, hidden_size, kernel_size, stride, padding=padding, dilation=dilation)
        self.ln = nn.LayerNorm(hidden_size) if use_layernorm else None
        self.print_shape = print_shape

    def forward(self, x):
        """Return the encoded tensor laid out as ``(..., new_length, hidden_size)``."""
        x = self.conv(x.transpose(-1,-2))
        if self.print_shape:
            print('After Conv', x.shape)
        # Back to channels-last so LayerNorm normalizes over the channels.
        x = x.transpose(-1, -2)
        if self.ln is not None:
            x = self.ln(x)
        if self.print_shape:
            print('After Layernorm', x.shape)
        x = nn.functional.relu(x)
        return x

In [None]:
class GRUNET(nn.Module):
    """UNET-like network with residual GRU blocks in the bottleneck.

    The encoder is a chain of strided EncoderLayer convolutions, the
    bottleneck a stack of ResidualBiGRU blocks, and the decoder mirrors the
    encoder with transposed convolutions, each followed by two ordinary
    convolutions.  A final 1x1 convolution maps to ``out_channels``.

    :param arch: list of ``(in_channels, out_channels, stride, kernel_size)``
        tuples, one per encoder layer; the decoder walks it in reverse
    :param out_channels: number of output channels
    :param kernel_size: stored, and used only to derive ``self.padding``; the
        layers take their kernel sizes from ``arch``
    :param stride: stored only; the layers take their strides from ``arch``
    :param dconv_padding: accepted for compatibility; not used
    :param hidden_size: feature size of the bottleneck; must equal the output
        channels of the last ``arch`` entry
    :param n_layers: number of ResidualBiGRU blocks
    :param bidir: whether the GRUs are bidirectional
    :param print_shape: print intermediate shapes while running
    """
    def __init__(self, arch, out_channels, kernel_size, stride, dconv_padding, hidden_size, n_layers, bidir=True, print_shape=False):
        super().__init__()

        # The input width is whatever the first encoder layer consumes.
        in_channels = arch[0][0]

        self.input_size = in_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.hidden_size = hidden_size
        # self.out_size = out_size
        self.n_layers = n_layers
        self.padding = kernel_size//2
        self.print_shape = print_shape
        self.arch = arch
        self.dilation = 1
        assert arch[-1][1] == hidden_size, 'hidden_size must match the last encoder layer'

        encoder = [
            EncoderLayer(in_chan, out_chan, ksize, stride=layer_stride, padding=ksize//2, dilation=self.dilation,
                         use_layernorm=True, print_shape=print_shape)
            for in_chan, out_chan, layer_stride, ksize in self.arch
        ]
        self.conv = nn.Sequential(*encoder)

        self.res_bigrus = nn.ModuleList(
            [
                ResidualBiGRU(hidden_size, n_layers=1, bidir=bidir)
                for _ in range(n_layers)
            ]
        )

        decoder = []
        for in_chan, out_chan, layer_stride, ksize in reversed(arch):
            decoder += [
                # With an odd kernel and padding ksize//2, output_padding of
                # stride - 1 multiplies the length by exactly the stride.
                nn.ConvTranspose1d(out_chan, in_chan, ksize, stride=layer_stride, padding=ksize//2,
                                   dilation=self.dilation, output_padding=layer_stride - 1),
                nn.Conv1d(in_chan, in_chan, ksize, stride=1, padding=ksize//2, dilation=self.dilation), nn.ReLU(),
                nn.Conv1d(in_chan, in_chan, ksize, stride=1, padding=ksize//2, dilation=self.dilation), nn.ReLU(),
            ]
        self.dconv = nn.Sequential(*decoder)

        # The decoder ends with as many channels as the encoder started with.
        self.output_layer = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1)

    def forward(self, x, h=None):
        """Return ``(scores, hidden_states)`` for input ``x`` laid out as ``(..., length, channels)``.

        ``h`` holds one hidden state per GRU block, or is None at the start of
        a sequence.  The scores are the raw output of the final convolution;
        no softmax is applied here.
        """
        # if we are at the beginning of a sequence (no hidden state)
        if h is None:
            # (re)initialize the hidden state
            h = [None for _ in range(self.n_layers)]

        if self.print_shape:
            print('In', x.shape)
        x = self.conv(x)
        if self.print_shape:
            print('After EncoderLayer', x.shape)
        new_h = []
        for i, res_bigru in enumerate(self.res_bigrus):
            x, new_hi = res_bigru(x, h[i])
            new_h.append(new_hi)
        if self.print_shape:
            print('After GRU', x.shape)
        # The decoder convolutions expect channels before length.
        x = self.dconv(x.transpose(-1, -2))
        if self.print_shape:
            print('After DConv', x.shape)

        x = self.output_layer(x)
        x = x.transpose(-1,-2)

        if self.print_shape:
            print('After SmoothConv', x.shape)

        return x, new_h  # raw scores + hidden states


Specify an architecture, and make sure that the network returns a prediction that matches the shape of the input.  Really we just want it to match in length.

In [None]:
# One tuple per encoder layer: (in_channels, out_channels, stride, kernel_size).
arch = [(2, 8,  2, 17),
        (8, 32,  2, 11),
        (32, 64, 2, 7)]

in_channels = 2
hidden_size = arch[-1][1]
kernel_size=25
# The stride lives in slot 2 of an arch tuple (slot 0 is the input channel count).
stride = arch[-1][2]
dilation = 1
n_layers = 5
dconv_padding=5
# Sequence lengths must be divisible by the total downsampling factor of the encoder.
len_mult = math.prod(layer_stride for _, _, layer_stride, _ in arch)


# Smoke test: the network must return one prediction row per input row.
net = GRUNET(arch=arch,out_channels=2, hidden_size=hidden_size, kernel_size=kernel_size, stride=stride, dconv_padding=dconv_padding, n_layers=n_layers, bidir=True, print_shape=True)
X = torch.randn(1040, 2).float()
Z, h = net(X)
assert X.shape == Z.shape


## Training

We sum the loss over the entire series in order to do our loss and step.  This means a batch is each series, but they have variable length inside the batch since series have different sizes.  

In choosing the width, we start with a wide width because it helps the model converge, then tighten it at later epochs.  You can change this!  

1. I have chosen KLDivLoss, but this might not be the best.  Experiment with the different loss functions!
2. Experiment with learning rates and schedules.  Maybe it is better to restart the learning rate when we change sigma!
3. Use center of mass instead of maximum to predict the step!

In [None]:
# Maximum number of time steps passed to the network in one forward call;
# longer series are processed in consecutive chunks of this size.
max_chunk_size = 24*60*12
# NOTE(review): not referenced anywhere else in the visible code.
min_interval = 30

In [None]:
# Keep every series that was not marked for dropping, optionally cap the
# list for demo runs, then shuffle it in place.
dropped_ids = set(drop_series)
useable_series_ids = [sid for sid in series_ids if sid not in dropped_ids]
if CFG.DEMO_MODE:
    del useable_series_ids[75:]

np.random.shuffle(useable_series_ids)
len(useable_series_ids)

In [None]:
def get_sigma(epoch):
    """Return the target width for ``epoch``: 90 before epoch 4, 60 before epoch 7, then 36."""
    schedule = ((4, 90), (7, 60))
    for upper_epoch, sigma in schedule:
        if epoch < upper_epoch:
            return sigma
    return 36

In [None]:
EPOCHS = 10
# KLDivLoss takes log-probabilities as input and probabilities as target.
loss_fct = nn.KLDivLoss(reduction='sum')

# Cross-validation over series: each fold holds out one fifth of the usable
# series for validation.  Only the first fold is actually run (see the break
# at the bottom).
for fold, valid_series_ids in enumerate(np.array_split(useable_series_ids, 5)):
    print(f'Fold {fold}')
    train_series_ids = [s for s in useable_series_ids if s not in valid_series_ids]
    net = GRUNET(arch=arch,out_channels=2, hidden_size=hidden_size, kernel_size=kernel_size, stride=stride, 
                 dconv_padding=dconv_padding, n_layers=n_layers, bidir=True, print_shape=False).to(CFG.DEVICE)
    learning_rate = 1.e-3
    clip_val = 2.
    weight_decay=0.0
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # One scheduler step per training series: warm up over the first 10% of
    # all steps, then follow a cosine schedule.
    WARMUP_PROP = 0.1
    train_size = len(train_series_ids)
    steps = train_size * EPOCHS
    warmup_steps = int(steps * WARMUP_PROP)
    scheduler = CosineLRScheduler(optimizer,t_initial= steps,warmup_t=warmup_steps, warmup_lr_init=1e-5,lr_min=1e-6,)
    # Log-softmax along dim 0 turns each output column into a log-distribution over the rows of the series.
    m = nn.LogSoftmax(dim=0)

    train_loss_history = []
    valid_loss_history = []
    learning_rate_history = []
    mAP_history = []

    # with torch.autograd.detect_anomaly(check_nan=True):
    for epoch in range(EPOCHS):
        SIGMA = get_sigma(epoch)
        
        np.random.shuffle(train_series_ids)
        # Training series are trimmed to their longest continuous run of nights; validation series are not.
        train_ds = SleepDatasetTrain(train_series_ids, events=train_events, len_mult=len_mult, continuous=continuous, sigma=SIGMA)
        valid_ds = SleepDatasetTrain(valid_series_ids, events=train_events, len_mult=len_mult, sigma=SIGMA)
        print(f'Epoch {epoch}, sigma = {SIGMA}')
        net.train()
        train_loss = 0

        # One optimizer step per series.
        for i in tqdm(range(len(train_ds))):
            X, Y = train_ds[i]
            Y = Y.to(CFG.DEVICE, non_blocking=True)
            if not np.isfinite(Y.sum().cpu()):
                print(f'Nan Target {i}')

            pred = torch.zeros(Y.shape).to(CFG.DEVICE, non_blocking=True)
            optimizer.zero_grad()
            scheduler.step(i+train_size*epoch)
            h = None

            # Feed the series chunk by chunk, carrying the detached hidden state across chunk boundaries.
            seq_len = X.shape[0]
            for j in range(0, seq_len, max_chunk_size):
                X_chunk = X[j: j + max_chunk_size].float().to(CFG.DEVICE, non_blocking=True)
                y_pred, h = net(X_chunk, h)
                h = [hi.detach() for hi in h]
                pred[j: j+max_chunk_size, :] = y_pred

                del X_chunk, y_pred

            if not np.isfinite(pred.sum().cpu().detach()):
                print(f'Nan Pred before logsoftmax {i}')
            pred = m(pred.float())
            if not np.isfinite(pred.sum().cpu().detach()):
                print(f'Nan Pred after logsoftmax {i}')
            loss = loss_fct(pred.float(), Y.float())
            loss.backward()
            train_loss += loss.item()

            nn.utils.clip_grad_norm_(net.parameters(), max_norm=clip_val)
            optimizer.step()

            del pred, loss, Y, X, h
            clean_memory()
        train_loss /= len(train_ds)
        print(f'Epoch {epoch} train loss = {train_loss}')
        train_loss_history.append(train_loss)
        print(f'Learning Rate = {optimizer.param_groups[0]["lr"]}')
        learning_rate_history.append(optimizer.param_groups[0]["lr"])
        print('Evaluate Validation Loss and mAP')
        net.eval()
        val_loss = 0
        with torch.no_grad():
            for i in tqdm(range(len(valid_ds))):
                X, Y = valid_ds[i]
                Y = Y.to(CFG.DEVICE, non_blocking=True)
                pred = torch.zeros(Y.shape).to(CFG.DEVICE, non_blocking=True)

                h = None

                seq_len = X.shape[0]
                for j in range(0, seq_len, max_chunk_size):
                    X_chunk = X[j: j + max_chunk_size].float().to(CFG.DEVICE, non_blocking=True)
                    y_pred, h = net(X_chunk, h)
                    h = [hi.detach() for hi in h]
                    pred[j: j+max_chunk_size, :] = y_pred

                    del X_chunk, y_pred
                pred = m(pred.float())
                loss = loss_fct(pred.float(), Y.float())
                val_loss += loss.item()
                del pred, loss, Y, X, h
                clean_memory()
            val_loss /= len(valid_ds)

            # The event metric is skipped for the very first epoch.
            if epoch >= 1:
                all_df = []
                all_truth_df = []
                for i in tqdm(range(len(valid_ds))):
                    series_id = valid_ds.series_ids[i]
                    # print(series_id)
                    data = valid_ds.load_data(series_id)
                    res_df, act_df = compare_predictions(valid_ds, i, net)
                    res_df['step'] = data['step']
                    onset_pred = get_predictions(res_df, target='onset_val', SIGMA=SIGMA)
                    wakeup_pred = get_predictions(res_df, target='wakeup_val', SIGMA=SIGMA)
                    pred_df = pd.DataFrame(wakeup_pred + onset_pred, columns=['step', 'event', 'score'])
                    pred_df['series_id'] = series_id
                    pred_df['row_id'] = pred_df.index
                    pred_df = pred_df.sort_values(by='step').drop_duplicates(subset='step').reset_index(drop=True)

                    all_df.append(pred_df)
                    # Score only against the events inside the step range that was actually loaded.
                    all_truth_df.append(train_events[(train_events.series_id == series_id) & (train_events.step <= data.step.max()) & (train_events.step >= data.step.min())])

                pred_df = pd.concat(all_df).reset_index(drop=True)
                pred_df['row_id'] = pred_df.index
                pred_df = pred_df[['row_id', 'series_id', 'step', 'event', 'score']]
                pred_df = pred_df.sort_values(by=['series_id', 'step'])
                # Column names such as 'onset_val' become the event names 'onset' / 'wakeup'.
                pred_df.event = pred_df.event.map(lambda x: x.replace('_val', ''))
                #pred_df = renormalize(pred_df)
                truth_df = pd.concat(all_truth_df).reset_index(drop=True)
                if len(pred_df) > 0:
                    map_val = mapmetric.event_detection_ap(solution=truth_df, submission=pred_df[['series_id', 'step', 'event', 'score']], tolerances=tolerances)
                else:
                    print(f'Empty pred dataframe')
                    map_val = 0

                # Checkpoint when this is the first measured mAP or it beats
                # every earlier one, so a best-mAP checkpoint always exists
                # once the metric has been computed.
                earlier_maps = [x for x in mAP_history if not np.isnan(x)]
                if not earlier_maps or map_val > np.max(earlier_maps):
                    torch.save(net.state_dict(), f'model_best_mAP{fold}.pth')
            else:
                map_val = np.nan

        print(f'Epoch {epoch} validation loss = {val_loss}, mAP = {map_val}')
        valid_loss_history.append(val_loss)
        mAP_history.append(map_val)
        
    # Always keep the weights of the final epoch as well.
    torch.save(net.state_dict(), f'model_resid_bigru_fold{fold}.pth')
    iplot({'data': [Scatter(y=train_loss_history, name='train'), Scatter(y=valid_loss_history, name='valid')], 'layout': Layout(title=f'KLDivLoss {fold}')})
    iplot({'data': [Scatter(y=learning_rate_history, name='lr')], 'layout': Layout(title=f'Learning Rate {fold}')})
    iplot({'data': [Scatter(y=mAP_history, name='mAP')], 'layout': Layout(title=f'Event mAP {fold}')})
    print('Break after 1 to save GPU!')
    break

## Visualization

We used plotly.  Zoom in to see how the predictions hold up!

In [None]:
# Plot predicted against actual distributions for the first validation series.
i = 0
series_id = valid_ds.series_ids[i]
print(series_id)
data = valid_ds.load_data(series_id)
res_df, act_df = compare_predictions(valid_ds, i, net)
for frame in (res_df, act_df):
    frame['step'] = data['step']

# One figure per event type, wakeup first.
for event_name in ('wakeup', 'onset'):
    column = f'{event_name}_val'
    traces = [
        Scatter(x=res_df['step'], y=res_df[column], name='pred'),
        Scatter(x=act_df['step'], y=act_df[column], name='act'),
    ]
    iplot({'data': traces, 'layout': Layout(title=event_name)})