In [1]:
!pip install pyyaml==5.4.1

Collecting pyyaml==5.4.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[?25l[K     |▌                               | 10 kB 29.0 MB/s eta 0:00:01[K     |█                               | 20 kB 8.5 MB/s eta 0:00:01[K     |█▌                              | 30 kB 7.4 MB/s eta 0:00:01[K     |██                              | 40 kB 7.0 MB/s eta 0:00:01[K     |██▋                             | 51 kB 3.3 MB/s eta 0:00:01[K     |███                             | 61 kB 3.9 MB/s eta 0:00:01[K     |███▋                            | 71 kB 4.1 MB/s eta 0:00:01[K     |████▏                           | 81 kB 4.5 MB/s eta 0:00:01[K     |████▋                           | 92 kB 5.0 MB/s eta 0:00:01[K     |█████▏                          | 102 kB 3.9 MB/s eta 0:00:01[K     |█████▋                          | 112 kB 3.9 MB/s eta 0:00:01[K     |██████▏                         | 122 kB 3.9 MB/s eta 0:00:01[K     |██████▊                         | 133 kB 3.9 M

In [None]:
!pip install darts

Collecting darts
  Downloading darts-0.19.0-py3-none-any.whl (370 kB)
[K     |████████████████████████████████| 370 kB 4.0 MB/s 
[?25hCollecting statsmodels>=0.13.0
  Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 32.8 MB/s 
Collecting pytorch-lightning>=1.5.0
  Downloading pytorch_lightning-1.6.2-py3-none-any.whl (582 kB)
[K     |████████████████████████████████| 582 kB 55.4 MB/s 
[?25hCollecting prophet>=1.0.0
  Downloading prophet-1.0.1.tar.gz (65 kB)
[K     |████████████████████████████████| 65 kB 4.4 MB/s 
[?25hCollecting nfoursid>=1.0.0
  Downloading nfoursid-1.0.0-py3-none-any.whl (16 kB)
Collecting ipython>=7.0.0
  Downloading ipython-7.33.0-py3-none-any.whl (793 kB)
[K     |████████████████████████████████| 793 kB 51.8 MB/s 
Collecting tbats>=1.1.0
  Downloading tbats-1.1.0-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
Collecting mat

In [22]:
import time
import joblib
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from statsmodels.graphics.tsaplots import plot_acf
from datetime import datetime as dt
from datetime import timedelta, timezone
from scipy.signal import lombscargle, periodogram
import darts
from darts import TimeSeries
from darts.models import KalmanFilter, RNNModel, TCNModel, TransformerModel, NBEATSModel, BlockRNNModel
from darts.utils import timeseries_generation as tg
from darts.metrics import mape
%matplotlib inline

In [17]:
# helper functions

# IDs of the patients whose per-patient HDF5 frames are loaded by the loader
# functions in this cell. The order is significant: the loaders return frames in
# this order, and the train/test split is a positional slice of that list.
patient_ids = [183, 184,  14, 220, 233,  62,  17, 186,  52, 216, 115,  37, 244,
       167, 265,  81, 113, 248,  76,  21,  79, 132, 215, 223, 110,  19,
       260, 289,  54,  64, 232, 172,  45, 203, 119,  31,  24, 156, 276,
       162, 240, 239,  32, 236,  73,  26, 205,  48, 267,   2, 256, 141,
         8,  22, 204, 163,  42, 226, 251, 198, 165,  60, 264, 241, 138,
        67,  90, 206,  38, 266, 281,  10,  41, 185,  35, 140, 283, 118,
        80, 123, 217,  39, 160, 112,  50, 292, 164, 246,  18, 128, 103,
       210,   9,  71, 169,  65, 166,  74,  96, 134, 271,  55, 228, 234,
       155, 139,  95,   7,  11,  61, 250, 168, 108, 252, 106,  68, 213,
       127,  36, 176, 273, 130, 136, 152,  53, 193, 243, 253, 148, 135,
       274, 287, 231, 105, 201,   3, 200, 245,  91,  98, 101, 137,  70,
       219, 121, 143,  23, 109, 263,  46, 229,  93, 188,  72, 190, 211,
       218,  77, 181, 288, 278,  87, 173,  47,  33, 277, 224,  43, 257,
        69, 175,  58, 124,  78, 116, 146,  49,  30,  16,  86,   5, 158,
       171, 179, 258, 284, 170, 147, 249,  97, 131, 272, 285, 174, 222,
       102, 221,  15,  89,  57,  82,  29, 214, 149, 227,  20, 280, 247,
       269, 157,  40, 293, 145, 197,  27, 187, 254, 235, 209, 129, 177,
       291, 189, 290, 275, 111]

def get_all_resampled_patient_timeseries(patient_dfs, value_column = "GlucoseValue"):
  """Build one darts TimeSeries per patient DataFrame.

  Args:
    patient_dfs: iterable of DataFrames, one per patient.
    value_column: name of the single column passed as `value_cols` to
      `TimeSeries.from_dataframe`.

  Returns:
    A list of TimeSeries in the same order as `patient_dfs`.
  """
  return [
    TimeSeries.from_dataframe(patient_frame, value_cols = [value_column])
    for patient_frame in patient_dfs
  ]

# Root folder of the per-patient HDF5 exports; each processing stage
# ("resampled", "filtered") has its own sub-folder underneath it.
PATIENT_DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/data'


def _load_patient_dfs(stage, ids=None, data_dir=PATIENT_DATA_DIR):
  """Read one DataFrame per patient from `<data_dir>/<stage>/patient_df_<id>.h5`.

  Args:
    stage: sub-folder name under `data_dir` (e.g. 'resampled' or 'filtered').
    ids: iterable of integer patient IDs; defaults to the notebook-level
      `patient_ids` list.
    data_dir: root folder containing the stage sub-folders.

  Returns:
    A list of DataFrames in the same order as `ids`.
  """
  if ids is None:
    ids = patient_ids
  # patient_ids less than ten lead with 0 in the file name, eg: 01
  return [
    pd.read_hdf(f'{data_dir}/{stage}/patient_df_{patient_id:02d}.h5')
    for patient_id in ids
  ]


def get_resampled_patient_dfs(ids=None, data_dir=PATIENT_DATA_DIR):
  """Load the 'resampled' DataFrame of every patient.

  Args:
    ids: optional iterable of patient IDs; defaults to `patient_ids`.
    data_dir: optional root data folder; defaults to `PATIENT_DATA_DIR`.

  Returns:
    A list of DataFrames, one per patient, ordered like `ids`.
  """
  return _load_patient_dfs('resampled', ids, data_dir)


def get_filtered_patient_dfs(ids=None, data_dir=PATIENT_DATA_DIR):
  """Load the 'filtered' DataFrame of every patient.

  Args:
    ids: optional iterable of patient IDs; defaults to `patient_ids`.
    data_dir: optional root data folder; defaults to `PATIENT_DATA_DIR`.

  Returns:
    A list of DataFrames, one per patient, ordered like `ids`.
  """
  return _load_patient_dfs('filtered', ids, data_dir)

In [25]:
# Resampled data: one DataFrame per patient, in patient_ids order.
resampled_patient_dfs = get_resampled_patient_dfs()

# Provisional 50/50 train/test partition, split by patient position.
train_cutoff_index = round(len(resampled_patient_dfs) / 2)

# DataFrames: patients before the cutoff train, the rest test.
resampled_patient_dfs_train, resampled_patient_dfs_test = (
    resampled_patient_dfs[:train_cutoff_index],
    resampled_patient_dfs[train_cutoff_index:],
)

# Time series built from the "GlucoseValue" column of each split.
resampled_patient_timeseries_train, resampled_patient_timeseries_test = (
    get_all_resampled_patient_timeseries(split_dfs, "GlucoseValue")
    for split_dfs in (resampled_patient_dfs_train, resampled_patient_dfs_test)
)

In [24]:
# kalman filtered data
# Load the *filtered* export. This cell previously loaded the resampled frames
# and then failed with a KeyError when asking for "GlucoseValue_kalman_filtered";
# presumably that column only exists in the filtered files (verify against the data).
filtered_patient_dfs = get_filtered_patient_dfs()

# use 50/50 for test/train for now
train_cutoff_index = round(len(filtered_patient_dfs)/2)

# dfs
filtered_patient_dfs_train = filtered_patient_dfs[:train_cutoff_index]
filtered_patient_dfs_test = filtered_patient_dfs[train_cutoff_index:]

# time-series built from the Kalman-filtered glucose column
filtered_patient_timeseries_train = get_all_resampled_patient_timeseries(filtered_patient_dfs_train, "GlucoseValue_kalman_filtered")
filtered_patient_timeseries_test = get_all_resampled_patient_timeseries(filtered_patient_dfs_test, "GlucoseValue_kalman_filtered")

KeyError: ignored

In [None]:
# NBEATS - parameter grid search
# Candidate values per hyper-parameter; the grid is their cross product
# (3 x 3 x 1 x 1 x 1 = 9 combinations).
parameters = dict(
    n_epochs=[20, 40, 60],
    input_chunk_length=[100, 200, 300],
    output_chunk_length=[1],
    num_layers=[200],
    torch_device_str=["cuda"],
)

# Search on a single series only: the last 5000 points of the training
# series at position -50.
nbeats_gridsearch_train_X = resampled_patient_timeseries_train[-50].tail(5000)

# NOTE(review): gridsearch returns a tuple rather than a bare model, per the
# darts docs; confirm its exact shape for the installed version before use.
nbeats_gridsearch = NBEATSModel.gridsearch(
    parameters=parameters,
    series=nbeats_gridsearch_train_X,
    forecast_horizon=1,
    last_points_only=False,
    metric=mape,
    n_jobs=100,
    verbose=True,
)



100%|██████████| 9/9 [00:00<00:00, 2209.73it/s]


In [None]:
# Training input: the last 10000 points of every training patient's series.
last_10000_X_train = [
    patient_series.tail(10000)
    for patient_series in resampled_patient_timeseries_train
]

# One NBEATS model fitted jointly on all of those series (GPU).
nbeats_model = NBEATSModel(
    input_chunk_length=300,
    output_chunk_length=1,
    n_epochs=10,
    torch_device_str="cuda",
)
nbeats_model.fit(last_10000_X_train)

# NOTE(review): the checkpoint name says "25K_HEAD" but the data above is the
# 10000-point tail of each series; confirm the name is still what is wanted.
nbeats_model.save_model("NBEATS_25K_HEAD.pth.tar")

In [None]:
# Series of patient 184. The name was used here without being defined anywhere
# in the visible notebook, so it is derived from the training split, which is
# ordered like patient_ids.
# NOTE(review): assumes the resampled (not Kalman-filtered) series was intended
# and that patient 184 falls in the training half; confirm.
pt184_timeseries = resampled_patient_timeseries_train[patient_ids.index(184)]

# The class imported from darts.models is TCNModel (there is no `TCN` name in scope).
tcn_model = TCNModel(input_chunk_length=2000, output_chunk_length=30, n_epochs=30, torch_device_str="cuda")

# Fit the TCN model itself (the original call trained nbeats_model by mistake)
# on the first 25000 points of the series.
tcn_model.fit(pt184_timeseries.head(25000))
pt184_timeseries

In [None]:
# problems: 
    # glucose value is capped at 400! should we impute those values? 
# todo's: 
    # add test metrics - backtest!!
    # distribution analysis fo
    # explainability? shap scores? 
    # add scaling if we end up doing multivariate prediction
    # probabilistic forecasting + distribution analysis -> log-normal? look at the distribution of the glucose values!
    # rolling mape for test data
    # models: 
    # - ARIMA + Stationarity checks
    # - TCN 
    # - outlier df's we can drop??
    # confidence intervals 
    
# ideas: 
    # predict on time of day? (device time)
    