In [1]:
import os
import random
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from alibi_detect.cd import MMDDrift
import random

sys.path.append("../..")

from utils.utils import *
from drift_detector.rolling_window import *
from baseline_models.temporal.pytorch.optimizer import Optimizer
from baseline_models.temporal.pytorch.utils import *

2022-08-10 09:12:04,486 [1;37mINFO[0m cyclops.orm     - Database setup, ready to run queries!


In [2]:
PATH = "/mnt/nfs/project/delirium/drift_exp/JULY-04-2022"
threshold=0.05
num_timesteps = 6
run=1
shift="covid"
hospital = ["SBK", "UHNTG", "THPC", "THPM", "UHNTW", "SMH","MSH","PMH"]
outcome="mortality"
aggregation_type="time"
scale=True

In [3]:
admin_data, x, y = get_gemini_data(PATH)

numerical_cols = get_numerical_cols(PATH)
for col in numerical_cols:
    scaler = StandardScaler().fit(x[col].values.reshape(-1, 1))
    x[col] = pd.Series(
        np.squeeze(scaler.transform(x[col].values.reshape(-1, 1))),
        index=x[col].index,
    )
X = reshape_inputs(x, num_timesteps)

2022-08-10 09:12:04,539 [1;37mINFO[0m cyclops.utils.file - Loading dataframe to /mnt/nfs/project/delirium/drift_exp/JULY-04-2022/aggregated_events.parquet


Load data from aggregated events...


2022-08-10 09:12:04,970 [1;37mINFO[0m cyclops.utils.file - Loading dataframe to /mnt/nfs/project/delirium/drift_exp/JULY-04-2022/aggmeta_start_ts.parquet
2022-08-10 09:12:05,139 [1;37mINFO[0m cyclops.feature_handler - Loading features from file...
2022-08-10 09:12:05,143 [1;37mINFO[0m cyclops.feature_handler - Found file to load for static features...
2022-08-10 09:12:05,145 [1;37mINFO[0m cyclops.feature_handler - Successfully loaded static features from file...
2022-08-10 09:12:05,179 [1;37mINFO[0m cyclops.feature_handler - Found file to load for temporal features...


Load data from feature handler...


2022-08-10 09:12:10,451 [1;37mINFO[0m cyclops.feature_handler - Successfully loaded temporal features from file...


Load data from admin data...


2022-08-10 09:12:18,971 [1;37mINFO[0m cyclops.utils.file - Loading dataframe to /mnt/nfs/project/delirium/drift_exp/JULY-04-2022/aggmeta_end_ts.parquet
2022-08-10 09:12:39,692 [1;37mINFO[0m cyclops.feature_handler - Loading features from file...
2022-08-10 09:12:39,694 [1;37mINFO[0m cyclops.feature_handler - Found file to load for static features...
2022-08-10 09:12:39,695 [1;37mINFO[0m cyclops.feature_handler - Successfully loaded static features from file...
2022-08-10 09:12:39,727 [1;37mINFO[0m cyclops.feature_handler - Found file to load for temporal features...
2022-08-10 09:12:44,481 [1;37mINFO[0m cyclops.feature_handler - Successfully loaded temporal features from file...


In [4]:
(x_train, y_train), (x_val, y_val), (x_test, y_test), feats, admin_data = import_dataset_hospital(admin_data, x, y, shift, outcome, hospital, run, shuffle=True)

random.seed(1)

# Normalize data
(X_tr_normalized, y_tr),(X_val_normalized, y_val), (X_t_normalized, y_t) = normalize_data(aggregation_type, admin_data, num_timesteps, x_train, y_train, x_val, y_val, x_test, y_test)
# Scale data
if scale:
    X_tr_normalized, X_val_normalized, X_t_normalized = scale_data(numerical_cols, X_tr_normalized, X_val_normalized, X_t_normalized)
# Process data
X_tr_final, X_val_final, X_t_final = process_data(aggregation_type, num_timesteps, X_tr_normalized, X_val_normalized, X_t_normalized)

## Create Data Streams

In [5]:
start_date = date(2019, 1, 1)
end_date = date(2020, 8, 1)

val_ids=list(X_val_normalized.index.get_level_values(0).unique())

x_test_stream, y_test_stream, measure_dates_test = get_streams(x, y, admin_data, start_date, end_date, stride=1, window=1, ids_to_exclude=val_ids)

2019-01-01 - 2019-01-02
2020-01-01 - 2020-01-02


## Cumulating Rolling Window 

In [16]:
random.seed(1)

# rolling window parameters
threshold = 0.05
num_timesteps = 6
stat_window=30
lookup_window=0
stride=1
# model parameters
output_dim = 1
batch_size = 64
input_dim = 108
timesteps = 6
hidden_dim = 64
layer_dim = 2
dropout = 0.2
n_epochs = 1
learning_rate = 2e-3
weight_decay = 1e-6
last_timestep_only = False
device = get_device()
#drift detector parameters
dr_technique="BBSDs_trained_LSTM"
model_path=os.path.join(os.getcwd(),"../../saved_models/"+shift+"_lstm.pt")
md_test="MMD"
sign_level=0.05
sample=1000
dataset="gemini"
context_type="lstm"
representation="rf"

shift_reductor = ShiftReductor(
    X_tr_final, y_tr, dr_technique, dataset, var_ret=0.8, model_path=model_path,
)
# Get shift detector
shift_detector = ShiftDetector(
    dr_technique, md_test, sign_level, shift_reductor, sample, dataset, feats, model_path, context_type, representation,
)

model_params = {
    "device": device,
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "layer_dim": layer_dim,
    "output_dim": output_dim,
    "dropout_prob": dropout,
    "last_timestep_only": last_timestep_only,
}

model = get_temporal_model("lstm", model_params).to(device)
model_path = os.path.join(os.getcwd(),'../../saved_models/',shift+"_lstm.pt")
model.load_state_dict(torch.load(model_path))

loss_fn = nn.BCEWithLogitsLoss(reduction="none")
optimizer = optim.Adagrad(
    model.parameters(), lr=learning_rate, weight_decay=weight_decay
)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=128, gamma=0.5)
activation = nn.Sigmoid()
opt = Optimizer(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    activation=activation,
    lr_scheduler=lr_scheduler,
)
def cumulative_rolling_window(X_train, X_stream, y_stream, shift_detector, sample, stat_window, lookup_window, stride, num_timesteps, threshold, model_name, model, opt=None, X_ref=None, y_ref=None, retrain=True):

    p_vals = np.asarray([])
    dist_vals = np.asarray([])
    run_length = int(stat_window) 
    
    i = stat_window
    p_val=1
    total_alarms = 0 
    
    if X_ref is not None:
        X_prev = X_ref
        # create val loader
        
    while i+stat_window+lookup_window <= len(X_stream):
        feat_index = 0
        
        if p_val < threshold:
            
            if retrain:
                X_update = pd.concat(X_stream[max(int(i)-run_length,0):int(i)])
                X_update = X_update[~X_update.index.duplicated(keep='first')]
                ind = X_update.index.get_level_values(0).unique()
                X_update = reshape_inputs(X_update, num_timesteps)

                ## Get updated source data for two-sample test (including data for retraining) 
                X_prev = np.concatenate((X_prev, X_update), axis=0)
                tups = [tuple(row) for row in X_prev]
                X_prev = np.unique(tups, axis=0)
                np.random.shuffle(X_prev)                 

                y_update = pd.concat(y_stream[max(int(i)-run_length,0):int(i)])
                y_update.index = ind
                y_update = y_update[~y_update.index.duplicated(keep='first')].to_numpy()

                print("Retrain ",model_name," on: ",max(int(i)-run_length,0),"-",int(i))

                if model_name == "rnn":
                    ## create train loader 
                    update_dataset = get_data(X_update, y_update)
                    update_loader = torch.utils.data.DataLoader(update_dataset, batch_size=1, shuffle=False)

                    retrain_model_path='adaptive_window_retrain.model'

                    ## train 
                    opt.train(
                         update_loader,
                         update_loader,
                         batch_size=batch_size,
                         n_epochs=n_epochs,
                         n_features=input_dim,
                         timesteps=timesteps,
                         model_path=retrain_model_path,
                    )

                    model.load_state_dict(torch.load(retrain_model_path))
                    opt.model = model
                    shift_detector.model_path = retrain_model_path

                elif model_name == "gbt":
                    model = model.fit(X_retrain, y_retrain, xgb_model=model.get_booster())

                else:
                    print("Invalid Model Name")

        if X_ref is None:
            X_prev = pd.concat(X_stream[max(int(i)-run_length,0):int(i)+stat_window])
            X_prev = X_prev[~X_prev.index.duplicated(keep='first')]
            X_prev = reshape_inputs(X_prev, num_timesteps)
            #prev = prev.reshape(prev.shape[0]*prev.shape[1],prev.shape[2])
            
        X_next = pd.concat(X_stream[max(int(i)+lookup_window,0):int(i)+stat_window+lookup_window])
        X_next = X_next[~X_next.index.duplicated(keep='first')]
        X_next = reshape_inputs(X_next, num_timesteps)
        
        if X_next.shape[0]<=2 or X_prev.shape[0]<=2:
            break
            
        ## run distribution shift check here
        (p_val, dist, val_acc, te_acc) = shift_detector.detect_data_shift(X_train, 
                                                                          X_prev[:1000,:], 
                                                                          X_next[:sample,:]
        )

        #print("Ref -->",max(int(i)+lookup_window,0),"-",int(i)+stat_window+lookup_window,"\tP-Value: ",p_val)

        dist_vals = np.concatenate((dist_vals, np.repeat(dist, 1)))
        p_vals = np.concatenate((p_vals, np.repeat(p_val, 1)))
        i += stride
        run_length += stat_window
        
        if p_val < threshold:
            total_alarms += 1
    
    return dist_vals, p_vals, total_alarms

dist_test, pvals_test, total_alarms = cumulative_rolling_window(X_tr_final, x_test_stream, y_test_stream, shift_detector, sample, stat_window, lookup_window, stride, num_timesteps, threshold, model_name="rnn", model=model,opt=opt, X_ref=X_val_final, retrain=True)

Retrain  rnn  on:  0 - 40
[1/1] Training loss: 0.7346	 Validation loss: 0.6383
Retrain  rnn  on:  0 - 52
[1/1] Training loss: 0.6654	 Validation loss: 0.6330
Retrain  rnn  on:  0 - 55
[1/1] Training loss: 0.6547	 Validation loss: 0.6305
Retrain  rnn  on:  0 - 59
[1/1] Training loss: 0.6531	 Validation loss: 0.6381
Retrain  rnn  on:  0 - 60
[1/1] Training loss: 0.6480	 Validation loss: 0.6362
Retrain  rnn  on:  0 - 86
[1/1] Training loss: 0.6560	 Validation loss: 0.6443
Retrain  rnn  on:  0 - 87
[1/1] Training loss: 0.6535	 Validation loss: 0.6436
Retrain  rnn  on:  0 - 105
[1/1] Training loss: 0.6486	 Validation loss: 0.6427
Retrain  rnn  on:  0 - 242
[1/1] Training loss: 0.6467	 Validation loss: 0.6393
Retrain  rnn  on:  0 - 244
[1/1] Training loss: 0.6387	 Validation loss: 0.6348
Retrain  rnn  on:  0 - 245
[1/1] Training loss: 0.6373	 Validation loss: 0.6333
Retrain  rnn  on:  0 - 297
[1/1] Training loss: 0.6346	 Validation loss: 0.6300
Retrain  rnn  on:  0 - 300
[1/1] Training loss:

In [22]:
random.seed(1)

# rolling window parameters
threshold = 0.05
num_timesteps = 6
stat_window=30
lookup_window=0
stride=1
# model parameters
output_dim = 1
batch_size = 64
input_dim = 108
timesteps = 6
hidden_dim = 64
layer_dim = 2
dropout = 0.2
n_epochs = 1
learning_rate = 2e-3
weight_decay = 1e-6
last_timestep_only = False
device = get_device()
#drift detector parameters
dr_technique="BBSDs_trained_LSTM"
model_path=os.path.join(os.getcwd(),"../../saved_models/"+shift+"_lstm.pt")
md_test="MMD"
sign_level=0.05
sample=1000
dataset="gemini"
context_type="lstm"
representation="rf"

shift_reductor = ShiftReductor(
    X_tr_final, y_tr, dr_technique, dataset, var_ret=0.8, model_path=model_path,
)
# Get shift detector
shift_detector = ShiftDetector(
    dr_technique, md_test, sign_level, shift_reductor, sample, dataset, feats, model_path, context_type, representation,
)

model_params = {
    "device": device,
    "input_dim": input_dim,
    "hidden_dim": hidden_dim,
    "layer_dim": layer_dim,
    "output_dim": output_dim,
    "dropout_prob": dropout,
    "last_timestep_only": last_timestep_only,
}

model = get_temporal_model("lstm", model_params).to(device)
model_path = os.path.join(os.getcwd(),'../../saved_models/',shift+"_lstm.pt")
model.load_state_dict(torch.load(model_path))

loss_fn = nn.BCEWithLogitsLoss(reduction="none")
optimizer = optim.Adagrad(
    model.parameters(), lr=learning_rate, weight_decay=weight_decay
)
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=128, gamma=0.5)
activation = nn.Sigmoid()
opt = Optimizer(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    activation=activation,
    lr_scheduler=lr_scheduler,
)

#####################################################
## dynamically adjusting drift detector - if drift is significant, reference dataset is reset to current time 
#####################################################
def dynamic_rolling_window(X_train, X_stream, y_stream, shift_detector, sample, stat_window, lookup_window, stride, num_timesteps, threshold, model_name, model, opt=None, X_ref=None, retrain=True):

    p_vals = np.asarray([])
    dist_vals = np.asarray([])
    run_length = int(stat_window) 
    
    i = stat_window
    p_val=1
    total_alarms = 0 
    
    if X_ref is not None:
        X_prev = X_ref
        # create val loader
        
    while i+stat_window+lookup_window <= len(X_stream):
        feat_index = 0
        
        if p_val < threshold:
            
            if retrain:
                X_update = pd.concat(X_stream[max(int(i)-run_length,0):int(i)])
                X_update = X_update[~X_update.index.duplicated(keep='first')]
                ind = X_update.index.get_level_values(0).unique()
                X_update = reshape_inputs(X_update, num_timesteps)

                ## Get updated source data for two-sample test (including data for retraining) 
                X_prev = np.concatenate((X_prev, X_update), axis=0)
                tups = [tuple(row) for row in X_prev]
                X_prev = np.unique(tups, axis=0)
                np.random.shuffle(X_prev)                 

                y_update = pd.concat(y_stream[max(int(i)-run_length,0):int(i)])
                y_update.index = ind
                y_update = y_update[~y_update.index.duplicated(keep='first')].to_numpy()

                print("Retrain ",model_name," on: ",max(int(i)-run_length,0),"-",int(i))

                if model_name == "rnn":
                    ## create train loader 
                    update_dataset = get_data(X_update, y_update)
                    update_loader = torch.utils.data.DataLoader(update_dataset, batch_size=1, shuffle=False)

                    retrain_model_path='adaptive_window_retrain.model'

                    ## train 
                    opt.train(
                         update_loader,
                         update_loader,
                         batch_size=batch_size,
                         n_epochs=n_epochs,
                         n_features=input_dim,
                         timesteps=timesteps,
                         model_path=retrain_model_path,
                    )

                    model.load_state_dict(torch.load(retrain_model_path))
                    opt.model = model
                    shift_detector.model_path = retrain_model_path

                elif model_name == "gbt":
                    model = model.fit(X_retrain, y_retrain, xgb_model=model.get_booster())

                else:
                    print("Invalid Model Name")

            i += stride

        if X_ref is None:
            X_prev = pd.concat(X_stream[max(int(i)-run_length,0):int(i)+stat_window])
            X_prev = X_prev[~X_prev.index.duplicated(keep='first')]
            X_prev = reshape_inputs(X_prev, num_timesteps)
            #prev = prev.reshape(prev.shape[0]*prev.shape[1],prev.shape[2])
            
        X_next = pd.concat(X_stream[max(int(i)+lookup_window,0):int(i)+stat_window+lookup_window])
        X_next = X_next[~X_next.index.duplicated(keep='first')]
        X_next = reshape_inputs(X_next, num_timesteps)
        
        if X_next.shape[0]<=2 or X_prev.shape[0]<=2:
            break
            
        ## run distribution shift check here
        (p_val, dist, val_acc, te_acc) = shift_detector.detect_data_shift(X_train, 
                                                                          X_prev[:1000,:], 
                                                                          X_next[:sample,:]
        )

    #    print(max(int(i)-run_length,0),"-", int(i),"-->",max(int(i)+lookup_window,0),"-",int(i)+stat_window+lookup_window,"\tP-Value: ",p_val)
        
        dist_vals = np.concatenate((dist_vals, np.repeat(dist, 1)))
        p_vals = np.concatenate((p_vals, np.repeat(p_val, 1)))

        if p_val >= threshold:
            run_length += stride
            i += stride
        else:
            run_length= stat_window
            total_alarms += 1
    
    return dist_vals, p_vals, total_alarms

dist_test, pvals_test, total_alarms = dynamic_rolling_window(X_tr_final, x_test_stream, y_test_stream, shift_detector, sample, stat_window, lookup_window, stride, num_timesteps, threshold, model_name="rnn", model=model,opt=opt, X_ref=X_val_final, retrain=True)

Retrain  rnn  on:  9 - 39
[1/1] Training loss: 0.8294	 Validation loss: 0.6653
Retrain  rnn  on:  23 - 53
[1/1] Training loss: 0.6793	 Validation loss: 0.6444
Retrain  rnn  on:  24 - 54
[1/1] Training loss: 0.6573	 Validation loss: 0.6341
Retrain  rnn  on:  25 - 55
[1/1] Training loss: 0.6593	 Validation loss: 0.6403
Retrain  rnn  on:  26 - 56
[1/1] Training loss: 0.6516	 Validation loss: 0.6319
Retrain  rnn  on:  29 - 59
[1/1] Training loss: 0.6313	 Validation loss: 0.6165
Retrain  rnn  on:  44 - 74
[1/1] Training loss: 0.6506	 Validation loss: 0.6374
Retrain  rnn  on:  54 - 84
[1/1] Training loss: 0.6827	 Validation loss: 0.6757
Retrain  rnn  on:  55 - 85
[1/1] Training loss: 0.6806	 Validation loss: 0.6621
Retrain  rnn  on:  56 - 86
[1/1] Training loss: 0.6815	 Validation loss: 0.6657
Retrain  rnn  on:  78 - 108
[1/1] Training loss: 0.6748	 Validation loss: 0.6619
Retrain  rnn  on:  209 - 239
[1/1] Training loss: 0.6639	 Validation loss: 0.6552
Retrain  rnn  on:  211 - 241
[1/1] Tra

In [20]:
total_alarms

61

In [21]:
import numpy as np, scipy.stats as st
mean = np.mean(pvals_test[pvals_test<0.05])
ci = st.t.interval(0.95, len(pvals_test[pvals_test<0.05])-1, loc=np.mean(pvals_test[pvals_test<0.05]), scale=st.sem(pvals_test[pvals_test<0.05]))
print(mean, ci)

0.010327868621613159 (0.006767784025553591, 0.013887953217672727)


In [23]:
total_alarms

79

In [24]:
import numpy as np, scipy.stats as st
mean = np.mean(pvals_test[pvals_test<0.05])
ci = st.t.interval(0.95, len(pvals_test[pvals_test<0.05])-1, loc=np.mean(pvals_test[pvals_test<0.05]), scale=st.sem(pvals_test[pvals_test<0.05]))
print(mean, ci)

0.015063290802549712 (0.011817926154194571, 0.018308655450904855)


In [17]:
total_alarms

64

In [18]:
import numpy as np, scipy.stats as st
mean = np.mean(pvals_test[pvals_test<0.05])
ci = st.t.interval(0.95, len(pvals_test[pvals_test<0.05])-1, loc=np.mean(pvals_test[pvals_test<0.05]), scale=st.sem(pvals_test[pvals_test<0.05]))
print(mean, ci)

0.014687499671708792 (0.011100127117103094, 0.018274872226314492)
