# Simple Case: PPP and NNN cases only
Author: Lin Lee Cheong <br>
Date: 12/21/2020 <br> <br>

PPP event is defined as one adverse token and one adverse helper token. 
- adverse_tokens:
    - AMI
    - PH
    - ARR
    - CHF

- adverse_helper_tokens:
    - apnea
    - furosemide
    - pneumonia
    - high_creatinine
    - tachycardia
    - resistent_hyp
    
Findings:
1. The main source of variation between SHAP values originates from model overtraining. Models with a large feature space end up with strange feature attributions. The sweet spot seems to be around 1:10 param:obs, a slower learning rate, and aggressive stopping. <If the feature attributions don't make sense, it means you've overtrained>.
2. For SHAP: use negative (non-adverse-event) cases as the background that backfills missing features when calculating the effect of a given feature. About 500 background samples is sufficient. 

TODO:
1. First event still has plenty of high attributes. Need to look into trainable init hidden states.
2. Start reducing probabilities (performance drop) and see how the 'latching on' of important features changes
3. Look into how to do feature comparisons
4. Define attention models and see how they help in such cases.
5. Retry this on the original toy dataset
6. Multiprocessing explainer otherwise it will take forever!!

In [None]:
#! pip install pathos

In [None]:
#import pathos.multiprocessing as multiprocessing

In [1]:
import os
import time
import torch
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import deep_id_pytorch_debug2 as deep_id_pytorch_debug

from lstm_models import *
from lstm_utils import *

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to them are picked up without a restart.
%load_ext autoreload

%autoreload 2

## OPTIONS

In [3]:
# --- Data loading -----------------------------------------------------------
# Upper bound on rows read per CSV; large enough to mean "read everything".
# Kept as an int because it is a row count.
nrows = int(1e9)
min_freq = 1  # forwarded to build_lstm_dataset as min_freq

seq_len = 30  # sequence length; forwarded to build_lstm_dataset as max_len

train_data_path = "../../data/toy_dataset/data/30_ppp_nnn_only/train_30000.csv"
valid_data_path = "../../data/toy_dataset/data/30_ppp_nnn_only/val_3000.csv"
# Test split reported as "PPP only" further down the notebook.
test_data_path = "../../data/toy_dataset/data/30/test_pos3.csv"

# Test split reported as "All" further down the notebook.
test_data_path2 = "../../data/toy_dataset/data/30/test.csv"

# --- Output locations -------------------------------------------------------
model_save_path = './models/lstm/lstm_seq_ppp_nnn_only_{}'.format(seq_len)
results_save_path = "./model_results/lstm/lstm_seq_ppp_nnn_only_{}".format(seq_len)
batch_size = 64

# --- Training ---------------------------------------------------------------
n_epochs = 1
stop_num = 2  # early stopping: epochs without validation improvement before stopping

# --- Model ------------------------------------------------------------------
embedding_dim = 5
hidden_dim = 8
# NOTE(review): not passed to SimpleLSTM in the cells visible in this notebook.
bidirectional = True
dropout = 0.4
n_layers = 3

# --- Dataset columns --------------------------------------------------------
target_colname = 'label'
uid_colname = 'patient_id'
# Column names counting down from seq_len - 1 to 0; derived from seq_len so the
# two settings cannot drift apart.
x_inputs = [str(x) for x in range(seq_len - 1, -1, -1)]
target_value = '1'

rev = False  # forwarded to build_lstm_dataset as rev

In [22]:
# Make sure the parent directories of the checkpoint and results paths exist.
for fp in [model_save_path, results_save_path]:
    parent_dir = os.path.split(fp)[0]
    # An empty parent means "current directory": nothing to create.
    if parent_dir and not os.path.isdir(parent_dir):
        # Report the directory that is actually created, not the file path.
        print(f'New directory created: {parent_dir}')
        # exist_ok guards against the directory appearing between check and call.
        os.makedirs(parent_dir, exist_ok=True)

print(f"Cuda available: {torch.cuda.is_available()}")
# Keep using the second GPU when there is one; fall back to the first GPU on a
# single-GPU machine (where 'cuda:1' does not exist), and to the CPU otherwise.
if torch.cuda.is_available():
    model_device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    model_device = torch.device('cpu')
#model_device = torch.device('cpu')

Cuda available: True


## Create Vocab and Build Dataset

In [23]:
# Keyword arguments shared by all four splits. The column names come from the
# OPTIONS cell (uid_colname / target_colname) rather than repeated literals.
dataset_kwargs = dict(
    min_freq=min_freq,
    uid_colname=uid_colname,
    target_colname=target_colname,
    max_len=seq_len,
    target_value=target_value,
    nrows=nrows,
    rev=rev,
)

# The training call passes vocab=None; the vocab it returns is reused for the
# validation and both test splits.
train_dataset, vocab = build_lstm_dataset(train_data_path, vocab=None, **dataset_kwargs)
valid_dataset, _ = build_lstm_dataset(valid_data_path, vocab=vocab, **dataset_kwargs)

test_dataset, _ = build_lstm_dataset(test_data_path, vocab=vocab, **dataset_kwargs)

test_dataset2, _ = build_lstm_dataset(test_data_path2, vocab=vocab, **dataset_kwargs)

Building dataset from ../../data/toy_dataset/data/30_ppp_nnn_only/train_30000.csv..
Success!
Building dataset from ../../data/toy_dataset/data/30_ppp_nnn_only/val_3000.csv..
Success!
Building dataset from ../../data/toy_dataset/data/30/test_pos3.csv..
Success!
Building dataset from ../../data/toy_dataset/data/30/test.csv..
Success!


In [24]:
# Settings common to every loader.
loader_kwargs = {"batch_size": batch_size, "num_workers": 2}

# Only the training loader shuffles; evaluation loaders keep dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)

valid_dataloader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)

test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

test_dataloader2 = DataLoader(test_dataset2, shuffle=False, **loader_kwargs)

In [25]:
# Number of examples in the training dataset.
len(train_dataset)

30000

In [26]:
from collections import Counter

def count_tokens(dataloader, del_pad=True):
    """Count how often each token id occurs across all batches.

    Args:
        dataloader: Iterable yielding 3-item batches. The third item must
            support ``.flatten().tolist()`` (e.g. a tensor of token ids).
        del_pad: If True, drop the entry for id 0 from the result
            (presumably the padding id -- confirm against the vocab).

    Returns:
        dict mapping token id -> number of occurrences. Empty when the
        dataloader yields no batches.
    """
    # Start from an empty Counter so an empty dataloader yields {} instead of
    # failing on dict(None).
    counter = Counter()
    for _, _, idxes in dataloader:
        counter.update(idxes.flatten().tolist())
    counts = dict(counter)
    if del_pad:
        # pop() with a default: id 0 may legitimately never occur.
        counts.pop(0, None)
    return counts


def plot_data(counter, title):
    """Show a green bar chart of frequencies keyed by token id.

    Args:
        counter: Mapping of token id -> frequency.
        title: Title placed above the chart.
    """
    token_ids = counter.keys()
    freqs = counter.values()
    plt.bar(token_ids, freqs, width=1.0, color='g')
    plt.xlabel('Token IDs')
    plt.ylabel('Freqs')
    plt.title(title)
    plt.show()

In [27]:
# train_counter = count_tokens(train_dataloader, del_pad=True)
# plot_data(train_counter, 'Train')

In [28]:
# val_counter = count_tokens(valid_dataloader, del_pad=True)
# plot_data(val_counter, 'Val')

In [29]:
# test_counter2 = count_tokens(test_dataloader2, del_pad=True)
# plot_data(test_counter2, 'Test All')

In [30]:
# test_counter = count_tokens(test_dataloader, del_pad=True)
# plot_data(test_counter, 'Test PPP Only')

## Model Training

In [36]:
# Index of the currently selected CUDA device.
# NOTE: the recorded output is 0, whereas model_device was set to 'cuda:1'.
torch.cuda.current_device()

0

In [37]:
# NOTE(review): CUDA reads CUDA_VISIBLE_DEVICES when it is initialised. The
# earlier torch.cuda.current_device() call has presumably already initialised
# it in this process, so this assignment likely only affects child processes
# (e.g. the shell command in the next cell) -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3,4,5"

In [38]:
# Echo the variable from a shell to check the value set in the previous cell.
! echo ${CUDA_VISIBLE_DEVICES}

0,1,2,3,4,5


In [39]:
# Build the LSTM classifier from the hyper-parameters in the OPTIONS cell.
# NOTE: the next cell constructs the model again, so this instance is replaced.
model = SimpleLSTM(embedding_dim, hidden_dim, vocab, model_device, nlayers=n_layers, dropout=dropout)

In [41]:
# Build the LSTM classifier from the hyper-parameters in the OPTIONS cell.
model = SimpleLSTM(embedding_dim, hidden_dim, vocab, model_device, nlayers=n_layers, dropout=dropout)
# Place the parameters on the same device the model was configured with.
# A bare .cuda() would use the *current* CUDA device (which can differ from
# model_device) and would fail outright on a CPU-only machine.
model = model.to(model_device)

In [42]:
# Device attribute stored on the model object.
# NOTE(review): this may not reflect where the parameters actually live after
# the .cuda() call in the previous cell -- confirm.
model.device

device(type='cuda', index=1)

In [43]:
# Total number of elements across all model parameters.
sum(p.numel() for p in model.parameters())

4465

In [44]:
# Binary cross-entropy computed on raw logits.
loss_function = nn.BCEWithLogitsLoss()

# Adam over all model parameters; the learning rate is multiplied by 0.9
# each time scheduler.step() is called.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

In [45]:
# Train for up to n_epochs, checkpointing whenever the validation loss improves
# and stopping early after stop_num consecutive epochs without improvement.
best_valid_loss = float("inf")
valid_worse_loss = 0  # consecutive epochs without validation improvement

for epoch in range(n_epochs):

    start_time = time.time()

    train_loss, train_auc = epoch_train_lstm(
        model, train_dataloader, optimizer, loss_function
    )

    valid_loss, valid_auc = epoch_val_lstm(
        model, valid_dataloader, loss_function
    )
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    # Report metrics before the early-stopping check so the last epoch is
    # logged even when the loop breaks.
    print(
        f"Train Loss: {train_loss:.3f} | Train AUC: {train_auc:.2f} \t Val. Loss: {valid_loss:.3f} |  Val. AUC: {valid_auc:.4f}"
    )

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_save_path)
        # 1-based, to match the "Epoch:" header printed above.
        print("Saved Model, epoch {}".format(epoch + 1))
        valid_worse_loss = 0

    else:
        valid_worse_loss += 1
        # >= so the loop still stops if the counter ever passes stop_num.
        if valid_worse_loss >= stop_num:
            print("EARLY STOP ------")
            break

    scheduler.step()

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

Load best model for test

In [46]:
# Restore the best checkpoint written by the training loop. map_location puts
# the saved tensors on model_device even if they were saved from another device.
# NOTE: a checkpoint written by a different architecture at the same path fails
# here with unexpected-key / size-mismatch errors.
model.load_state_dict(torch.load(model_save_path, map_location=model_device))

RuntimeError: Error(s) in loading state_dict for SimpleLSTM:
	Unexpected key(s) in state_dict: "attn_layer.weight", "attn_layer.bias", "context_layer.weight", "context_layer.bias". 
	size mismatch for pred_layer.weight: copying a param with shape torch.Size([1, 32]) from checkpoint, the shape in current model is torch.Size([1, 16]).

In [None]:
# Evaluate on the validation set, also collecting labels and prediction scores.
val_results = epoch_val_lstm(
    model, valid_dataloader, loss_function, return_preds=True
)
val_loss, val_auc, val_labs, val_pred_scores = val_results
print(f"Val Loss: {val_loss:.3f} | Val AUC: {val_auc:.2f}")

In [None]:
# Loss and AUC on the full test set.
test_metrics2 = epoch_val_lstm(model, test_dataloader2, loss_function)
test_loss2, test_auc2 = test_metrics2
print(f"Test Loss (All): {test_loss2:.3f} | Test AUC (All): {test_auc2:.2f}")

In [None]:
# Loss and AUC on the PPP-only test set.
test_metrics = epoch_val_lstm(model, test_dataloader, loss_function)
test_loss, test_auc = test_metrics
print(f"Test Loss (PPP only): {test_loss:.3f} | Test AUC (PPP only): {test_auc:.2f}")

In [None]:
# Fraction of validation prediction scores above 0.5.
# NOTE(review): the loss is BCEWithLogitsLoss; if val_pred_scores are raw logits
# rather than probabilities, 0.5 is not the decision boundary -- confirm what
# epoch_val_lstm returns.
np.mean([1 if x > 0.5 else 0 for x in val_pred_scores])

In [None]:
# Histogram (100 bins, no KDE) of the validation prediction scores.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; sns.histplot is
# the replacement once the installed seaborn version is confirmed.
plt.figure(figsize=(20, 10))
sns.distplot(val_pred_scores, kde=False, bins=100)

In [None]:
# Evaluate the PPP-only test set again, this time keeping labels and scores.
# Overwrites test_loss / test_auc from the earlier cell.
test_loss, test_auc, labs, pred_scores = epoch_val_lstm(
    model, test_dataloader, loss_function, return_preds=True
)

In [None]:
# Fraction of PPP-only test prediction scores above 0.5.
# NOTE(review): if pred_scores are raw logits rather than probabilities, 0.5 is
# not the decision boundary -- confirm what epoch_val_lstm returns.
np.mean([1 if x > 0.5 else 0 for x in pred_scores])

In [None]:
# Histogram (50 bins, no KDE) of the PPP-only test prediction scores.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; sns.histplot is
# the replacement once the installed seaborn version is confirmed.
plt.figure(figsize=(20, 10))
sns.distplot(pred_scores, kde=False, bins=50)

## Get SHAP values

Get the negative cases as background

Select non-AE cases only as background

In [None]:
# Collect up to n_background validation samples whose label equals 0; they
# become the SHAP background set.
n_background = 500
sel_set = []
for batch in valid_dataloader:
    batch_uids, batch_labels, batch_idxes = batch[0], batch[1], batch[2]
    sel_set.extend(
        (uid, lab, idxes)
        for uid, lab, idxes in zip(batch_uids, batch_labels, batch_idxes)
        if lab == 0
    )

    # Stop as soon as enough samples are collected; >= avoids loading one
    # extra batch when exactly n_background have been gathered.
    if len(sel_set) >= n_background:
        break

# The last batch may overshoot, so trim to the requested size.
sel_set = sel_set[:n_background]

# Split the (uid, label, idxes) triples into parallel lists.
background_ids = [uid for uid, _, _ in sel_set]
background_labels = [lab for _, lab, _ in sel_set]
background_idxes = [idxes for _, _, idxes in sel_set]

In [None]:
# Stack the per-sample index tensors into a single tensor and show its shape.
background_idxes = torch.stack(background_idxes)
background_idxes.shape

In [None]:
# Turn the background token indices into the data/mask pair that is handed to
# the explainer below.
bg_data, bg_masks = model.get_all_ids_masks(background_idxes, seq_len)

In [None]:
# Shape of the background data.
bg_data.shape

In [None]:
# Build the custom explainer from the model plus the background data and masks
# (the label-0 validation samples selected above).
explainer = deep_id_pytorch_debug.CustomPyTorchDeepIDExplainer(
    model, bg_data, bg_masks, gpu_memory_efficient=True
)

In [None]:
#model.train() # in case that shap complains that autograd cannot be called
# Placeholders: lstm_values / features are rebuilt once the SHAP values have
# been computed, and start is overwritten by the timing cells.
lstm_values = []
features = []
start = 0

Get AE events to test

In [None]:
# Collect up to n_test samples whose label equals 1 from the PPP-only test
# loader; these are the cases that get explained.
n_test = 100
sel_set = []
for batch in test_dataloader:
    batch_uids, batch_labels, batch_idxes = batch[0], batch[1], batch[2]
    sel_set.extend(
        (uid, lab, idxes)
        for uid, lab, idxes in zip(batch_uids, batch_labels, batch_idxes)
        if lab == 1
    )

    # Stop as soon as enough samples are collected; >= avoids loading one
    # extra batch when exactly n_test have been gathered.
    if len(sel_set) >= n_test:
        break

# The last batch may overshoot, so trim to the requested size.
sel_set = sel_set[:n_test]

# Split the (uid, label, idxes) triples into parallel lists.
test_ids = [uid for uid, _, _ in sel_set]
test_labels = [lab for _, lab, _ in sel_set]
test_idxes = [idxes for _, _, idxes in sel_set]

In [None]:
# Stack the per-sample index tensors into a single tensor and show its shape.
test_idxes = torch.stack(test_idxes)
test_idxes.shape

In [None]:
# Turn the selected test token indices into the data/mask pair that is passed
# to the explainer's shap_values calls below.
test_data, test_masks = model.get_all_ids_masks(test_idxes, seq_len)

In [None]:
# Shape of the test data passed to the explainer.
test_data.shape

In [None]:
# Show the mask of the first test sample as a NumPy array.
np.array(test_masks[0])

In [None]:
# Length of the first test sample's mask.
len(test_masks[0])

In [None]:
# Shape of the first test sample's data.
test_data[0].shape

In [None]:
import time

In [None]:
# Move the model parameters to the CPU before the SHAP computation cells.
# NOTE(review): the explainer was built earlier from bg_data / bg_masks --
# confirm those tensors live on the same device as the model after this call.
model.cpu()

In [None]:
# Switch to training mode; an earlier cell notes this is needed "in case that
# shap complains that autograd cannot be called".
# NOTE(review): training mode presumably also activates the dropout configured
# for the model, which would make the SHAP values non-deterministic -- confirm.
model.train()

In [None]:
# Time the parallel SHAP computation over the selected test cases.
start = time.time()
lstm_shap_values_p = explainer.shap_values_parallel(test_data, test_masks)
elapsed_mins = (time.time() - start) / 60.0
print('Total time: {:.2f}mins'.format(elapsed_mins))

In [None]:
# Time the non-parallel SHAP computation; its result is what the following
# cells consume.
start = time.time()
lstm_shap_values = explainer.shap_values(test_data, test_masks)
elapsed_mins = (time.time() - start) / 60.0
print('Total time: {:.2f}mins'.format(elapsed_mins))

In [None]:
# Switch back to evaluation mode after the SHAP computation.
model.eval()

In [None]:
# For every explained sample keep (a) its tokens as strings and (b) the SHAP
# value read at [sample, position, token id] for the token present at each
# position.
lstm_values, features = [], []
for idx in range(test_idxes.shape[0]):
    # Convert the row once and reuse it for both lookups.
    token_ids = test_idxes[idx].numpy()
    features.append([model.vocab.itos(tok_idx) for tok_idx in token_ids])
    lstm_values.append(
        [
            lstm_shap_values[idx, seq_idx, tok_idx]
            for seq_idx, tok_idx in enumerate(token_ids)
        ]
    )

Mix background + dropout in LSTM

In [None]:
# WITH NEG AS BACKGROUND + EMBEDDING DROPOUT + 30000 observation in training + 
# reduced dim space, 3 layers, bidirectional
# raise dropout, single epoch training, drop training rate

# One bar chart per sample: SHAP value per sequence position, labelled with the
# token at that position. Plot at most the first 10 samples; the bound is
# capped so fewer than 10 explained samples does not raise IndexError.
#for idx in range(n_test):
for idx in range(min(10, len(features))):
    df = pd.DataFrame()
    df['events'] = features[idx]
    df['shap_vals'] = lstm_values[idx]

    plt.figure(figsize=(20, 10))
    ax = sns.barplot(x=df.index, y=df.shap_vals, orient='v')
    ax.set_xticklabels(df.events, rotation=90)
    plt.title(f"idx: {idx}")

Find overall contributions

In [None]:
# Per token: summed SHAP value ('val'), number of occurrences ('count') and
# their ratio ('ave') over all explained samples.
results = {}
# Iterate over the samples that actually exist rather than range(n_test), which
# raises IndexError when fewer than n_test samples were selected.
for tokens, values in zip(features, lstm_values):
    for token, value in zip(tokens, values):
        entry = results.setdefault(token, {'val': 0, 'count': 0})
        entry['val'] += value
        entry['count'] += 1

for entry in results.values():
    entry['ave'] = entry['val'] / entry['count']

# One row per token: mean and total SHAP value.
results_df = pd.DataFrame()
res = [(token, entry['ave'], entry['val']) for token, entry in results.items()]
results_df['token'] = [i[0] for i in res]
results_df['ave'] = [i[1] for i in res]
results_df['tot'] = [i[2] for i in res]
results_df.head()

In [None]:
# Tokens ranked by mean SHAP value, highest first.
results_df.sort_values('ave', ascending=False)

In [None]:
# Rank tokens by the magnitude of their mean SHAP value, largest first,
# regardless of sign.
results_df['ave_abs'] = results_df.ave.abs()
results_df.sort_values('ave_abs', ascending=False)