## Main notebook to run Attention-LSTM models: Single Fold

Author: Lin Lee Cheong <br>
Last Updated: 11/23/2020 <br>

### PyTorch LSTM/Embedding Layer:

Issue: RuntimeError: only Tensors of floating point dtype can require gradients

Resource: https://gitmemory.com/issue/slundberg/shap/595/494334554
https://github.com/slundberg/shap/issues/595

* RNN/Embedding Layer not supported?
    
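* Workaround: replace `nn.Embedding` with a one-hot encoding multiplied by the embedding matrix `V` (a plain matrix multiplication), so that the model input is a floating-point tensor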

    https://discuss.pytorch.org/t/backwards-through-embedding/30364/5

In [1]:
import os
import argparse
import time
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchtext.datasets import text_classification
from torchtext.vocab import Vocab
from attn_lstm_model import AttentionRNN
from model_utils import (
    log,
    build_lstm_dataset_v2,
    epoch_train_lstm,
    epoch_val_lstm,
    generate_batch,
    count_parameters,
    epoch_time
)

In [2]:
# Reload imported modules automatically before each cell runs, so edits to the
# local project modules imported above are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

**OPTIONS**

In [3]:
# --- General run options ----------------------------------------------------
# NOTE(review): nrows and min_freq are not referenced in the visible part of
# this notebook; confirm whether they are still needed.
nrows = 1e9
min_freq = 500
device_id = None  # None -> default CUDA device when available, otherwise CPU

# --- File locations -----------------------------------------------------------
# Every input lives under one preprocessed-data folder and every output under
# one training folder, so the shared prefixes are spelled out once.
_DATA_DIR = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/anonymize/AE/Data/Preprocessed/Anonymized'
_LSTM_DIR = '/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/final-global/re/1000/training/lstm'

train_data_x_path = f'{_DATA_DIR}/final_allvocab_x_train.npy'
train_data_y_path = f'{_DATA_DIR}/final_allvocab_y_train.npy'
val_data_x_path = f'{_DATA_DIR}/final_allvocab_x_val.npy'
val_data_y_path = f'{_DATA_DIR}/final_allvocab_y_val.npy'
test_data_x_path = f'{_DATA_DIR}/cms_test_x.npy'
test_data_y_path = f'{_DATA_DIR}/cms_test_y.npy'
vocab_path = f'{_DATA_DIR}/ae_all_vocab_last180_whole'
model_save_path = f'{_LSTM_DIR}/lstm_model_30days/gen_attn_lstm'
results_path = f'{_LSTM_DIR}/lstm_results_30days/gen_attn_lstm_results'

# --- Prediction targets (one output unit per entry) ---------------------------
target_names = [
    'd_5990', 'd_78605', 'd_486', 'd_78650', 'd_78079',
    'd_78900', 'd_78609', 'd_7862', 'd_1101', 'd_78701',
    'd_5789', 'd_78791', 'd_6826', 'd_78659', 'd_78907',
    'd_7840', 'd_28860', 'd_4660', 'd_6829', 'd_00845',
]

# --- Training hyper-parameters ------------------------------------------------
batch_size = 2048
N_EPOCHS = 20

# --- Model hyper-parameters ---------------------------------------------------
EMBEDDING_DIM = 30
HIDDEN_DIM = 30
BIDIRECTIONAL = False
DROPOUT = 0.0  # previously 0.3 -- TODO: remove dropout

In [4]:
# Choose the compute device: a CUDA device when one is available (the specific
# card given by `device_id`, or the default card when it is None), else the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda' if device_id is None else f'cuda:{device_id}')
else:
    DEVICE = torch.device('cpu')

In [5]:
# Make sure the parent folders of the checkpoint and results files exist.
# `model_save_path` / `results_path` are file paths, so only their directory
# part is created.
for fp in [model_save_path, results_path]:
    parent_dir = os.path.dirname(fp)
    # An empty parent means "current directory": nothing to create, and
    # os.makedirs('') would raise.
    if parent_dir and not os.path.isdir(parent_dir):
        # exist_ok guards against the directory appearing between the check
        # above and this call.
        os.makedirs(parent_dir, exist_ok=True)
        # Report the directory that was actually created, after creating it.
        print(f'New directory created: {parent_dir}')

**READ IN TO GENERATE DATASET**

In [6]:
# Both splits are built with the same vocabulary file and the same target list;
# only the x/y array paths differ.
_shared_args = (vocab_path, target_names)

train_dataset = build_lstm_dataset_v2(train_data_x_path, train_data_y_path, *_shared_args)
valid_dataset = build_lstm_dataset_v2(val_data_x_path, val_data_y_path, *_shared_args)

# The held-out test split is currently disabled:
# test_dataset = build_lstm_dataset_v2(test_data_x_path, test_data_y_path, *_shared_args)

# The vocabulary size is reused further down as the model's input dimension.
log('vocab length:', len(train_dataset._vocab))

    0.00: Load data and vocab
    0.49: Build data
    0.49: Build pytorch dataset
    0.49: Done
    0.50: Load data and vocab
    0.54: Build data
    0.55: Build pytorch dataset
    0.55: Done
    0.55: vocab length: 31534


In [7]:
# TODO: build LSTM dataset to use a provided vocabulary to process

In [8]:
# TODO: save the built datasets and vocab so they do not have to be rebuilt on every run
# torch.save(train_dataset, './tmp_train_dataset.pt')
# torch.save(valid_dataset, './tmp_valid_dataset.pt')

In [9]:
# Settings common to every loader: same batch size, same collate function,
# same number of worker processes.
_loader_kwargs = dict(
    batch_size=batch_size,
    collate_fn=generate_batch,
    num_workers=8,
)

# Only the training loader shuffles; validation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **_loader_kwargs)

# The test loader is disabled together with the test dataset:
# test_dataloader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

**MODEL GENERATION**

In [10]:
# NOTE(review): `torch` is already imported in the first cell; this re-import is
# redundant and could be dropped.
import torch
# Record whether CUDA is usable and which device was selected for this run.
log(torch.cuda.is_available())
log(DEVICE)

    0.55: True
    0.55: cuda


In [11]:
# Model input size = number of vocabulary entries; output size = number of labels.
# Both are passed positionally to AttentionRNN in the next cell.
INPUT_DIM = len(train_dataset._vocab) 
OUTPUT_DIM = len(train_dataset._labels)

In [12]:
# Build the attention LSTM and place it on the selected device in one step.
# The first six arguments are positional, in the order the constructor is
# called with throughout this notebook.
model = AttentionRNN(
    INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, BIDIRECTIONAL, DROPOUT,
    padding_idx=0,
    device=DEVICE,
).to(DEVICE)

# Log the architecture and the parameter count reported by count_parameters.
log(model)
log(f'Nb of params: {count_parameters(model)}')

    0.61: AttentionRNN(
  (embedding): Embedding(31534, 30, padding_idx=0)
  (rnn): LSTM(30, 30)
  (fc): Linear(in_features=30, out_features=20, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)
    0.61: Nb of params: 954080


**MODEL TRAINING**

In [13]:
# Adam with a step-decayed learning rate: the rate is multiplied by 0.9 every
# 4 calls to scheduler.step().
optimizer = optim.Adam(model.parameters(), lr=0.02)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.9)

# Alternative kept for reference:
#   optimizer = optim.SGD(model.parameters(), lr=args.lr)
#   scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 5, gamma=0.9)  # LLC-2/12: less aggressive drops

# Loss applied to raw logits, created directly on the training device.
criterion = nn.BCEWithLogitsLoss().to(DEVICE)

In [14]:
# log('Train')
# best_valid_loss = float("inf")
# valid_worse_loss = 0  # enable early stopping
# stop_num = 6

# for epoch in range(N_EPOCHS):
#     print('Training Epoch {}...'.format(epoch+1))

#     start_time = time.time()

#     train_loss, train_auc = epoch_train_lstm(
#         model, train_dataloader, optimizer, criterion
#     )

#     valid_loss, valid_auc = epoch_val_lstm(
#         model, valid_dataloader, criterion, return_preds=False
#     )

#     end_time = time.time()

#     epoch_mins, epoch_secs = epoch_time(start_time, end_time)
#     print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")

#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), model_save_path)
#         print("Saved Model, epoch {}".format(epoch))
#         valid_worse_loss = 0

#     else:
#         valid_worse_loss += 1
#         if valid_worse_loss == stop_num:
#             print("EARLY STOP ------")
#             break

#     scheduler.step()
#     log(
#         f"Train Loss: {train_loss:.3f} | Train AUC: {train_auc:.2f} \t Val. Loss: {valid_loss:.3f} |  Val. AUC: {valid_auc:.4f}"
#     )

## Get best model on val set: predictions, feature importance etc

In [15]:
# Restore the weights stored at `model_save_path`. The training loop above is
# commented out, so this checkpoint must already exist on disk.
# `map_location=DEVICE` remaps the stored tensors onto the device selected for
# this run, so a checkpoint written from a GPU session also loads when DEVICE
# has fallen back to the CPU.
model.load_state_dict(torch.load(model_save_path, map_location=DEVICE))

<All keys matched successfully>

In [16]:
# results = ( ids, predictions, labels, attn, events)
# valid_loss, valid_auc, valid_results = epoch_val_lstm(
#         model,
#         valid_dataloader,
#         criterion,
#         return_preds=True
#     )
# valid_loss, valid_auc, valid_results = epoch_val_lstm(
#         model,
#         valid_dataloader,
#         criterion,
#         return_preds=True
#     )

In [17]:
# torch.save(valid_results, results_path)

In [18]:
# #Attn_weights
# print(valid_results[3][0].shape)
# valid_results[3][0][:15]

In [19]:
# ranked = np.argsort(valid_results[3][0])
# ranked = ranked[::-1]
# ranked[:20]

### SHAP

In [20]:
### SHAP
import shap

In [21]:
# Take the first batch from the validation loader (built with shuffle=False)
# as the sample the SHAP experiments below work on.
batch = next(iter(valid_dataloader))

In [22]:
# Number of entries in batch[0] and shape of batch[1]; these two elements are
# unpacked as `ids` and `text` further down.
len(batch[0]), batch[1].shape

(2048, torch.Size([2048, 500]))

In [23]:
# First five entries of the first sequence in batch[1] (presumably vocabulary
# indices -- TODO confirm against generate_batch).
batch[1][0, :5]

tensor([ 3,  4, 12, 21, 24])

In [24]:
# Shape of batch[3], which is unpacked as `labels` further down.
batch[3].shape

torch.Size([2048, 20])

In [25]:
def repackage_hidden(h):
    """
    Return `h` cut loose from its autograd history.

    A tensor is returned detached; any other iterable is walked recursively
    and returned as a tuple of detached items. This keeps RNN+Attention
    gradients from flowing back across batch boundaries.
    """
    if not isinstance(h, torch.Tensor):
        # Container (e.g. the (h, c) pair of an LSTM): detach every member.
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [26]:
# Split the collated batch (produced by the `generate_batch` collate function)
# into its four components.
ids, text, text_lengths, labels = batch

In [34]:
# Shape of the per-batch lengths tensor (expected: one entry per sequence -- TODO confirm).
text_lengths.shape

torch.Size([2048])

In [36]:
# Put the batch tensors on the same device as the model.
# NOTE(review): the reference loop kept in a later commented-out cell leaves
# `text_lengths` where it is instead of moving it; confirm which placement the
# model's forward pass expects.
text = text.to(model.device)
text_lengths = text_lengths.to(model.device)
labels = labels.to(model.device)

# Fresh initial hidden state sized for this batch, detached from any graph.
hidden = repackage_hidden(model.init_hidden(text.shape[0]))

In [37]:
# Shape of the token tensor after the move to the model's device.
text.shape

torch.Size([2048, 500])

In [38]:
# Shapes of the two tensors returned by model.init_hidden (presumably the LSTM
# hidden and cell states -- TODO confirm in attn_lstm_model).
hidden[0].shape, hidden[1].shape

(torch.Size([1, 2048, 30]), torch.Size([1, 2048, 30]))

In [48]:
# NOTE: this cell relies on a manual patch to the installed shap package:
#   ~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/shap/explainers/_deep/deep_pytorch.py
#   (a line `data = data[0]` was added there). A fresh environment will not
#   contain that patch, so results are not reproducible from this notebook alone.
# Earlier attempt, kept for reference: data = [text, text_lengths, hidden]
# Only the token tensor is used as background data; the trailing comment on the
# assignment holds discarded alternatives.
data = text#, text_lengths]#, torch.Tensor(text.shape[0])]
explainer = shap.DeepExplainer(model, data)
# Alternative explainer that was considered: shap.GradientExplainer(model, data)

In [49]:
# Flag on the wrapped PyTorch explainer; shap_values() branches on it
# (`if not self.multi_input`), as the recorded debugger output further down shows.
explainer.explainer.multi_input

False

In [65]:
# NOTE(review): per the recorded output, this call stops at a pdb prompt inside
# the patched deep_pytorch.py and then fails with
# `NameError: name 'repackage_hidden' is not defined`. The function is defined
# in a cell of this notebook, so the error presumably comes from code outside
# the notebook namespace -- TODO confirm and remove the leftover breakpoint.
shap_values = explainer.shap_values(data)

> /home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/shap/explainers/_deep/deep_pytorch.py(150)shap_values()
-> if not self.multi_input:


(Pdb)  c


NameError: name 'repackage_hidden' is not defined

In [None]:
#         for idx, (ids, text, text_lengths, labels) in enumerate(dataloader):

#             text, text_lengths, labels = (
#                 text.to(model.device),
#                 text_lengths,
#                 labels.to(model.device),
#             )

#             hidden = model.init_hidden(text.shape[0])
#             hidden = repackage_hidden(hidden)

#             predictions, hidden, attn_weights = model(
#                 text, text_lengths, hidden, explain=True
#             )

In [None]:
# DeepExplainer takes the model plus ONE background-data argument. Unpacking the
# whole batch (`*batch`) would hand ids, tokens, lengths and labels over as
# separate positional arguments, so pass only the token tensor (batch[1]),
# moved to the model's device like the inputs prepared earlier.
explainer = shap.DeepExplainer(model, batch[1].to(model.device))

In [None]:
# NOTE(review): `e` is not defined before this point (its construction on the
# next line is commented out) and `test_images` is only assigned in the final
# cell, so this cell raises NameError on a top-to-bottom run.
#e = shap.DeepExplainer(model, background)
shap_values = e.shap_values(test_images)

In [None]:
# NOTE(review): `X` and `X_pos` are not defined in the visible part of this
# notebook, and `model` here is the PyTorch AttentionRNN built above, whereas
# TreeExplainer is shap's explainer for tree-based models -- confirm whether
# this cell belongs in this notebook at all.
# The same three statements used to appear twice in a row; the second copy only
# recomputed identical values and has been removed.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
shap_values_pos = explainer.shap_values(X_pos)


In [None]:

# NOTE(review): this cell uses `df_data_pos`, `shap_dir`, `X`, `X_pos`,
# `feature_names`, `FINAL_RESULTS_DIR`, `SPLIT` and `plt`, none of which is
# defined or imported in the visible part of this notebook -- it cannot run as-is.
#Visualizing the shap value of the first 10 predictions of the positive examples
columns = df_data_pos.columns.tolist()
patient_id_idx = columns.index('patient_id')
# One force plot per row for the first 10 rows; the loop raises IndexError if
# df_data_pos has fewer than 10 rows.
for j in range(10):
    patient_id = df_data_pos.iloc[j, patient_id_idx]
    vis_path = os.path.join(shap_dir, 'shap_{}.png'.format(patient_id))
    shap.force_plot(explainer.expected_value, shap_values_pos[j,:], X_pos.iloc[j,:], matplotlib=True, show=False)
    plt.savefig(vis_path, bbox_inches='tight')
    # Close the figure each iteration so figures do not accumulate in memory.
    plt.close("all")

# Write the SHAP values of the positive examples to CSV, with patient_id as the
# first column followed by one column per feature.
shap_path = os.path.join(FINAL_RESULTS_DIR, 'shap_{}.csv'.format(SPLIT))
df_shap = pd.DataFrame(shap_values_pos, columns=feature_names)
df_shap['patient_id'] = df_data_pos['patient_id'].tolist()
columns = ['patient_id'] + feature_names
df_shap = df_shap[columns]
df_shap.to_csv(shap_path, index=False)
# # visualize the training set predictions
# #shap.force_plot(explainer.expected_value, shap_values, X) ## Out-of-memory Error

# # create a dependence plot to show the effect of a single feature across the whole dataset
# vis_path = os.path.join(shap_dir, target+'_per_feature_shap.png')
# shap.dependence_plot(feature_names[0], shap_values, X, show=False)
# plt.savefig(vis_path, bbox_inches='tight')
# plt.close("all")

# # summarize the effects of all the features
# shap.summary_plot(shap_values, X, show=False)
# vis_path = os.path.join(shap_dir, target+'_all_features_shap.png')
# plt.savefig(vis_path, bbox_inches='tight')
# plt.close("all")

#Compute the mean absolute value of the SHAP values for each feature to get a standard bar plot
print('Computing feature importance')
shap.summary_plot(shap_values, X, plot_type="bar", show=False)
vis_path = os.path.join(FINAL_RESULTS_DIR, 'feature_importance.png')
plt.savefig(vis_path, bbox_inches='tight')
plt.close("all")

# print('Shap Values and Visualizations Successfully Saved to {}!'.format(shap_dir))
print('Done!')

In [None]:
# NOTE(review): `test_loader` is not defined in the visible part of this
# notebook (the only test loader, `test_dataloader`, is commented out and uses
# shuffle=False), so the shuffle remark below cannot be confirmed and this cell
# raises NameError as written. It also rebinds `batch` and `shap_values`,
# which earlier cells use for the LSTM data.
# since shuffle=True, this is a random sample of test data
batch = next(iter(test_loader))
images, _ = batch

# First 100 samples serve as the SHAP background set, the next 3 are explained.
background = images[:100]
test_images = images[100:103]

e = shap.DeepExplainer(model, background)
shap_values = e.shap_values(test_images)