In [None]:
#! pip install botocore==1.12.201

In [None]:
#! jupyter nbextension enable jupyter-black-master/jupyter-black

In [None]:
#! pip install shap

In [17]:
## Test Multiprocessing

In [18]:

params

[(1, 11),
 (2, 12),
 (3, 13),
 (4, 14),
 (5, 15),
 (6, 16),
 (7, 17),
 (8, 18),
 (9, 19),
 (10, 20)]

In [30]:
import os
from multiprocessing import Pool


def add(args):
    """Return the sum of a two-element ``(x, y)`` tuple.

    The pair arrives as one argument because ``Pool.map`` hands each worker
    call exactly one item from the iterable.
    """
    x, y = args
    return x + y


x_range = range(1, 11)
y_range = range(11, 21)
params = list(zip(x_range, y_range))
print(params)

# The context manager shuts the worker processes down even if map() raises.
with Pool(os.cpu_count()) as pool:
    outputs = pool.map(add, params)  # blocks until every pair has been summed

[(1, 11), (2, 12), (3, 13), (4, 14), (5, 15), (6, 16), (7, 17), (8, 18), (9, 19), (10, 20)]


In [31]:
outputs

[12, 14, 16, 18, 20, 22, 24, 26, 28, 30]

In [1]:
import os
import time
import torch
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from urllib.parse import urlparse
import tarfile
import pickle
import shutil

import shap
import xgboost as xgb

import sagemaker
import boto3
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.image_uris import retrieve

import deep_id_pytorch

from lstm_models import *
from lstm_utils import *
from xgboost_utils import *

In [11]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## OPTIONS

In [13]:
# --- Data ---
nrows = 1000  # row limit forwarded to build_lstm_dataset for every split
min_freq = 1  # forwarded to build_lstm_dataset as min_freq

seq_len = 30

train_data_path = "../../data/toy_dataset/data/{}/train.csv".format(seq_len)
valid_data_path = "../../data/toy_dataset/data/{}/val.csv".format(seq_len)
test_data_path = "../../data/toy_dataset/data/{}/test.csv".format(seq_len)

# --- Outputs ---
model_save_path = './output/{}/lstm/model'.format(seq_len)
results_save_path = "./output/{}/lstm/results".format(seq_len)

# --- Training ---
batch_size = 64
n_epochs = 10
stop_num = 6  # early stopping: epochs in a row without a better validation loss

# --- Model ---
embedding_dim = 8
hidden_dim = 16
nlayers = 1
bidirectional = True
dropout = 0.3

# --- Columns ---
target_colname = 'label'
uid_colname = 'patient_id'
# Stringified integers from seq_len - 1 down to 0; derived from seq_len so the
# list stays in sync when the sequence length is changed.
x_inputs = [str(x) for x in range(seq_len - 1, -1, -1)]
target_value = '1'

rev = False

In [15]:
# Make sure the parent directory of every output file exists before anything is saved.
for fp in [model_save_path, results_save_path]:
    parent_dir = os.path.dirname(fp)
    if not os.path.isdir(parent_dir):
        # exist_ok avoids a failure if the directory appears between the check and the call.
        os.makedirs(parent_dir, exist_ok=True)
        # Report the directory that was actually created (not the file path inside it).
        print(f'New directory created: {parent_dir}')

print(f"Cuda available: {torch.cuda.is_available()}")
# Fall back to CPU when no GPU is visible.
model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Cuda available: True


## Create Vocab and Build Dataset

In [5]:
# Arguments shared by all three splits. The id/target column names come from the
# options cell instead of being repeated as string literals here.
dataset_kwargs = dict(
    min_freq=min_freq,
    uid_colname=uid_colname,
    target_colname=target_colname,
    max_len=seq_len,
    target_value=target_value,
    nrows=nrows,
    rev=rev,
)

# The training split is built with vocab=None; the vocab it returns is then
# passed to the validation and test splits.
train_dataset, vocab = build_lstm_dataset(train_data_path, vocab=None, **dataset_kwargs)
valid_dataset, _ = build_lstm_dataset(valid_data_path, vocab=vocab, **dataset_kwargs)
test_dataset, _ = build_lstm_dataset(test_data_path, vocab=vocab, **dataset_kwargs)

Building dataset from ../../data/toy_dataset/data/30/train.csv..
Success!
Building dataset from ../../data/toy_dataset/data/30/val.csv..
Success!
Building dataset from ../../data/toy_dataset/data/30/test.csv..
Success!


In [None]:
# Settings common to every loader; only the shuffle flag differs per split.
loader_kwargs = {"batch_size": batch_size, "num_workers": 2}

# Shuffle the training split only; validation and test keep dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

## Model Training

In [16]:
# NOTE(review): `bidirectional` from the options cell is not passed here — confirm
# that SimpleLSTM's default matches the intended setting.
model = SimpleLSTM(embedding_dim, hidden_dim, vocab, model_device, nlayers=nlayers, dropout=dropout)
# Move to the device chosen in the setup cell rather than calling .cuda()
# unconditionally, so the notebook also runs on a CPU-only machine.
model = model.to(model_device)

TypeError: super(type, obj): obj must be an instance or subtype of type

In [None]:
model

In [None]:
# Loss that takes raw logits together with binary targets.
loss_function = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.05)

# Multiply the learning rate by 0.9 after every 4 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.9)

In [None]:
best_valid_loss = float("inf")
valid_worse_loss = 0  # epochs in a row without a better validation loss (early stopping)

for epoch in range(n_epochs):

    start_time = time.time()

    train_loss, train_auc = epoch_train_lstm(
        model, train_dataloader, optimizer, loss_function
    )
    valid_loss, valid_auc = epoch_val_lstm(model, valid_dataloader, loss_function)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Report metrics before the checkpoint / early-stop decision so the epoch
    # that triggers early stopping is logged too.
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
    print(
        f"Train Loss: {train_loss:.3f} | Train AUC: {train_auc:.2f} \t Val. Loss: {valid_loss:.3f} |  Val. AUC: {valid_auc:.4f}"
    )

    if valid_loss < best_valid_loss:
        # New best validation loss: checkpoint the weights and reset the counter.
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), model_save_path)
        # 1-based epoch number, matching the "Epoch:" header above.
        print("Saved Model, epoch {}".format(epoch + 1))
        valid_worse_loss = 0
    else:
        valid_worse_loss += 1
        if valid_worse_loss == stop_num:
            print("EARLY STOP ------")
            break

    scheduler.step()

# Evaluate the best checkpoint (not the last epoch's weights) on the test split.
# map_location keeps the load working when the checkpoint was written on a
# different device than the one currently selected.
model.load_state_dict(torch.load(model_save_path, map_location=model_device))
test_loss, test_auc = epoch_val_lstm(model, test_dataloader, loss_function)

print(f"Test Loss: {test_loss:.3f} | Test AUC: {test_auc:.2f}")

## Get SHAP values

In [None]:
# Index of the patient whose SHAP values are computed and plotted
idx = 1

In [None]:
background = next(iter(train_dataloader))
background_ids, background_labels, background_idxes = background

In [None]:
background_idxes.shape

In [None]:
bg_data, bg_masks = model.get_all_ids_masks(background_idxes, seq_len)

In [None]:
bg_data.shape

In [None]:
explainer = deep_id_pytorch.CustomPyTorchDeepIDExplainer(model, bg_data, bg_masks,
                                                         gpu_memory_efficient=True)

In [None]:
# Put the model in training mode before running the explainer: the original note
# says SHAP may otherwise complain that autograd cannot be called.
# NOTE(review): training mode presumably also re-enables dropout (dropout = 0.3 in
# the options cell), which could make SHAP values vary between runs — confirm.
model.train()
# NOTE(review): lstm_values and features are not read by any later cell visible
# here, and `start` is reassigned in the XGBoost training cell — confirm whether
# these three are still needed.
lstm_values = []
features = []
start = 0

In [None]:
#test = list(test_dataloader)[3]
test = next(iter(test_dataloader))
test_ids, test_labels, test_idxes = test

In [None]:
#test_labels

In [None]:
test_idxes.shape

In [None]:
test_data, test_masks = model.get_all_ids_masks(test_idxes, seq_len)

In [None]:
test_data.shape

In [None]:
np.array(test_masks[0])

In [None]:
len(test_masks[0])

In [None]:
lstm_shap_values = explainer.shap_values(test_data, test_masks)

In [None]:
lstm_shap_values.shape

In [None]:
df_lstm_shap, patient_id = get_per_patient_shap(lstm_shap_values, test, model.vocab, idx)

In [None]:
patient_id

In [None]:
df_lstm_shap

In [None]:
plot_shap_values(df_lstm_shap, patient_id, sort=True)

## XGBoost

### Data Preprocessing

In [None]:
# Local directory that holds every XGBoost input file for this sequence length.
xgb_data_dir = 'output/{}/xgboost/data'.format(seq_len)

# One-hot encoded splits.
x_train_one_hot_path = xgb_data_dir + '/train_one_hot.csv'
x_valid_one_hot_path = xgb_data_dir + '/val_one_hot.csv'
x_test_one_hot_path = xgb_data_dir + '/test_one_hot.csv'

# Second output file per split (passed to prepare_data next to the one-hot path).
x_train_data_path = xgb_data_dir + '/train.csv'
x_valid_data_path = xgb_data_dir + '/val.csv'
x_test_data_path = xgb_data_dir + '/test.csv'

# S3 location handed to prepare_data.
s3_output_data_dir = 's3://merck-paper-bucket/{}/data'.format(seq_len)

In [None]:
df = pd.read_csv(train_data_path)
print(df.shape)
df.head()

In [None]:
def get_valid_tokens(tokens, min_index=2):
    """Return the keys of ``tokens`` whose value is at least ``min_index``.

    With the default ``min_index=2`` this drops every token mapped to 0 or 1,
    which is how <pad> and <unk> are excluded (assumes those two tokens hold
    indices 0 and 1 in the vocabulary — confirm against the vocab builder).

    Args:
        tokens: Mapping of token -> integer index.
        min_index: Smallest index that is kept.

    Returns:
        List of kept tokens, in the mapping's iteration order.
    """
    return [token for token, index in tokens.items() if index >= min_index]

In [None]:
tokens = model.vocab._vocab
my_tokens = get_valid_tokens(tokens)

In [None]:
df.head()

In [None]:
# Trailing arguments that are identical for the train, validation and test calls.
shared_prepare_args = (seq_len, target_colname, my_tokens, s3_output_data_dir)

prepare_data(train_data_path, x_train_one_hot_path, x_train_data_path, *shared_prepare_args)
prepare_data(valid_data_path, x_valid_one_hot_path, x_valid_data_path, *shared_prepare_args)
prepare_data(test_data_path, x_test_one_hot_path, x_test_data_path, *shared_prepare_args)

### XGBoost Model Training

In [None]:
# --- S3 / local locations ---
BUCKET = 'merck-paper-bucket'
DATA_PREFIX = '{}/data'.format(seq_len)
MODEL_PREFIX = '{}/xgboost/model'.format(seq_len)
label = 'label'

output_results_path = 'output/{}/xgboost/models/train_results.csv'.format(seq_len)
# Fill in seq_len here as well; the template was previously left unformatted,
# which yields a literal "output/{}/xgboost/models/" directory name.
local_model_dir = 'output/{}/xgboost/models/'.format(seq_len)
s3_output_path = 's3://{}/{}/output'.format(BUCKET, MODEL_PREFIX)

# --- Algorithm config ---
ALGORITHM = 'xgboost'
REPO_VERSION = '1.2-1'

# --- Hyperparameter tuning config ---
TRAIN_INSTANCE_TYPE = 'ml.m5.4xlarge'  # alternative: 'ml.m4.16xlarge'
TRAIN_INSTANCE_COUNT = 1
MAX_PARALLEL_JOBS = 1  # TODO: Remove (was 4)
MAX_TRAIN_JOBS = 1  # was 20

EVALUATION_METRIC = 'auc'
OBJECTIVE = 'binary:logistic'
OBJECTIVE_METRIC_NAME = 'validation:auc'

# Search space explored by the tuner.
HYPERPARAMETER_RANGES = {'eta': ContinuousParameter(0.1, 0.5),
                       'alpha': ContinuousParameter(0, 2),
                       'max_depth': IntegerParameter(1, 10),
                       'gamma': ContinuousParameter(0, 5),
                       'num_round': IntegerParameter(200, 500),
                       'colsample_bylevel': ContinuousParameter(0.1, 1.0),
                       'colsample_bynode': ContinuousParameter(0.1, 1.0),
                       'colsample_bytree': ContinuousParameter(0.5, 1.0),
                       'lambda': ContinuousParameter(0, 1000),
                       'max_delta_step': IntegerParameter(0, 10),
                       'min_child_weight': ContinuousParameter(0, 120),
                       'subsample': ContinuousParameter(0.5, 1.0),
                       }


# --- SageMaker initialization ---
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
smclient = boto3.Session().client('sagemaker')

sess = sagemaker.Session()

# Container image URI for the chosen algorithm/version in this region.
container = retrieve(ALGORITHM, region, version=REPO_VERSION)

start = time.time()
print('Training for seq_len={}, label={}...'.format(seq_len, label))
# Prepare the input train & validation data paths
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(BUCKET, DATA_PREFIX), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/val'.format(BUCKET, DATA_PREFIX), content_type='csv')

# Class imbalance
scale_pos_weight = 1.0  # negative/positive

data_channels = {'train': s3_input_train, 'validation': s3_input_validation}

tuner = train_hpo(hyperparameter_ranges=HYPERPARAMETER_RANGES,
                  container=container,
                  execution_role=role,
                  instance_count=TRAIN_INSTANCE_COUNT,
                  instance_type=TRAIN_INSTANCE_TYPE,
                  output_path=s3_output_path,
                  sagemaker_session=sess,
                  eval_metric=EVALUATION_METRIC,
                  objective=OBJECTIVE,
                  objective_metric_name=OBJECTIVE_METRIC_NAME,
                  max_train_jobs=MAX_TRAIN_JOBS,
                  max_parallel_jobs=MAX_PARALLEL_JOBS,
                  scale_pos_weight=scale_pos_weight,
                  data_channels=data_channels)

# Get the hyperparameter tuner status at regular intervals until it completes
val_auc, best_model_path = get_tuner_status_and_result_until_completion(tuner, seq_len, label)

result = [label, seq_len, val_auc, best_model_path]
training_results = [result]

print('Success! Total training time={} mins.'.format((time.time()-start)/60.0))
# Save the results to file
df_results = pd.DataFrame(training_results, columns=['class', 'seq_len', 'val_auc', 'best_model_path'])

# Create the results directory if needed; exist_ok makes a re-run a no-op.
os.makedirs(os.path.dirname(output_results_path), exist_ok=True)

df_results.to_csv(output_results_path, index=False)
print('ALL SUCCESS!')

### XGBoost SHAP

In [None]:
output_results_path = 'output/{}/xgboost/models/train_results.csv'.format(seq_len)
local_model_dir = 'output/{}/xgboost/models/'.format(seq_len)

In [None]:
df_train = pd.read_csv(x_train_one_hot_path)
df_test = pd.read_csv(x_test_one_hot_path)

In [None]:
df_train.head()

In [None]:
X_train = df_train.iloc[:, 1:-1]
X_test = df_test.iloc[:, 1:-1]
print(X_train.shape)
print(X_test.shape)

In [None]:
X_train.head()

In [None]:
df_best = pd.read_csv(output_results_path)
df_best

In [None]:
s3_best_model_path = df_best.iloc[0]['best_model_path']
s3_best_model_path

In [None]:
#Copy the best model from s3 to local
output_path = copy_model_from_s3(s3_best_model_path, local_model_dir)
#Load the copied model
xgb_model = load_model(output_path)

In [None]:
explainer = shap.TreeExplainer(xgb_model)
xgb_shap_values = explainer.shap_values(X_test)

In [None]:
xgb_shap_values.shape

In [None]:
events = X_test.columns.tolist()
events

In [None]:
idx

In [None]:
xgb_shap_values[0]

In [None]:
xgb_shap_values[1]

In [None]:
# Index both objects by position so the patient id and the SHAP row always refer
# to the same row of the test set, whatever df_test's index labels are.
patient_id = df_test[uid_colname].iloc[idx]
pat_shap_values = xgb_shap_values[idx]

In [None]:
patient_id

In [None]:
pat_shap_values.shape

In [None]:
df_lstm_shap

In [None]:
xgboost

In [None]:
df_lstm_shap.reindex(df_lstm_shap.shap_vals.abs().sort_values(ascending=False).index).reset_index()

In [None]:
df.reindex(df.shap_vals.abs().sort_values(ascending=False).index).reset_index()

In [None]:
# Build the frame column-by-column so the SHAP values stay numeric, instead of
# stacking them with the event names into a string array and parsing them back.
df = pd.DataFrame({'events': events, 'shap_vals': np.asarray(pat_shap_values, dtype=float)})
df

In [None]:
plot_shap_values(df_lstm_shap, patient_id, sort=True)

In [None]:
plot_shap_values(df, patient_id, sort=True)

In [None]:
test_auc = xgb_model.eval(xgb.DMatrix(X_test.values, df_test[target_colname].values))
test_auc

In [None]:
auc

In [None]:
lstm_shap_values.shape

In [None]:
xgb_shap_values.shape

In [None]:
lstm_shap_values[0].shape