# Computing SHAP values and Jaccard Index Similarities

In [145]:
# Optional one-time environment setup; every command is commented out and must be
# uncommented to run.
# NOTE(review): shap and xgboost are installed unpinned, unlike botocore - consider
# pinning them too so the environment is reproducible.
#! jupyter nbextension enable jupyter-black-master/jupyter-black

#! pip install botocore==1.12.201

#! pip install shap
#! pip install xgboost

In [191]:
import os
import time
import torch
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from urllib.parse import urlparse
import tarfile
import pickle
import shutil

import shap
import xgboost as xgb

import sagemaker
import boto3
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.image_uris import retrieve

import deep_id_pytorch

from lstm_models import *
from att_lstm_models import *
from lstm_utils import *
from xgboost_utils import *
from shap_jacc_utils import *

In [192]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules before
# each cell executes.
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. LSTM Dataset and Model Loading

## Constants

In [193]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration. Every path below is derived from `seq_len`, so
# changing `seq_len` switches the whole notebook to another dataset variant.
# NOTE(review): the recorded cell outputs further down reference seq_len=900
# while this cell sets 30 - re-run the notebook to refresh them.
# ---------------------------------------------------------------------------

# Row cap handed to build_lstm_dataset; kept as an int because it is a row count
# (large enough to mean "read everything").
nrows = int(1e9)
# Passed to build_lstm_dataset as `min_freq` - presumably the minimum token
# frequency for vocabulary inclusion; confirm in the helper.
min_freq = 1

seq_len = 30

# Input CSVs for the train/validation/test splits.
train_data_path = "../../data/toy_dataset/data/{}/train.csv".format(seq_len)
valid_data_path = "../../data/toy_dataset/data/{}/val.csv".format(seq_len)
test_data_path = "../../data/toy_dataset/data/{}/test.csv".format(seq_len)

# Pickle files written by the SHAP computation for the plain LSTM.
lstm_shap_features_path = './output/{}/shap/features/lstm_features.pkl'.format(seq_len)
lstm_shap_scores_path = './output/{}/shap/feature-importance/lstm_scores.pkl'.format(seq_len)
lstm_shap_patients_path = './output/{}/shap/feature-importance/lstm_patients.pkl'.format(seq_len)

# Pickle files written by the SHAP computation for the LSTM with attention.
lstm_att_shap_features_path = './output/{}/shap/features/lstm_att_features.pkl'.format(seq_len)
lstm_att_shap_scores_path = './output/{}/shap/feature-importance/lstm_att_scores.pkl'.format(seq_len)
lstm_att_shap_patients_path = './output/{}/shap/feature-importance/lstm_att_patients.pkl'.format(seq_len)

# Saved model weights and results directories.
lstm_model_save_path = './output/{}/lstm/models/model'.format(seq_len)
lstm_results_save_path = "./output/{}/lstm/results/".format(seq_len)

lstm_att_model_save_path = './output/{}/lstm-att/models/model'.format(seq_len)
lstm_att_results_save_path = "./output/{}/lstm-att/results/".format(seq_len)

batch_size = 64

# Model hyperparameters; they must match the saved checkpoints loaded below.
embedding_dim = 8
hidden_dim = 16
nlayers = 1
bidirectional = True
dropout = 0.3

target_colname = 'label'
uid_colname = 'patient_id'
# Sequence-position column names, highest index first ("29", ..., "0" for seq_len=30).
# Derived from seq_len so that it stays in sync with the selected dataset.
x_inputs = [str(x) for x in range(seq_len - 1, -1, -1)]
target_value = '1'

rev = False

In [194]:
# Ensure that the parent directory of every output location exists.
# The SHAP pickle locations are included as well: creating them here is harmless
# if the SHAP helpers already create their own directories.
for fp in [
    lstm_model_save_path,
    lstm_results_save_path,
    lstm_att_model_save_path,
    lstm_att_results_save_path,
    lstm_shap_features_path,
    lstm_shap_scores_path,
    lstm_att_shap_features_path,
    lstm_att_shap_scores_path,
]:
    out_dir = os.path.dirname(fp)
    if not os.path.isdir(out_dir):
        # exist_ok avoids a failure if the directory appears between check and creation.
        os.makedirs(out_dir, exist_ok=True)
        # Report after the fact, and name the directory that was actually created.
        print(f'New directory created: {out_dir}')

print(f"Cuda available: {torch.cuda.is_available()}")
# Device on which the models are placed and evaluated; falls back to CPU without a GPU.
model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Cuda available: True


### Create Vocab and Build Dataset

In [195]:
# Arguments shared by the three dataset builds. The column names come from the
# constants cell rather than being repeated as string literals.
dataset_kwargs = dict(
    min_freq=min_freq,
    uid_colname=uid_colname,
    target_colname=target_colname,
    max_len=seq_len,
    target_value=target_value,
    nrows=nrows,
    rev=rev,
)

# Training split: vocab=None, and the vocabulary returned by this call is reused
# for the validation and test splits below.
train_dataset, vocab = build_lstm_dataset(train_data_path, vocab=None, **dataset_kwargs)

# Validation and test splits are built with the training vocabulary; the vocab
# they return is discarded.
valid_dataset, _ = build_lstm_dataset(valid_data_path, vocab=vocab, **dataset_kwargs)

test_dataset, _ = build_lstm_dataset(test_data_path, vocab=vocab, **dataset_kwargs)

Building dataset from ../../data/toy_dataset/data/900/train.csv..
Success!
Building dataset from ../../data/toy_dataset/data/900/val.csv..
Success!
Building dataset from ../../data/toy_dataset/data/900/test.csv..
Success!


In [196]:
# Options common to all three loaders; only `shuffle` differs between splits.
loader_opts = {"batch_size": batch_size, "num_workers": 2}

# Training batches are drawn in random order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_opts)

# Validation and test batches keep the dataset order.
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **loader_opts)

test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_opts)

### LSTM Model Loading and Evaluation

In [197]:
# Instantiate the plain LSTM classifier with the hyperparameters from the constants cell.
# NOTE(review): the `bidirectional` constant is not passed here; the model relies on
# SimpleLSTM's own default - confirm that this is intended.
lstm_model = SimpleLSTM(
    embedding_dim,
    hidden_dim,
    vocab,
    model_device,
    nlayers=nlayers,
    dropout=dropout,
)
# Move to the detected device rather than calling .cuda() unconditionally, so the
# notebook also runs on CPU-only machines.
lstm_model = lstm_model.to(model_device)

In [198]:
lstm_model

SimpleLSTM(
  (emb_layer): Embedding(32, 8, padding_idx=0)
  (lstm): LSTM(8, 16, batch_first=True, dropout=0.3, bidirectional=True)
  (pred_layer): Linear(in_features=32, out_features=1, bias=True)
  (dpt): Dropout(p=0.3, inplace=False)
)

In [199]:
# The model's pred_layer emits a single logit (out_features=1), so the binary
# cross-entropy-with-logits loss is used instead of nn.CrossEntropyLoss.
loss_function = nn.BCEWithLogitsLoss()

In [200]:
# Restore the trained weights. map_location remaps the stored tensors onto the
# current device, so a checkpoint saved on a GPU can also be loaded on a CPU-only host.
lstm_model.load_state_dict(torch.load(lstm_model_save_path, map_location=model_device))

# One evaluation pass over the test split; returns the loss and AUC.
test_loss, test_auc = epoch_val_lstm(lstm_model, test_dataloader, loss_function)

print(f"Test Loss: {test_loss:.3f} | Test AUC: {test_auc:.2f}")

Test Loss: 0.693 | Test AUC: 0.48


### LSTM with Attention Model Loading and Evaluation

In [201]:
# Instantiate the attention LSTM with the same hyperparameters as the plain LSTM.
lstm_att_model = AttLSTM(
    embedding_dim,
    hidden_dim,
    vocab,
    model_device,
    nlayers=nlayers,
    dropout=dropout,
)
# Move to the detected device rather than calling .cuda() unconditionally, so the
# notebook also runs on CPU-only machines.
lstm_att_model = lstm_att_model.to(model_device)

In [202]:
lstm_att_model

AttLSTM(
  (emb_layer): Embedding(32, 8, padding_idx=0)
  (lstm): LSTM(8, 16, batch_first=True, dropout=0.3, bidirectional=True)
  (pred_layer): Linear(in_features=64, out_features=1, bias=True)
  (attn_layer): Linear(in_features=32, out_features=1, bias=True)
  (dpt): Dropout(p=0.3, inplace=False)
  (context_layer): Linear(in_features=32, out_features=1, bias=True)
)

In [203]:
# AttLSTM's pred_layer also emits a single logit (out_features=1), so the same
# binary cross-entropy-with-logits loss applies.
# NOTE(review): identical to the earlier loss_function definition; redundant
# unless this section is executed on its own.
loss_function = nn.BCEWithLogitsLoss()

In [204]:
# Restore the trained weights. map_location remaps the stored tensors onto the
# current device, so a checkpoint saved on a GPU can also be loaded on a CPU-only host.
lstm_att_model.load_state_dict(torch.load(lstm_att_model_save_path, map_location=model_device))

# One evaluation pass over the test split; returns the loss and AUC.
# Note: this rebinds test_loss / test_auc from the plain-LSTM evaluation.
test_loss, test_auc = epoch_val_lstm(lstm_att_model, test_dataloader, loss_function)

print(f"Test Loss: {test_loss:.3f} | Test AUC: {test_auc:.2f}")

Test Loss: 0.334 | Test AUC: 0.90


## 2. XGBoost Dataset and Model Loading

### Data Preprocessing

In [205]:
# S3 locations. The bucket name is defined once and every S3 URI is derived from it.
BUCKET = 'merck-paper-bucket'
DATA_PREFIX = '{}/data'.format(seq_len)
MODEL_PREFIX = '{}/xgboost/model'.format(seq_len)
label = 'label'

# One-hot encoded copies of each split.
x_train_one_hot_path = 'output/{}/xgboost/data/train_one_hot.csv'.format(seq_len)
x_valid_one_hot_path = 'output/{}/xgboost/data/val_one_hot.csv'.format(seq_len)
x_test_one_hot_path = 'output/{}/xgboost/data/test_one_hot.csv'.format(seq_len)

# Second per-split output file passed to prepare_data.
x_train_data_path = 'output/{}/xgboost/data/train.csv'.format(seq_len)
x_valid_data_path = 'output/{}/xgboost/data/val.csv'.format(seq_len)
x_test_data_path = 'output/{}/xgboost/data/test.csv'.format(seq_len)

# Evaluates to the same URI as before: s3://merck-paper-bucket/<seq_len>/data
s3_output_data_dir = 's3://{}/{}'.format(BUCKET, DATA_PREFIX)

# Pickle files written by the SHAP computation for the XGBoost model.
xgb_shap_features_path = './output/{}/shap/features/xgb_features.pkl'.format(seq_len)
xgb_shap_scores_path = './output/{}/shap/feature-importance/xgb_scores.pkl'.format(seq_len)
xgb_shap_patients_path = './output/{}/shap/feature-importance/xgb_patients.pkl'.format(seq_len)

output_results_path = 'output/{}/xgboost/train/train_results.csv'.format(seq_len)
# Fixed: the template was never formatted, which left a literal "{}" directory in the path.
local_model_dir = 'output/{}/xgboost/models/'.format(seq_len)
s3_output_path = 's3://{}/{}/output'.format(BUCKET, MODEL_PREFIX)

In [206]:
# Load the raw training CSV to inspect its shape and first rows.
df = pd.read_csv(train_data_path)
print(df.shape)
df.head()

(18000, 903)


Unnamed: 0,index,899,898,897,896,895,894,893,892,891,...,7,6,5,4,3,2,1,0,label,patient_id
0,58,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,hay_fever_N,backache_N,quad_injury_N,quad_injury_N,ACL_tear_N,pneumonia_H,pneumonia_H,cardiac_rehab_U,0,W522K3U1NM
1,583,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,myopia_N,dental_exam_N,peanut_allergy_N,quad_injury_N,ingrown_nail_N,myopia_N,hay_fever_N,tachycardia_H,0,4PY82TXWUI
2,2882,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,eye_exam_N,ACL_tear_N,annual_physical_N,ACL_tear_N,myopia_N,headache_N,foot_pain_N,apnea_H,1,HD8EMMXS9G
3,2159,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,quad_injury_N,ingrown_nail_N,foot_pain_N,eye_exam_N,ACL_tear_N,peanut_allergy_N,hay_fever_N,myopia_N,1,A07ZWN6OWW
4,2317,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,ingrown_nail_N,quad_injury_N,annual_physical_N,peanut_allergy_N,eye_exam_N,eye_exam_N,foot_pain_N,annual_physical_N,1,I37X5VLIV1


In [207]:
# Reuse the vocabulary attached to the LSTM model as the token list for the
# XGBoost feature preparation.
# NOTE(review): `_vocab` is a private attribute of the vocab object; prefer a
# public accessor if one exists.
tokens = lstm_model.vocab._vocab
# Presumably drops special tokens such as <pad> - TODO confirm in get_valid_tokens.
my_tokens = get_valid_tokens(tokens)

In [208]:
df.head()

Unnamed: 0,index,899,898,897,896,895,894,893,892,891,...,7,6,5,4,3,2,1,0,label,patient_id
0,58,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,hay_fever_N,backache_N,quad_injury_N,quad_injury_N,ACL_tear_N,pneumonia_H,pneumonia_H,cardiac_rehab_U,0,W522K3U1NM
1,583,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,myopia_N,dental_exam_N,peanut_allergy_N,quad_injury_N,ingrown_nail_N,myopia_N,hay_fever_N,tachycardia_H,0,4PY82TXWUI
2,2882,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,eye_exam_N,ACL_tear_N,annual_physical_N,ACL_tear_N,myopia_N,headache_N,foot_pain_N,apnea_H,1,HD8EMMXS9G
3,2159,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,quad_injury_N,ingrown_nail_N,foot_pain_N,eye_exam_N,ACL_tear_N,peanut_allergy_N,hay_fever_N,myopia_N,1,A07ZWN6OWW
4,2317,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,ingrown_nail_N,quad_injury_N,annual_physical_N,peanut_allergy_N,eye_exam_N,eye_exam_N,foot_pain_N,annual_physical_N,1,I37X5VLIV1


In [209]:
# (input CSV, one-hot output path, data output path) for each split, in the
# order train -> validation -> test.
split_paths = [
    (train_data_path, x_train_one_hot_path, x_train_data_path),
    (valid_data_path, x_valid_one_hot_path, x_valid_data_path),
    (test_data_path, x_test_one_hot_path, x_test_data_path),
]

# Every split is prepared with the same sequence length, target column, token
# list and S3 destination.
for raw_path, one_hot_path, data_path in split_paths:
    prepare_data(raw_path, one_hot_path, data_path, seq_len, target_colname, my_tokens, s3_output_data_dir)

Sucess!
Sucess!
Sucess!


### Model Loading and Evaluation

In [210]:
# Read back the one-hot CSVs produced by prepare_data for the train and test splits.
df_train = pd.read_csv(x_train_one_hot_path)
df_test = pd.read_csv(x_test_one_hot_path)

In [211]:
print(df_train.shape)
df_train.head()

(18000, 32)


Unnamed: 0,patient_id,peanut_allergy_N,annual_physical_N,ingrown_nail_N,quad_injury_N,hay_fever_N,eye_exam_N,backache_N,cold_sore_N,headache_N,...,high_creatinine_H,apnea_H,ARR_A,AMI_A,PCI_U,furosemide_H,ACE_inhibitors_U,CHF_A,PH_A,label
0,W522K3U1NM,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,4PY82TXWUI,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,HD8EMMXS9G,1,1,1,1,1,1,1,1,1,...,1,1,0,0,0,0,0,0,0,1
3,A07ZWN6OWW,1,1,1,1,1,1,1,1,1,...,0,0,1,0,0,0,0,0,0,1
4,I37X5VLIV1,1,1,1,1,1,1,1,1,1,...,1,0,1,0,0,0,0,0,0,1


In [212]:
print(df_test.shape)
df_test.head()

(6000, 32)


Unnamed: 0,patient_id,peanut_allergy_N,annual_physical_N,ingrown_nail_N,quad_injury_N,hay_fever_N,eye_exam_N,backache_N,cold_sore_N,headache_N,...,high_creatinine_H,apnea_H,ARR_A,AMI_A,PCI_U,furosemide_H,ACE_inhibitors_U,CHF_A,PH_A,label
0,R71VMMSCUX,1,1,1,1,1,1,1,1,1,...,0,0,1,0,0,0,0,0,0,1
1,KLCWD0TMDA,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
2,DNX5D4PT8V,1,1,1,1,1,1,1,1,1,...,0,0,1,0,0,0,0,0,0,1
3,FKA8RFX3CF,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,1,0,0,0
4,YWB1YCU8KN,1,1,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,0,0


In [213]:
# Keep only the feature columns: drop the first column (patient_id) and the
# last column (label), as laid out in the frames displayed above.
X_train = df_train.iloc[:, 1:-1]
X_test = df_test.iloc[:, 1:-1]

In [214]:
# Results table written by the XGBoost training run; it records the S3 path of
# the best model.
df_best = pd.read_csv(output_results_path)
df_best

Unnamed: 0,class,seq_len,val_auc,best_model_path
0,label,900,0.8989,s3://merck-paper-bucket/900/xgboost/model/outp...


In [215]:
# Take the best-model S3 path from the first row of the results table.
s3_best_model_path = df_best.iloc[0]['best_model_path']
s3_best_model_path

's3://merck-paper-bucket/900/xgboost/model/output/sagemaker-xgboost-210105-0304-001-e8ad1545/output/model.tar.gz'

In [216]:
# Copy the best model from S3 into local_model_dir; the helper returns the
# local path that load_model reads.
output_path = copy_model_from_s3(s3_best_model_path, local_model_dir)
# Load the copied model.
xgb_model = load_model(output_path)

In [217]:
# Evaluate the XGBoost model on the test features and labels.
# NOTE(review): the result is a formatted string such as "[0] eval-auc:0.898684",
# not a float, and it rebinds `test_auc`, which previously held the LSTM AUC.
test_auc = xgb_model.eval(xgb.DMatrix(X_test.values, df_test[target_colname].values))
test_auc

'[0]\teval-auc:0.898684'

## 3. Compute SHAP values

In [218]:
# Constants for the LSTM-family models (plain LSTM and LSTM with attention)
LSTM_N_BACKGROUND = 500 # Number of background examples
LSTM_NEGATIVE_ONLY = True # Whether to use negative examples as background
LSTM_N_TEST = 64 # Number of examples to compute SHAP values from

# Constants for the XGBoost model (same meaning as the LSTM constants above)
XGB_N_BACKGROUND = 500
XGB_NEGATIVE_ONLY = True
XGB_N_TEST = 64

In [219]:
# NOTE(review): the recorded run of this cell stopped with "CUDA out of memory"
# during the attention-model pass, so only the plain-LSTM pickles were written
# by that run. The cause lies inside the helper and is not addressed here.

# Sampling settings shared by both LSTM-family runs.
lstm_shap_settings = dict(
    n_test=LSTM_N_TEST,
    n_background=LSTM_N_BACKGROUND,
    negative_only=LSTM_NEGATIVE_ONLY,
)

print('Computing SHAP values for LSTM model...')
lstm_features, lstm_scores, lstm_patient_ids = get_lstm_features_and_shap_scores(
    lstm_model,
    train_dataloader,
    test_dataloader,
    seq_len,
    lstm_shap_features_path,
    lstm_shap_scores_path,
    lstm_shap_patients_path,
    **lstm_shap_settings,
)

print('Computing SHAP values for LSTM with Attention model...')
lstm_att_features, lstm_att_scores, lstm_att_patient_ids = get_lstm_features_and_shap_scores(
    lstm_att_model,
    train_dataloader,
    test_dataloader,
    seq_len,
    lstm_att_shap_features_path,
    lstm_att_shap_scores_path,
    lstm_att_shap_patients_path,
    **lstm_shap_settings,
)

print('Computing SHAP values for XGB model...')
xgb_features, xgb_scores, xgb_patient_ids = get_xgboost_features_and_shap_scores(
    xgb_model,
    df_train,
    df_test,
    xgb_shap_features_path,
    xgb_shap_scores_path,
    xgb_shap_patients_path,
    n_test=XGB_N_TEST,
    n_background=XGB_N_BACKGROUND,
    negative_only=XGB_NEGATIVE_ONLY,
)
print('Successfully Completed!')

Computing SHAP values for LSTM model...


500it [00:10, 45.57it/s]
1000it [00:42, 23.37it/s]
1000it [00:42, 23.47it/s]
1000it [00:42, 23.49it/s]
1000it [00:42, 23.52it/s]
1000it [00:42, 23.41it/s]
1000it [00:42, 23.43it/s]
1000it [00:42, 23.39it/s]
1000it [00:42, 23.50it/s]
1000it [00:42, 23.51it/s]
1000it [00:42, 23.57it/s]
1000it [00:42, 23.44it/s]
1000it [00:42, 23.41it/s]
1000it [00:42, 23.51it/s]
1000it [00:42, 23.43it/s]
1000it [00:42, 23.28it/s]
1000it [00:42, 23.52it/s]
1000it [00:42, 23.33it/s]
1000it [00:42, 23.48it/s]
1000it [00:42, 23.53it/s]
1000it [00:42, 23.38it/s]
1000it [00:42, 23.41it/s]
1000it [00:42, 23.41it/s]
1000it [00:42, 23.55it/s]
1000it [00:42, 23.32it/s]
1000it [00:42, 23.35it/s]
1000it [00:42, 23.32it/s]
1000it [00:42, 23.40it/s]
1000it [00:42, 23.47it/s]
1000it [00:42, 23.50it/s]
1000it [00:42, 23.32it/s]
1000it [00:42, 23.45it/s]
1000it [00:42, 23.47it/s]
1000it [00:42, 23.29it/s]
1000it [00:42, 23.27it/s]
1000it [00:42, 23.43it/s]
1000it [00:42, 23.48it/s]
1000it [00:42, 23.35it/s]
1000it [00:42

saved ./output/900/shap/features/lstm_features.pkl pickle..
saved ./output/900/shap/feature-importance/lstm_scores.pkl pickle..
saved ./output/900/shap/feature-importance/lstm_patients.pkl pickle..
Computing SHAP values for LSTM with Attention model...


500it [00:06, 80.48it/s]
1000it [00:30, 32.58it/s]
1000it [00:27, 35.91it/s]
1000it [00:20, 49.20it/s]
1000it [00:27, 36.44it/s]
1000it [00:17, 55.56it/s]
1000it [00:17, 56.63it/s]
1000it [00:32, 30.36it/s]
1000it [00:19, 52.63it/s]
1000it [00:15, 66.15it/s]
1000it [00:26, 37.58it/s]
1000it [00:33, 30.25it/s]
1000it [00:28, 35.08it/s]
1000it [00:32, 30.75it/s]
1000it [00:25, 38.81it/s]
1000it [00:24, 41.54it/s]
136it [00:01, 120.15it/s]


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 15.78 GiB total capacity; 13.56 GiB already allocated; 3.00 MiB free; 13.73 GiB reserved in total by PyTorch)

### Visualize SHAP values for a single example

In [None]:
# Optionally reload previously saved features, scores and patient ids instead of
# recomputing them (uncomment the lines that are needed).
# lstm_features = load_pickle(lstm_shap_features_path)
# lstm_scores = load_pickle(lstm_shap_scores_path)
# lstm_patient_ids = load_pickle(lstm_shap_patients_path)

# lstm_att_features = load_pickle(lstm_att_shap_features_path)
# lstm_att_scores = load_pickle(lstm_att_shap_scores_path)
# lstm_att_patient_ids = load_pickle(lstm_att_shap_patients_path)

# xgb_features = load_pickle(xgb_shap_features_path)
# xgb_scores = load_pickle(xgb_shap_scores_path)
# xgb_patient_ids = load_pickle(xgb_shap_patients_path)

In [None]:
# Index, within the computed SHAP results, of the patient whose SHAP values are
# visualized below.
idx = 4

In [None]:
# For LSTM Model: select one patient's events and the SHAP score of each event.
lstm_features1 = lstm_features[idx]
lstm_scores1 = lstm_scores[idx]
lstm_patient_id = lstm_patient_ids[idx]
# Every event needs exactly one score; fail here with a clear message rather
# than inside the DataFrame construction below.
assert len(lstm_features1) == len(lstm_scores1), (
    f"features/scores length mismatch: {len(lstm_features1)} vs {len(lstm_scores1)}"
)

# Stacking strings and numbers yields a string array, hence the conversion of
# the score column back to numeric afterwards.
df_lstm_shap = pd.DataFrame(np.array([lstm_features1, lstm_scores1]).T, columns=['events', 'shap_vals'])
df_lstm_shap["shap_vals"] = pd.to_numeric(df_lstm_shap["shap_vals"])
print(df_lstm_shap.shape)
df_lstm_shap.head()

In [None]:
plot_shap_values(df_lstm_shap, lstm_patient_id, sort=True)

In [None]:
# For LSTM+Attention Model: select one patient's events and the SHAP score of each event.
lstm_att_features1 = lstm_att_features[idx]
lstm_att_scores1 = lstm_att_scores[idx]
lstm_att_patient_id = lstm_att_patient_ids[idx]
# Every event needs exactly one score; fail here with a clear message rather
# than inside the DataFrame construction below.
assert len(lstm_att_features1) == len(lstm_att_scores1), (
    f"features/scores length mismatch: {len(lstm_att_features1)} vs {len(lstm_att_scores1)}"
)

# Stacking strings and numbers yields a string array, hence the conversion of
# the score column back to numeric afterwards.
df_lstm_att_shap = pd.DataFrame(np.array([lstm_att_features1, lstm_att_scores1]).T, columns=['events', 'shap_vals'])
df_lstm_att_shap["shap_vals"] = pd.to_numeric(df_lstm_att_shap["shap_vals"])
print(df_lstm_att_shap.shape)
df_lstm_att_shap.head()

In [None]:
plot_shap_values(df_lstm_att_shap, lstm_att_patient_id, sort=True)

In [None]:
# For XGB Model: select one patient's features and the SHAP score of each feature.
xgb_features1 = xgb_features[idx]
xgb_scores1 = xgb_scores[idx]
xgb_patient_id = xgb_patient_ids[idx]
# Every feature needs exactly one score; fail here with a clear message rather
# than inside the DataFrame construction below.
assert len(xgb_features1) == len(xgb_scores1), (
    f"features/scores length mismatch: {len(xgb_features1)} vs {len(xgb_scores1)}"
)

# Stacking strings and numbers yields a string array, hence the conversion of
# the score column back to numeric afterwards.
df_xgb_shap = pd.DataFrame(np.array([xgb_features1, xgb_scores1]).T, columns=['events', 'shap_vals'])
df_xgb_shap["shap_vals"] = pd.to_numeric(df_xgb_shap["shap_vals"])
print(df_xgb_shap.shape)
df_xgb_shap.head()

In [None]:
plot_shap_values(df_xgb_shap, xgb_patient_id, sort=True)

## Computing Jaccard Index Similarity between the Models' SHAP Values

In [None]:
k = 5 # Number of top-k features used for the single heatmap
k_list = range(1, 11) # k values 1..10 passed to generate_k_heatmaps
# Model labels, listed in the same order as the entries of all_features_scores
models = ['lstm', 'lstm-att', 'xgb']

In [None]:
# Reload the saved SHAP features and scores of each model from disk, and pair
# them as (features, scores) tuples.
lstm_features, lstm_scores = (
    load_pickle(path) for path in (lstm_shap_features_path, lstm_shap_scores_path)
)
lstm_features_scores = (lstm_features, lstm_scores)

lstm_att_features, lstm_att_scores = (
    load_pickle(path) for path in (lstm_att_shap_features_path, lstm_att_shap_scores_path)
)
lstm_att_features_scores = (lstm_att_features, lstm_att_scores)

xgb_features, xgb_scores = (
    load_pickle(path) for path in (xgb_shap_features_path, xgb_shap_scores_path)
)
xgb_features_scores = (xgb_features, xgb_scores)

# Order: lstm, lstm-att, xgb.
all_features_scores = [
    lstm_features_scores,
    lstm_att_features_scores,
    xgb_features_scores,
]

In [None]:
len(lstm_features), len(lstm_att_features), len(xgb_features)

In [None]:
generate_heatmap(all_features_scores, models, k)

In [None]:
generate_k_heatmaps(all_features_scores, models, k_list)

## Intersection Similarity Score

This score is the size of the intersection between the ground-truth helper + adverse events and the predicted helper + adverse events, divided by the total number of helping events.

In [None]:
#For LSTM
avg_sim = get_model_intersection_similarity(lstm_features_scores)
print('Average Intersection Similarity(LSTM): {}'.format(avg_sim))

#For LSTM+Attention
avg_sim = get_model_intersection_similarity(lstm_att_features_scores)
print('Average Intersection Similarity(LSTM+Attention): {}'.format(avg_sim))

#For XGB
avg_sim = get_model_intersection_similarity(xgb_features_scores)
print('Average Intersection Similarity(XGB): {}'.format(avg_sim))

```
For seq_len=30 (test batch_size=64), results are as follows:
    Average Intersection Similarity(LSTM): 0.14062499999999997
    Average Intersection Similarity(LSTM+Attention): 0.15625
    Average Intersection Similarity(XGB): 0.10781249999999994
For seq_len=300 (test batch_size=64), results are as follows:
    Average Intersection Similarity(LSTM): 0.24739583333333334
    Average Intersection Similarity(LSTM+Attention): 0.1953125
    Average Intersection Similarity(XGB): 0.4703124999999999
```