In [2]:
###<NOT YET UPDATED>

# XGB Model Training and SHAP computation using the Synthetic-events Dataset

**Author: Tesfagabir Meharizghi<br>Last Updated: 01/07/2021**

This notebook does the following actions:
- Model training using the given parameters
- Model selection using Intersection Similarity Score between ground truth helping features and predicted ones
    * Early stopping using Intersection similarity score criteria
- Computes SHAP values and visualizes them for a few examples
- Visualizes the train/val/test probability scores from each trained model
- Visualizes the Intersection Similarity Scores for val/test splits
- Finally, after tweaking the parameters, it gets the best model for the given model architecture and dataset

Outputs:
- The following artifacts are saved:
    * Model artifacts
    * SHAP values and their corresponding scores for the specified number of val/test examples

Model Architecture Used:
- XGB

Dataset:
- Synthetic-events (Toy Dataset)

Requirements:
- Make sure that you have already generated the synthetic toy dataset (train/val/test splits) using [Create_toy_dataset.ipynb](../../data/toy_dataset/Create_toy_dataset.ipynb).

Next Steps:
- Once you train different models, save the best one you found
- Also do the same for the other model architectures (SimpleLSTM, XGB, etc.) using the separate notebooks
- Finally, go to [this ipynb]() to compare the models' performances and SHAP values using the Jaccard Similarity Index

In [52]:
# pip install nb-black

In [53]:
#! pip install botocore==1.12.201

#! pip install shap
#! pip install xgboost

In [54]:
%load_ext lab_black

%load_ext autoreload

%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import os
import time
import torch
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from urllib.parse import urlparse
import tarfile
import pickle
import shutil
from collections import Counter, defaultdict, OrderedDict

import shap
import xgboost as xgb

import sagemaker
import boto3
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
from sagemaker.image_uris import retrieve

import deep_id_pytorch

import xgboost_utils as xgb_utils
import shap_jacc_utils as sj_utils

## 1. XGBoost Notebook Setup

### Constants

In [2]:
# ---------------------------------------------------------------------------
# Run size / sequence length
# ---------------------------------------------------------------------------
# TODO: `nrows` (full-run value: 1e9) is not referenced by any cell shown in
# this notebook; wire it into the data loading or remove it.
nrows = 1000

seq_len = 30  # Number of event columns per row; also part of every path below

# ---------------------------------------------------------------------------
# Neural-network hyperparameters
# NOTE(review): none of these are referenced by the XGBoost cells shown in this
# notebook -- presumably carried over from the LSTM notebooks; confirm and prune.
# ---------------------------------------------------------------------------
batch_size = 64  # For model training

n_epochs = 10
stop_num = 2

embedding_dim = 8
hidden_dim = 16
nlayers = 1
bidirectional = True
dropout = 0.2

# ---------------------------------------------------------------------------
# Dataset columns
# ---------------------------------------------------------------------------
target_colname = "label"
uid_colname = "patient_id"
target_value = "1"

rev = False

model_name = "xgb"
dataset = "synthetic-events"

# For model early stopping criteria
# Values are any of these: ['intersection_similarity', 'loss']
EARLY_STOPPING = "intersection_similarity"

# ---------------------------------------------------------------------------
# SHAP related constants
# ---------------------------------------------------------------------------
N_BACKGROUND = 500  # Number of background examples
BACKGROUND_NEGATIVE_ONLY = True  # If negative examples are used as background
N_VALID_EXAMPLES = 32  # Number of validation examples to be used during model training
N_TEST_EXAMPLES = 64  # Number of test examples
TEST_POSITIVE_ONLY = True  # If only positive examples are selected
IS_TEST_RANDOM = True  # If random test/val examples are selected for shap value computation
SORT_SHAP_VALUES = False  # Whether to sort per-patient shap values for visualization

# ---------------------------------------------------------------------------
# Input data (generated by Create_toy_dataset.ipynb)
# ---------------------------------------------------------------------------
train_data_path = f"../../data/toy_dataset/data/{seq_len}/train.csv"
valid_data_path = f"../../data/toy_dataset/data/{seq_len}/val.csv"
test_data_path = f"../../data/toy_dataset/data/{seq_len}/test.csv"

# ---------------------------------------------------------------------------
# Local outputs
# The doubled braces leave literal "{}" placeholders that are filled in later
# with str.format (e.g. shap_save_path.format("val", "03")).
# ---------------------------------------------------------------------------
model_save_path = f"./output/{seq_len}/{model_name}/models/model_{{}}.pkl"
# SHAP values path for a given dataset split (train/val/test)
# (data format (features, scores, patient_ids))
shap_save_path = f"./output/{seq_len}/{model_name}/shap/{{}}_shap_{{}}.pkl"

# Dataset preprocessing
x_train_one_hot_path = f"output/{seq_len}/{model_name}/data/train_one_hot.csv"
x_valid_one_hot_path = f"output/{seq_len}/{model_name}/data/val_one_hot.csv"
x_test_one_hot_path = f"output/{seq_len}/{model_name}/data/test_one_hot.csv"

x_train_data_path = f"output/{seq_len}/{model_name}/data/train.csv"
x_valid_data_path = f"output/{seq_len}/{model_name}/data/val.csv"
x_test_data_path = f"output/{seq_len}/{model_name}/data/test.csv"

# ---------------------------------------------------------------------------
# S3 locations / model training
# The bucket name is defined once and every S3 URI is derived from it, so the
# preprocessing upload location and the training input location cannot drift.
# ---------------------------------------------------------------------------
BUCKET = "merck-paper-bucket"
DATA_PREFIX = f"{dataset}/{seq_len}/data"
MODEL_PREFIX = f"{dataset}/{seq_len}/{model_name}"
label = "label"

s3_output_data_dir = f"s3://{BUCKET}/{DATA_PREFIX}"

output_results_path = f"output/{seq_len}/{model_name}/train/train_results.csv"
local_model_dir = f"output/{seq_len}/{model_name}/models/"
s3_output_path = f"s3://{BUCKET}/{MODEL_PREFIX}/output"

### Algorithm config
# ALGORITHM and REPO_VERSION are passed to sagemaker.image_uris.retrieve in the
# training cell to resolve the container image.
ALGORITHM = "xgboost"
REPO_VERSION = "1.2-1"

### Hyperparameter tuning config
# These values are forwarded to train_hpo in the training cell.
TRAIN_INSTANCE_TYPE = "ml.m5.4xlarge"  # alternative tried earlier: 'ml.m4.16xlarge'
TRAIN_INSTANCE_COUNT = 1
# NOTE(review): both job counts are 1 while the trailing comments mention 4 and
# 20 -- presumably reduced for a quick run; restore before a real search.
MAX_PARALLEL_JOBS = 1  # 4 #TODO: Remove
MAX_TRAIN_JOBS = 1  # 20

EVALUATION_METRIC = "auc"
OBJECTIVE = "binary:logistic"
OBJECTIVE_METRIC_NAME = "validation:auc"

# Previous search space (three parameters only), kept for reference:
# HYPERPARAMETER_RANGES = {'eta': ContinuousParameter(0, 1),
#                         'alpha': ContinuousParameter(0, 2),
#                         'max_depth': IntegerParameter(1, 10)}

# Active search space: one entry per hyperparameter name, each wrapped in a
# SageMaker tuner parameter object built from two bounds (presumably min, max).
HYPERPARAMETER_RANGES = {
    "eta": ContinuousParameter(0.1, 0.5),
    "alpha": ContinuousParameter(0, 2),
    "max_depth": IntegerParameter(1, 10),
    "gamma": ContinuousParameter(0, 5),
    "num_round": IntegerParameter(200, 500),
    "colsample_bylevel": ContinuousParameter(0.1, 1.0),
    "colsample_bynode": ContinuousParameter(0.1, 1.0),
    "colsample_bytree": ContinuousParameter(0.5, 1.0),
    "lambda": ContinuousParameter(0, 1000),
    "max_delta_step": IntegerParameter(0, 10),
    "min_child_weight": ContinuousParameter(0, 120),
    "subsample": ContinuousParameter(0.5, 1.0),
}

In [3]:
# LSTM+Attention Model Output Directory
model_save_dir = os.path.dirname(model_save_path)
shap_save_dir = os.path.dirname(shap_save_path)
if os.path.exists(model_save_dir):
    # Remove model save directory if exists
    shutil.rmtree(model_save_dir)
if os.path.exists(shap_save_dir):
    # Remove model save directory if exists
    shutil.rmtree(shap_save_dir)
os.makedirs(model_save_dir)
os.makedirs(shap_save_dir)
print(f"New directory created: {model_save_dir}")
print(f"New directory created: {shap_save_dir}")

New directory created: ./output/30/xgb/models
New directory created: ./output/30/xgb/shap


## 2. XGBoost Model Training

### Data Preprocessing

In [4]:
# Load the raw training split for inspection; `df` is reused further down to
# build the token vocabulary.
# Note: `nrows` from the constants cell is not applied here, so the whole file
# is read.
df = pd.read_csv(train_data_path)
print(df.shape)
df.head()

(18000, 33)


Unnamed: 0,index,29,28,27,26,25,24,23,22,21,...,7,6,5,4,3,2,1,0,label,patient_id
0,2741,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,apnea_H,headache_N,cold_sore_N,high_creatinine_H,high_creatinine_H,myopia_N,cold_sore_N,cut_finger_N,1,YYZP9V6PWV
1,1615,<pad>,<pad>,<pad>,myopia_N,myopia_N,headache_N,myopia_N,apnea_H,normal_bmi_U,...,foot_pain_N,headache_N,backache_N,foot_pain_N,hay_fever_N,furosemide_H,ACL_tear_N,ACL_tear_N,1,4VX2RBGZE0
2,852,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,eye_exam_N,ankle_sprain_N,myopia_N,...,cold_sore_N,hay_fever_N,headache_N,foot_pain_N,cold_sore_N,cold_sore_N,hay_fever_N,backache_N,0,6U018NSHLS
3,2749,<pad>,<pad>,eye_exam_N,ingrown_nail_N,headache_N,myopia_N,headache_N,ingrown_nail_N,cut_finger_N,...,myopia_N,tachycardia_H,cold_sore_N,myopia_N,backache_N,cut_finger_N,hay_fever_N,headache_N,1,CLEDNASY68
4,2090,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,cold_sore_N,foot_pain_N,eye_exam_N,backache_N,myopia_N,cut_finger_N,ACL_tear_N,hay_fever_N,1,XV86JBVK2M


In [15]:
def get_valid_tokens(df, seq_len, pad_token="<pad>"):
    """Return the distinct tokens found in the sequence columns of ``df``.

    The sequence columns are the ones named ``"0"`` .. ``str(seq_len - 1)``.
    The padding token is excluded from the result.

    Args:
        df: DataFrame holding one column per sequence position, with the
            column names given as strings.
        seq_len: Number of sequence-position columns to scan.
        pad_token: Padding marker to leave out of the vocabulary.

    Returns:
        list: The unique non-padding values, sorted by their string form.
        Sorting makes the order identical across kernel restarts; a plain
        ``list(set(...))`` of strings changes order from run to run because
        Python randomizes string hashes per process.

    Raises:
        KeyError: If any of the expected sequence columns is missing.
    """
    feature_cols = [str(i) for i in range(seq_len - 1, -1, -1)]
    unique_tokens = set(df[feature_cols].to_numpy().ravel().tolist())
    # discard() is a no-op when the padding token never occurs
    unique_tokens.discard(pad_token)
    # key=str keeps the sort well-defined even if a non-string value slips in
    return sorted(unique_tokens, key=str)

In [16]:
# The vocabulary is built from the training split only (`df` was read from
# train_data_path); the same `tokens` list is then passed to prepare_data for
# the train, val and test splits.
tokens = get_valid_tokens(df, seq_len)

In [27]:
# Preprocess every split with identical settings; only the three paths differ:
# (source CSV, one-hot CSV path, data CSV path).
# NOTE(review): `prepare_data` is not defined or imported by name in the cells
# shown here -- presumably it lives in xgboost_utils; confirm before running on
# a fresh kernel.
split_paths = [
    (train_data_path, x_train_one_hot_path, x_train_data_path),
    (valid_data_path, x_valid_one_hot_path, x_valid_data_path),
    (test_data_path, x_test_one_hot_path, x_test_data_path),
]

for source_csv, one_hot_csv, data_csv in split_paths:
    prepare_data(
        source_csv,
        one_hot_csv,
        data_csv,
        seq_len,
        target_colname,
        tokens,
        s3_output_data_dir,
    )

Sucess!
Sucess!
Sucess!


### XGBoost Model Training

In [29]:
### SageMaker Initialization
# This cell launches a SageMaker hyperparameter tuning job, waits for it to
# finish, and writes the best validation AUC and model path to a local CSV.
# NOTE(review): `train_hpo` and `get_tuner_status_and_result_until_completion`
# are not defined or imported by name in the cells shown here -- presumably they
# come from xgboost_utils; confirm before running on a fresh kernel.
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
# NOTE(review): `smclient` is not referenced again in the visible notebook.
smclient = boto3.Session().client("sagemaker")

sess = sagemaker.Session()

# Container image for ALGORITHM at REPO_VERSION in the current region
container = retrieve(ALGORITHM, region, version=REPO_VERSION)

start = time.time()
print("Training for seq_len={}, label={}...".format(seq_len, label))
# Prepare the input train & validation data path
# (CSV inputs under s3://BUCKET/DATA_PREFIX/{train,val}; presumably uploaded by
# the preprocessing step via s3_output_data_dir -- confirm)
s3_input_train = sagemaker.inputs.TrainingInput(
    s3_data="s3://{}/{}/train".format(BUCKET, DATA_PREFIX), content_type="csv"
)
s3_input_validation = sagemaker.inputs.TrainingInput(
    s3_data="s3://{}/{}/val".format(BUCKET, DATA_PREFIX), content_type="csv"
)

# Class Imbalance
scale_pos_weight = 1.0  # negative/positive

# Channel names expected by the training job: "train" and "validation"
data_channels = {"train": s3_input_train, "validation": s3_input_validation}

# Start the tuning job with the search space and limits from the constants cell
tuner = train_hpo(
    hyperparameter_ranges=HYPERPARAMETER_RANGES,
    container=container,
    execution_role=role,
    instance_count=TRAIN_INSTANCE_COUNT,
    instance_type=TRAIN_INSTANCE_TYPE,
    output_path=s3_output_path,
    sagemaker_session=sess,
    eval_metric=EVALUATION_METRIC,
    objective=OBJECTIVE,
    objective_metric_name=OBJECTIVE_METRIC_NAME,
    max_train_jobs=MAX_TRAIN_JOBS,
    max_parallel_jobs=MAX_PARALLEL_JOBS,
    scale_pos_weight=scale_pos_weight,
    data_channels=data_channels,
)

# Get the hyperparameter tuner status at regular interval
# (blocks until the tuning job completes, then returns the best result)
val_auc, best_model_path = get_tuner_status_and_result_until_completion(
    tuner, seq_len, label
)

# One result row per (label, seq_len) run; only a single run is made here
result = [label, seq_len, val_auc, best_model_path]
training_results = [result]

print("Success! Total training time={} mins.".format((time.time() - start) / 60.0))
# Save the results to file
df_results = pd.DataFrame(
    training_results, columns=["class", "seq_len", "val_auc", "best_model_path"]
)

# Create the parent directory of the results file on first use
if not os.path.isdir(os.path.split(output_results_path)[0]):
    os.makedirs(os.path.split(output_results_path)[0])

df_results.to_csv(output_results_path, index=False)
print("ALL SUCCESS!")

Training for seq_len=30, label=label...
....................................................!
Total jobs completed: 1
Metric: validation:auc
Best AUC: 0.9015
Success! Total training time=4.404632727305095 mins.
ALL SUCCESS!


## Model Validation and Visualization

In [None]:
# NOTE(review): this cell has no execution count and still follows the LSTM
# workflow. `models_paths`, `model`, `l_utils`, `loss_function` and the
# train/valid/test dataloaders are not defined in any cell shown in this
# notebook, so it has to be adapted to the XGBoost model before it can run.


def _plot_score_histograms(split_results):
    """Plot one prediction-score histogram per split on a shared 1x3 figure.

    Args:
        split_results: Sequence of ``(split_name, loss, auc, scores, ylabel)``
            tuples, listed in the left-to-right order of the panels.
            ``scores`` must support ``.flatten().tolist()``.
    """
    _, axes = plt.subplots(1, 3, sharex=False, figsize=(15, 5))
    for axes_idx, (split_name, loss, auc, split_scores, ylabel) in enumerate(
        split_results
    ):
        axes = sj_utils.plot_histogram(
            split_scores.flatten().tolist(),
            title=f"{split_name} Scores (Loss={loss:.4f}, AUC={auc:.4f})",
            xlabel="Prediction Scores",
            ylabel=ylabel,
            axes=axes,
            axes_idx=axes_idx,
        )
    plt.show()


def _plot_first_shap_examples(features, scores, patients, n_examples, max_plots=3):
    """Plot per-patient SHAP values for the first few examples.

    At most ``max_plots`` of the first ``n_examples`` entries are drawn, which
    keeps the notebook output short.

    Args:
        features: Per-example sequences of event names.
        scores: Per-example sequences of SHAP values, aligned with ``features``.
        patients: Per-example patient identifiers.
        n_examples: Number of examples available for plotting.
        max_plots: Upper bound on the number of figures produced.
    """
    for idx in range(min(n_examples, max_plots)):
        df_shap = pd.DataFrame(
            np.array([features[idx], scores[idx]]).T,
            columns=["events", "shap_vals"],
        )
        # Stacking names and values in one array yields strings; restore numbers
        df_shap["shap_vals"] = pd.to_numeric(df_shap["shap_vals"])

        sj_utils.plot_shap_values(
            df_shap, patients[idx], sort=SORT_SHAP_VALUES, figsize=(10, 5)
        )


def _plot_intersection_similarity(features, scores):
    """Compute the intersection similarity and plot its histogram."""
    print("Computing Intersection Similarity...")
    avg_sim, sim = sj_utils.get_model_intersection_similarity((features, scores))
    sj_utils.plot_histogram(
        sim,
        title=f"Average Intersection Simi={avg_sim:.4f}",
        xlabel="Intersection Similarity",
        ylabel="Frequencies",
        axes=None,
    )


total_models = len(models_paths)
for i, model_path in enumerate(models_paths):
    print(f"Processing for model {os.path.basename(model_path)} ...")
    # Load trained weights
    print("Loading the trained weights...")
    model.load_state_dict(torch.load(model_path))

    # Get Train/Val/Test Scores
    print("Computing the models performances for train/val/test splits...")
    train_loss, train_auc, train_labels, train_scores = l_utils.epoch_val_lstm(
        model, train_dataloader, loss_function, return_preds=True
    )
    val_loss, val_auc, val_labels, val_scores = l_utils.epoch_val_lstm(
        model, valid_dataloader, loss_function, return_preds=True
    )
    test_loss, test_auc, test_labels, test_scores = l_utils.epoch_val_lstm(
        model, test_dataloader, loss_function, return_preds=True
    )

    print("Ploting Histograms of Train/Val/Test Predicted Scores...")
    _plot_score_histograms(
        (
            ("Train", train_loss, train_auc, train_scores, "Frequencies"),
            ("Val", val_loss, val_auc, val_scores, ""),
            ("Test", test_loss, test_auc, test_scores, ""),
        )
    )

    # Validation SHAP values are read from the pickle stored for this epoch
    print(f"Computing SHAP for {N_VALID_EXAMPLES} positive val examples...")
    epoch = sj_utils.get_epoch_number_from_path(model_path)
    val_shap_path = shap_save_path.format("val", f"{epoch:02}")
    features, scores, patients = sj_utils.load_pickle(val_shap_path)

    _plot_first_shap_examples(features, scores, patients, N_VALID_EXAMPLES)
    _plot_intersection_similarity(features, scores)

    # For the best model, get the final performance (test set) (intersection similarity)
    # The last entry of `models_paths` is treated as the best model.
    if i == (total_models - 1):
        print(
            f"Computing SHAP for {N_TEST_EXAMPLES} positive TEST examples for the final model..."
        )
        test_shap_path = shap_save_path.format("test", f"{epoch:02}")
        features, scores, patients = sj_utils.get_lstm_features_and_shap_scores(
            model,
            train_dataloader,
            test_dataloader,
            seq_len,
            test_shap_path,
            save_output=True,
            n_background=N_BACKGROUND,
            background_negative_only=BACKGROUND_NEGATIVE_ONLY,
            n_test=N_TEST_EXAMPLES,
            test_positive_only=TEST_POSITIVE_ONLY,
            is_test_random=IS_TEST_RANDOM,
        )

        _plot_first_shap_examples(features, scores, patients, N_TEST_EXAMPLES)
        _plot_intersection_similarity(features, scores)

        print(
            "Finally computing and visualizing the global feature importance of the best model...."
        )
        feat_importance = sj_utils.get_global_feature_importance(features, scores)
        sj_utils.plot_global_feature_importance(feat_importance)
        print("All tasks SUCCESSFULLY completed!")

    print("=" * 100)