# Models Training using Toy Dataset

In [83]:
#! jupyter nbextension enable jupyter-black-master/jupyter-black

#! pip install botocore==1.12.201

#! pip install shap
#! pip install xgboost

In [84]:
import os
import time
import torch
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from urllib.parse import urlparse
import tarfile
import pickle
import shutil

import shap
import xgboost as xgb

import sagemaker
import boto3
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.image_uris import retrieve

import deep_id_pytorch

from lstm_models import *
from att_lstm_models import *
from lstm_utils import *
from xgboost_utils import *


In [85]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. LSTM Model Training

### Constants

In [86]:
nrows = 1e9
min_freq = 1

seq_len = 30

train_data_path = "../../data/toy_dataset/data/{}/train.csv".format(seq_len)
valid_data_path = "../../data/toy_dataset/data/{}/val.csv".format(seq_len)
test_data_path = "../../data/toy_dataset/data/{}/test.csv".format(seq_len)

lstm_model_save_path = './output/{}/lstm/models/model'.format(seq_len)
lstm_results_save_path = "./output/{}/lstm/results/".format(seq_len)

batch_size = 64

n_epochs = 6
stop_num = 2

embedding_dim = 8
hidden_dim = 16
nlayers = 1
bidirectional = True
dropout = 0.3

target_colname = 'label'
uid_colname = 'patient_id'
x_inputs = [str(x) for x in range(29, -1, -1)]
target_value = '1'

rev = False

In [87]:
#LSTM Output Directory
for fp in [lstm_model_save_path, lstm_results_save_path]:
    if not os.path.isdir(os.path.split(fp)[0]):
        print(f'New directory created: {fp}')
        os.makedirs(os.path.split(fp)[0])

print(f"Cuda available: {torch.cuda.is_available()}")
model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

New directory created: ./output/900/lstm/models/model
New directory created: ./output/900/lstm/results/
Cuda available: True


### Create Vocab and Build Dataset

In [90]:
train_dataset, vocab = build_lstm_dataset(
                                train_data_path,
                                min_freq=min_freq,
                                uid_colname="patient_id",
                                target_colname="label",
                                max_len=seq_len,
                                target_value=target_value,
                                vocab=None,
                                nrows=nrows,
                                rev=rev
                            )
valid_dataset, _ = build_lstm_dataset(
                                valid_data_path,
                                min_freq=min_freq,
                                uid_colname="patient_id",
                                target_colname="label",
                                max_len=seq_len,
                                target_value=target_value,
                                vocab=vocab,
                                nrows=nrows,
                                rev=rev
                            )

test_dataset, _ = build_lstm_dataset(
                                test_data_path,
                                min_freq=min_freq,
                                uid_colname="patient_id",
                                target_colname="label",
                                max_len=seq_len,
                                target_value=target_value,
                                vocab=vocab,
                                nrows=nrows,
                                rev=rev
                            )

Building dataset from ../../data/toy_dataset/data/900/train.csv..
Success!
Building dataset from ../../data/toy_dataset/data/900/val.csv..
Success!
Building dataset from ../../data/toy_dataset/data/900/test.csv..
Success!


In [91]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2
)

valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2
)

### SimpleLSTM Model Training and Evaluation

In [92]:
lstm_model = SimpleLSTM(embedding_dim, hidden_dim, vocab, model_device, nlayers=nlayers, dropout=dropout)
lstm_model = lstm_model.cuda()

In [93]:
lstm_model

SimpleLSTM(
  (emb_layer): Embedding(32, 8, padding_idx=0)
  (lstm): LSTM(8, 16, batch_first=True, dropout=0.3, bidirectional=True)
  (pred_layer): Linear(in_features=32, out_features=1, bias=True)
  (dpt): Dropout(p=0.3, inplace=False)
)

In [94]:
#loss_function = nn.CrossEntropyLoss()
loss_function = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.05)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4, gamma=0.9)

In [95]:
best_valid_loss = float("inf")
valid_worse_loss = 0  # enable early stopping


for epoch in range(n_epochs):

    start_time = time.time()

    train_loss, train_auc = epoch_train_lstm(
        lstm_model, train_dataloader, optimizer, loss_function
    )

    valid_loss, valid_auc = epoch_val_lstm(
       lstm_model, valid_dataloader, loss_function)#, return_preds=False
    #)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(lstm_model.state_dict(), lstm_model_save_path)
        print("Saved Model, epoch {}".format(epoch))
        valid_worse_loss = 0

    else:
        valid_worse_loss += 1
        if valid_worse_loss == stop_num:
            print("EARLY STOP ------")
            break

    scheduler.step()
    print(
        f"Train Loss: {train_loss:.3f} | Train AUC: {train_auc:.2f} \t Val. Loss: {valid_loss:.3f} |  Val. AUC: {valid_auc:.4f}"
    )

Epoch: 01 | Epoch Time: 0m 25s
Saved Model, epoch 0
Train Loss: 0.696 | Train AUC: 0.50 	 Val. Loss: 0.699 |  Val. AUC: 0.4912
Epoch: 02 | Epoch Time: 0m 25s
Saved Model, epoch 1
Train Loss: 0.696 | Train AUC: 0.50 	 Val. Loss: 0.693 |  Val. AUC: 0.5007
Epoch: 03 | Epoch Time: 0m 25s
Train Loss: 0.696 | Train AUC: 0.50 	 Val. Loss: 0.694 |  Val. AUC: 0.4934
Epoch: 04 | Epoch Time: 0m 25s
Saved Model, epoch 3
Train Loss: 0.695 | Train AUC: 0.50 	 Val. Loss: 0.693 |  Val. AUC: 0.5132
Epoch: 05 | Epoch Time: 0m 25s
Train Loss: 0.694 | Train AUC: 0.51 	 Val. Loss: 0.699 |  Val. AUC: 0.5112
Epoch: 06 | Epoch Time: 0m 25s
EARLY STOP ------


In [96]:
lstm_model.load_state_dict(torch.load(lstm_model_save_path))
test_loss, test_auc = epoch_val_lstm(
   lstm_model, test_dataloader, loss_function)#, return_preds=False

print(f"Test Loss: {test_loss:.3f} | Test AUC: {test_auc:.2f}")

Test Loss: 0.693 | Test AUC: 0.48


### LSTM with Attention Model Training and Evaluation

In [97]:
lstm_att_model_save_path = './output/{}/lstm-att/models/model'.format(seq_len)
lstm_att_results_save_path = "./output/{}/lstm-att/results/".format(seq_len)

In [98]:
#LSTM with Attention Output Directory
for fp in [lstm_att_model_save_path, lstm_att_results_save_path]:
    if not os.path.exists(os.path.split(fp)[0]):
        print(f'New directory created: {fp}')
        os.makedirs(os.path.split(fp)[0])

New directory created: ./output/900/lstm-att/models/model
New directory created: ./output/900/lstm-att/results/


In [99]:
lstm_att_model = AttLSTM(embedding_dim, hidden_dim, vocab, model_device, nlayers=nlayers, dropout=dropout)
lstm_att_model = lstm_att_model.cuda()

In [100]:
lstm_att_model

AttLSTM(
  (emb_layer): Embedding(32, 8, padding_idx=0)
  (lstm): LSTM(8, 16, batch_first=True, dropout=0.3, bidirectional=True)
  (pred_layer): Linear(in_features=64, out_features=1, bias=True)
  (attn_layer): Linear(in_features=32, out_features=1, bias=True)
  (dpt): Dropout(p=0.3, inplace=False)
  (context_layer): Linear(in_features=32, out_features=1, bias=True)
)

In [101]:
#loss_function = nn.CrossEntropyLoss()
loss_function = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(lstm_att_model.parameters(), lr=0.05)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4, gamma=0.9)

In [102]:
best_valid_loss = float("inf")
valid_worse_loss = 0  # enable early stopping


for epoch in range(n_epochs):

    start_time = time.time()

    train_loss, train_auc = epoch_train_lstm(
        lstm_att_model, train_dataloader, optimizer, loss_function
    )

    valid_loss, valid_auc = epoch_val_lstm(
       lstm_att_model, valid_dataloader, loss_function)#, return_preds=False
    #)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(lstm_att_model.state_dict(), lstm_att_model_save_path)
        print("Saved Model, epoch {}".format(epoch))
        valid_worse_loss = 0

    else:
        valid_worse_loss += 1
        if valid_worse_loss == stop_num:
            print("EARLY STOP ------")
            break

    scheduler.step()
    print(
        f"Train Loss: {train_loss:.3f} | Train AUC: {train_auc:.2f} \t Val. Loss: {valid_loss:.3f} |  Val. AUC: {valid_auc:.4f}"
    )

Epoch: 01 | Epoch Time: 0m 26s
Saved Model, epoch 0
Train Loss: 0.457 | Train AUC: 0.87 	 Val. Loss: 0.347 |  Val. AUC: 0.8999
Epoch: 02 | Epoch Time: 0m 26s
Saved Model, epoch 1
Train Loss: 0.360 | Train AUC: 0.89 	 Val. Loss: 0.340 |  Val. AUC: 0.8981
Epoch: 03 | Epoch Time: 0m 26s
Train Loss: 0.409 | Train AUC: 0.89 	 Val. Loss: 0.370 |  Val. AUC: 0.8990
Epoch: 04 | Epoch Time: 0m 26s
EARLY STOP ------


In [103]:
lstm_att_model.load_state_dict(torch.load(lstm_att_model_save_path))
test_loss, test_auc = epoch_val_lstm(
   lstm_att_model, test_dataloader, loss_function)#, return_preds=False

print(f"Test Loss: {test_loss:.3f} | Test AUC: {test_auc:.2f}")

Test Loss: 0.334 | Test AUC: 0.90


## 2. XGBoost Model Training

### Data Preprocessing

In [104]:
x_train_one_hot_path = 'output/{}/xgboost/data/train_one_hot.csv'.format(seq_len)
x_valid_one_hot_path = 'output/{}/xgboost/data/val_one_hot.csv'.format(seq_len)
x_test_one_hot_path = 'output/{}/xgboost/data/test_one_hot.csv'.format(seq_len)

x_train_data_path = 'output/{}/xgboost/data/train.csv'.format(seq_len)
x_valid_data_path = 'output/{}/xgboost/data/val.csv'.format(seq_len)
x_test_data_path = 'output/{}/xgboost/data/test.csv'.format(seq_len)

s3_output_data_dir = 's3://merck-paper-bucket/{}/data'.format(seq_len)

In [105]:
df = pd.read_csv(train_data_path)
print(df.shape)
df.head()

(18000, 903)


Unnamed: 0,index,899,898,897,896,895,894,893,892,891,...,7,6,5,4,3,2,1,0,label,patient_id
0,58,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,hay_fever_N,backache_N,quad_injury_N,quad_injury_N,ACL_tear_N,pneumonia_H,pneumonia_H,cardiac_rehab_U,0,W522K3U1NM
1,583,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,myopia_N,dental_exam_N,peanut_allergy_N,quad_injury_N,ingrown_nail_N,myopia_N,hay_fever_N,tachycardia_H,0,4PY82TXWUI
2,2882,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,eye_exam_N,ACL_tear_N,annual_physical_N,ACL_tear_N,myopia_N,headache_N,foot_pain_N,apnea_H,1,HD8EMMXS9G
3,2159,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,quad_injury_N,ingrown_nail_N,foot_pain_N,eye_exam_N,ACL_tear_N,peanut_allergy_N,hay_fever_N,myopia_N,1,A07ZWN6OWW
4,2317,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,ingrown_nail_N,quad_injury_N,annual_physical_N,peanut_allergy_N,eye_exam_N,eye_exam_N,foot_pain_N,annual_physical_N,1,I37X5VLIV1


In [106]:
# def get_valid_tokens(tokens):
#     """Get all tokens except <pad> and <unk>"""
#     my_tokens = []
#     for key, val in tokens.items():
#         if val>=2:
#             my_tokens.append(key)
#     my_tokens
#     return my_tokens

In [107]:
tokens = lstm_model.vocab._vocab
my_tokens = get_valid_tokens(tokens)
#my_tokens

In [108]:
df.head()

Unnamed: 0,index,899,898,897,896,895,894,893,892,891,...,7,6,5,4,3,2,1,0,label,patient_id
0,58,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,hay_fever_N,backache_N,quad_injury_N,quad_injury_N,ACL_tear_N,pneumonia_H,pneumonia_H,cardiac_rehab_U,0,W522K3U1NM
1,583,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,myopia_N,dental_exam_N,peanut_allergy_N,quad_injury_N,ingrown_nail_N,myopia_N,hay_fever_N,tachycardia_H,0,4PY82TXWUI
2,2882,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,eye_exam_N,ACL_tear_N,annual_physical_N,ACL_tear_N,myopia_N,headache_N,foot_pain_N,apnea_H,1,HD8EMMXS9G
3,2159,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,quad_injury_N,ingrown_nail_N,foot_pain_N,eye_exam_N,ACL_tear_N,peanut_allergy_N,hay_fever_N,myopia_N,1,A07ZWN6OWW
4,2317,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,ingrown_nail_N,quad_injury_N,annual_physical_N,peanut_allergy_N,eye_exam_N,eye_exam_N,foot_pain_N,annual_physical_N,1,I37X5VLIV1


In [109]:
prepare_data(train_data_path, x_train_one_hot_path, x_train_data_path, seq_len, target_colname, my_tokens, s3_output_data_dir)
prepare_data(valid_data_path, x_valid_one_hot_path, x_valid_data_path, seq_len, target_colname, my_tokens, s3_output_data_dir)
prepare_data(test_data_path, x_test_one_hot_path, x_test_data_path, seq_len, target_colname, my_tokens, s3_output_data_dir)

Sucess!
Sucess!
Sucess!


### XGBoost Model Training

In [110]:
BUCKET = 'merck-paper-bucket'
DATA_PREFIX = '{}/data'.format(seq_len)
MODEL_PREFIX = '{}/xgboost/model'.format(seq_len)
label = 'label'

output_results_path = 'output/{}/xgboost/train/train_results.csv'.format(seq_len)
local_model_dir = 'output/{}/xgboost/models/'.format(seq_len)
s3_output_path = 's3://{}/{}/output'.format(BUCKET, MODEL_PREFIX)

###Algorithm config
ALGORITHM = 'xgboost'
REPO_VERSION = '1.2-1'

###Hyperparameter tuning config
TRAIN_INSTANCE_TYPE = 'ml.m5.4xlarge'#'ml.m4.16xlarge'
TRAIN_INSTANCE_COUNT = 1
MAX_PARALLEL_JOBS = 1#4 #TODO: Remove
MAX_TRAIN_JOBS = 1#20

EVALUATION_METRIC = 'auc'
OBJECTIVE = 'binary:logistic'
OBJECTIVE_METRIC_NAME = 'validation:auc'

#Update hyperparameter ranges
# HYPERPARAMETER_RANGES = {'eta': ContinuousParameter(0, 1),
#                         'alpha': ContinuousParameter(0, 2),
#                         'max_depth': IntegerParameter(1, 10)}

HYPERPARAMETER_RANGES = {'eta': ContinuousParameter(0.1, 0.5),
                       'alpha': ContinuousParameter(0, 2),
                       'max_depth': IntegerParameter(1, 10),
                       'gamma': ContinuousParameter(0, 5),
                       'num_round': IntegerParameter(200, 500),
                       'colsample_bylevel': ContinuousParameter(0.1, 1.0),
                       'colsample_bynode': ContinuousParameter(0.1, 1.0),
                       'colsample_bytree': ContinuousParameter(0.5, 1.0),
                       'lambda': ContinuousParameter(0, 1000),
                       'max_delta_step': IntegerParameter(0, 10),
                       'min_child_weight': ContinuousParameter(0, 120),
                       'subsample': ContinuousParameter(0.5, 1.0),
                       }

In [111]:
### SageMaker Initialization
region = boto3.Session().region_name
role = sagemaker.get_execution_role()
smclient = boto3.Session().client('sagemaker')

sess = sagemaker.Session()

container = retrieve(ALGORITHM, region, version=REPO_VERSION)

start = time.time()
print('Training for seq_len={}, label={}...'.format(seq_len, label))
#Prepare the input train & validation data path
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/train'.format(BUCKET, DATA_PREFIX), content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/{}/val'.format(BUCKET, DATA_PREFIX), content_type='csv')

#Class Imbalance
scale_pos_weight = 1.0 # negative/positive

data_channels = {'train': s3_input_train, 'validation': s3_input_validation}

tuner = train_hpo(hyperparameter_ranges=HYPERPARAMETER_RANGES, 
                  container=container, 
                  execution_role=role, 
                  instance_count=TRAIN_INSTANCE_COUNT, 
                  instance_type=TRAIN_INSTANCE_TYPE, 
                  output_path=s3_output_path, 
                  sagemaker_session=sess, 
                  eval_metric=EVALUATION_METRIC, 
                  objective=OBJECTIVE, 
                  objective_metric_name=OBJECTIVE_METRIC_NAME, 
                  max_train_jobs=MAX_TRAIN_JOBS, 
                  max_parallel_jobs=MAX_PARALLEL_JOBS, 
                  scale_pos_weight=scale_pos_weight, 
                  data_channels=data_channels)

#Get the hyperparameter tuner status at regular interval
val_auc, best_model_path = get_tuner_status_and_result_until_completion(tuner, seq_len, label)

result = [label, seq_len, val_auc, best_model_path]
training_results = [result]

print('Success! Total training time={} mins.'.format((time.time()-start)/60.0))
#Save the results to file
df_results = pd.DataFrame(training_results, columns=['class', 'seq_len', 'val_auc', 'best_model_path'])

if not os.path.isdir(os.path.split(output_results_path)[0]):
    os.makedirs(os.path.split(output_results_path)[0])

df_results.to_csv(output_results_path, index=False)
print('ALL SUCCESS!')

Training for seq_len=900, label=label...
.............................................!
Total jobs completed: 1
Metric: validation:auc
Best AUC: 0.8989
Success! Total training time=3.8267794847488403 mins.
ALL SUCCESS!
