# Readmission Modeling: Script 2

In [1]:
import torch 
import pandas as pd
import numpy as np
import torch.nn.functional as F
from torch import nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import Dataset, DataLoader
import boto3
import pickle
from sklearn.metrics import roc_auc_score
import matplotlib
import matplotlib.pyplot as plt
import sys
from torchvision import transforms
import math
import torch.optim as optim
import time
from sklearn.model_selection import StratifiedKFold
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.autograd import Variable

**Verify that GPUs are availble, will use all**

In [2]:
print(torch.cuda.device_count())
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

8
cuda


## Transformer model definition

In [3]:
class PositionalEncoding(nn.Module):
    '''Positional encoding to be used in transfromer model class
    '''
    def __init__(self, d_model, seq_length, dim, dropout=0.1, max_len=5000):
        '''
        Args:
            d_model: embedding size
            seq_length: length of events
            dim: num_events possible per day
        '''
        super(PositionalEncoding, self).__init__()
        
        self.dropout = nn.Dropout(p=dropout)

        # initialize positional encoding
        pe = torch.zeros(max_len, d_model)
        
        position = torch.tensor(
            [float(i) for i in range(dim)] * seq_length + 
            [0.0] * (max_len - seq_length * dim), 
            dtype=torch.float).unsqueeze(1)
        
        
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        
        # standard positional encoding
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [31]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, num_classes, seq_length=366, num_events=10, dropout=0.5):  
        '''
        Initialize a transformer model for hospital readmissions. The model consists of the following:
        - Transformer encoder layers
        - Single 1D CNN layer
        - Final fully connected layer to determine probability of readmissions
        
        Args:
            
            ntoken: number of tokens in embedding layer (vocabulary size)
            ninp: embedding dimension (number of inputs)
            
            nhead: number of heads in transformers
            nhid: number of transformer linear dimensions
            
            nlayers: number of layers in transfromer
            
            num_classes: number of classes to predict (in this case, binary)
            
            seq_length: length of sequence in batched data
            num_events: maximum number of events per day
            
            dropout: strength of regularization
        '''
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        print("parameters: embsize:{}, nhead:{}, nhid:{}, nlayers:{}, dropout:{}".format(
            ninp, nhead, nhid, nlayers, dropout))
        
        # Inputs into transformer: positional encoding and embeddings
        self.pos_encoder = PositionalEncoding(ninp, seq_length, num_events, dropout)
        self.seq_emb = nn.Embedding(ntoken, ninp)
        
        # Transformer layer
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        
        # CNN & fully connected layers
        self.ff = nn.Linear(int(seq_length) * num_events, int(seq_length))
        self.fc = nn.Linear(int(seq_length), num_classes)
        self.softmax = nn.Softmax(-1)
        self.Conv1d = nn.Conv1d(ninp, 1, 1, stride=1)
        
        # record
        self.ninp = ninp
        self.dropout = dropout
        self.num_events= num_events
        self.num_classes = num_classes
        
        # initalize weights
        self.init_weights()
    
    def seq_embedding(self, seq): 
        '''Convert the sequence of events into embedding vectors, into single row per observation'''
        batch, length_seq, dim = seq.size()
        seq = seq.contiguous().view(batch * length_seq, dim)
        
        seq = self.seq_emb(seq)
        
        seq = seq.contiguous().view(batch, -1, self.ninp)
        
        return seq


    def init_weights(self):
        '''Initialize weights in embedding and fully connected layers'''
        initrange = 0.1
        
        self.seq_emb.weight.data.uniform_(-initrange, initrange)
        
        self.fc.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)
        
        self.ff.bias.data.zero_()
        self.ff.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, mask=None): 
        '''
        Forward propagation steps:
        - convert events into embedding vectors & positional encoding
        - transformer encoder layers
        - CNN layer
        - final 
        '''        
        # create mask to remove padded entries from calculations for interpretability
        if mask is not None:
            mask= mask.view(mask.size()[0], -1)
            src_mask = (mask == 0)
            src_mask = src_mask.view(src_mask.size()[0], -1)
            out_mask = mask.float().masked_fill(mask == 0.0, float(-100.0)).masked_fill(mask == 1.0, float(0.0)).view(mask.size()[0], -1)
               
        src = self.seq_embedding(src).transpose(0,1) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        
        trans_output = self.transformer_encoder(src, src_key_padding_mask=src_mask).transpose(0, 1).transpose(1,2)
        
        final_feature_map = self.Conv1d(trans_output).squeeze()
        
        #if out_mask is not None:
            # extract normalized feature importances per prediction
        importance_out = self.softmax(final_feature_map + out_mask)
        
        
        output = self.ff(final_feature_map)
        output = self.fc(output)
        
        # ensure no accidental additional dimensions
        if len(output.size()) != 2:
            output = output.view(1, 2)
        
        return output, importance_out

## Dataset and data loader functions to create batches

In [5]:
class BuildDataset(Dataset):
    '''
    Read in dataset, if data is already split into train, test, and/or validation sets.
    
    ProcessData: extract input, labels, mask from an existing Python object (via pickle or otherwise)
    ReadNewData: extract data directly from file.
    '''
    def __init__(self, data_file, seq_length=366, event_length=5, data_list=None, mode='read'): 
        '''mode: 'read' will process data'''
        if mode != 'read' and data_list != None:
            self.data, self.label, self.mask = self.ProcessData(data_list, seq_length, event_length)
        else:
            self.data, self.label, self.mask = self.ReadNewData(data_file, seq_length, event_length)
            
    def ReadNewData(self, file_dir, seq_length, event_length):
        # file needs to be in binary format, pickled from script #1
        with open(file_dir, 'rb') as f:
            data, label, mask = pickle.load(f)
            cut_data = data[:,-seq_length:,:event_length]
            cut_mask = mask[:,-seq_length:,:event_length]
            label = label.astype(int)
        return cut_data, label, cut_mask
    
    def ProcessData(self, data_list, seq_length, event_length):
        input_data, labels, mask = (data_list[0][:,-seq_length:,:event_length], 
                                    data_list[1], 
                                    data_list[2][:,-seq_length:,:event_length])
        return input_data, labels, mask
    
    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return torch.tensor(self.data[idx]), torch.tensor(self.label[idx]), torch.tensor(self.mask[idx])

In [6]:
def ReadData(file_dir, seq_length, event_length):
    '''
    Function to read is specific number of recent events from specific number of days
    
    Used to read a large dataset, and script will apply kfold instead.
    '''
    with open(file_dir, 'rb') as f:
        ids, data, label, mask = pickle.load(f)
        ids = ids.astype(str)
        cut_data = data[:,-seq_length:,:event_length]
        cut_mask = mask[:,-seq_length:,:event_length]
        label = label.astype(int)
        
    return ids, cut_data, label, mask

## Training and Evaluation functions

In [7]:
def EopochTrain(model, dataloader, optimizer, criterion, device=DEVICE, metric='acc'):
    '''
    Model training, called by ModelProcess function
    
    Note: Does not return prediction importance scores
    '''
    epoch_loss = 0
    epoch_metric = 0
    
    model.train()
    
    # initialize lists to compare predictions & ground truth labels for metric calculation
    order_labels = []
    prediction_scores = []
    
    for idx, [seq, labels, mask] in enumerate(dataloader):
        optimizer.zero_grad()
        
        labels = labels.squeeze().long()
        seq, labels, mask = seq.cuda(), labels.cuda(), mask.cuda()
        
        predictions, _ = model(seq, mask=mask)
        
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        
        order_labels.extend(labels.cpu().numpy())
        prediction_scores.extend(F.softmax(predictions, dim=-1).detach().cpu().numpy()[:,1])
        
        epoch_loss += loss.item()
    
    # calculate results
    if metric == 'acc':
        epoch_metric = get_average_accuracy(prediction_scores, order_labels)
    elif metric == 'auc':
        epoch_metric = roc_auc_score(order_labels, prediction_scores)
    
    return epoch_loss / len(dataloader), epoch_metric

In [8]:
def EopochVal(model, dataloader, optimizer, criterion, device=DEVICE, metric='auc'):
    '''
    Evaluate model performance, called by ModelProcess function
    
    Returns predictions, metrics and importance scores
    '''
    epoch_loss = 0
    epoch_metric = 0
    
    model.eval()
    
    # initialize lists to compare predictions & ground truth labels
    # and extract importance scores for prediction
    order_labels = []
    prediction_scores = []
    events = []
    important_scores = []
    
    for idx, [seq, labels, mask] in enumerate(dataloader):
        # data formatting/loading
        labels = labels.squeeze().long()
        events.extend(seq.view(seq.size()[0], -1).squeeze().numpy())
        seq, labels, mask = seq.cuda(), labels.cuda(), mask.cuda()
        
        predictions, importance = model(seq, mask=mask)
        
        loss = criterion(predictions, labels)
        
        important_scores.extend(importance.detach().cpu().numpy())
        order_labels.extend(labels.cpu().numpy())
        prediction_scores.extend(F.softmax(predictions, dim=-1).detach().cpu().numpy()[:,1])
        
        epoch_loss += loss.item()
        
    if metric == 'acc':
        epoch_metric = get_average_accuracy(prediction_scores, order_labels)
    elif metric == 'auc':
        epoch_metric = roc_auc_score(order_labels, prediction_scores)
        
    return epoch_loss / len(dataloader), epoch_metric, [order_labels, events, important_scores, prediction_scores]

In [9]:
def TrainingProcess(model, epoch, dataloaders:list, device=DEVICE, metric='auc'):
    '''
    Main function to call for model training.
    
    Must have at least training & test dataloaders
    
    Args:
        model: instantiation of model to be trained
        epoch: total number of epochs to train
        dataloaders: at least training & test dataloader, validation dataloader optional
        device: cpu or gpu
        metric: auc or accuracy (acc)
    
    Returns:
        tuple containing:
            (training metrics, val metrics, test_metrics, feature importances evaluated on test data)
    '''
    criterion = nn.CrossEntropyLoss().cuda() if device != 'cpu' else nn.CrossEntropyLoss()
    
    lr = 0.001
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

    print("device: ", device)
    
    # initialize lists for documenting training performance
    pre_test_metric = 0.0
    train_loss=[]
    train_metric=[]
    val_loss=[]
    val_metric=[]
    test_loss=[]
    test_metric=[]
    
    for i in range(epoch):
        print('-' * 10)
        print('Epoch {}/{}'.format(i+1, epoch))
        print('-' * 10)
        
        epoch_train_loss, epoch_train_metric = EopochTrain(
            model, dataloaders[0], optimizer, criterion, device=DEVICE, metric=metric)
        train_metric.append(epoch_train_metric)
        print('epoch_train_loss:',np.mean(epoch_train_loss),
              'epoch_train_metric:', np.mean(epoch_train_metric))
        
        # validation also provided
        if len(dataloaders) > 2:
            epoch_val_loss, epoch_val_metric, val_results = EopochVal(
                model, dataloaders[1], optimizer, criterion, device=DEVICE, metric=metric)
            
            torch.save(model.module.state_dict(), 
                       './model_weights/emsize-{}_head-{}_layers-{}_epoch-{}_valauc-{}.pth'.format(
                           emsize, nhead, nlayers, str(i), np.round(epoch_val_metric, decimals=3)))
            
            val_loss.append(epoch_val_loss)
            val_metric.append(epoch_val_metric)
            
            print('epoch_val_loss:',np.mean(epoch_val_loss),
                  'epoch_val_metric:', np.mean(epoch_val_metric))
            
        # predictions on test data 
        epoch_test_loss, epoch_test_metric, importance_results = EopochVal(
            model, dataloaders[-1], optimizer, criterion, device=DEVICE, metric=metric)
        test_loss.append(epoch_test_loss)
        test_metric.append(epoch_test_metric)
        print('epoch_test_loss:',np.mean(epoch_test_loss),
              'epoch_test_metric:', np.mean(epoch_test_metric))
        
        if epoch_test_metric > pre_test_metric:
            print("updated")
            pre_test_metric = epoch_test_metric
            final_importance_results = importance_results
            torch.save(model.module.state_dict(), 
                       './model_weights/emsize-{}_head-{}_layers-{}_epoch-{}_auc-{}.pth'.format(
                           emsize, nhead, nlayers, str(i), np.round(epoch_test_metric, decimals=3)))
        
        scheduler.step()
        
    return train_metric, val_metric, test_metric, final_importance_results

## Load data & vocabulary

In [32]:
whole_ids, whole_data, whole_labels, whole_mask = ReadData('data/np_re_last30_non3digit_latest.pkl', 
                                                           seq_length=30, 
                                                           event_length=30)
  
print('whole data is done')

whole data is done


In [33]:
whole_mask.shape

(1562223, 30, 30)

In [12]:
vocab = torch.load('data/pos_vocab_last30_non3digit')
vocab.itos[52]

'd_0389'

## Model training & evaluation

In [34]:
count = 0
train_batch_size = 1280
test_batch_size = 1280

ntokens = len(vocab.stoi) # the size of vocabulary
emsize = 8 # embedding dimension
nhid = 16 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 1 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 1 # the number of heads in the multiheadattention models
dropout = 0.1 # the dropout value
n_class = 2


In [36]:

# split data for 5 fold cross validation
skf = StratifiedKFold(n_splits=5, random_state=9999)
splits = list(skf.split(whole_data, whole_labels))

test_metrics = []
results = []
for train_index, test_index in splits:
    ids = whole_ids[test_index]
    torch.save(ids, 'data/discharge_ids_test_' + str(count))
    X_train, y_train, mask_train = whole_data[train_index], whole_labels[train_index], whole_mask[train_index]
    X_test, y_test, mask_test = whole_data[test_index], whole_labels[test_index], whole_mask[test_index]


    # datasets
    train_dataset = BuildDataset('', seq_length=30, event_length=30, data_list=[X_train, y_train, mask_train], mode='load')
    test_dataset = BuildDataset('', seq_length=30, event_length=30, data_list=[X_test, y_test, mask_test], mode='load')
    train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, num_workers=4)
    test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False, num_workers=4)

    # create model
    model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, n_class, seq_length=30, num_events=30, dropout=dropout).to(DEVICE)
    if torch.cuda.is_available():
        model = model.cuda()
    if torch.cuda.device_count()>1:
        model = nn.DataParallel(model)

    # training
    epoch=6
    train_metric, val_metric, test_metric, final_importance_results = TrainingProcess(
        model, epoch, [train_dataloader, test_dataloader])

    # save data
    final_importance_results.insert(0, ids[:len(final_importance_results[0])].tolist())
    test_metrics.append(np.mean(test_metric))
    results.append(final_importance_results)
    torch.save(final_importance_results, 'data/explain/final_importance_results_kfold' + str(count))
    #break
    count += 1


# final output result    
print(test_metrics, np.mean(test_metrics))


parameters: embsize:8, nhead:1, nhid:16, nlayers:1, dropout:0.1
device:  cuda
----------
Epoch 1/6
----------
epoch_train_loss: 0.4066599204816008 epoch_train_metric: 0.6407312776102858
epoch_test_loss: 0.40719436029998624 epoch_test_metric: 0.663379431053796
updated
----------
Epoch 2/6
----------
epoch_train_loss: 0.39838327590982403 epoch_train_metric: 0.6702953738264921
epoch_test_loss: 0.4057706110331477 epoch_test_metric: 0.6654229397910136
updated
----------
Epoch 3/6
----------
epoch_train_loss: 0.39644428733793474 epoch_train_metric: 0.6767852635864537
epoch_test_loss: 0.4056453086891953 epoch_test_metric: 0.6645787184083483
----------
Epoch 4/6
----------
epoch_train_loss: 0.39492144704841176 epoch_train_metric: 0.681439028781885
epoch_test_loss: 0.4069599960531507 epoch_test_metric: 0.6631115803219891
----------
Epoch 5/6
----------
epoch_train_loss: 0.39377244818906093 epoch_train_metric: 0.6849433009579863
epoch_test_loss: 0.40850276740229857 epoch_test_metric: 0.663167621

## Manual checking results and uploading data back to S3

In [239]:
#id, label, event, importance, probability
import random
row = random.randint(0, len(results[-1][2]))
row = 0
indices = np.nonzero(results[-1][2][row])
print(indices)
arg_list = np.argsort(results[-1][3][row])[::-1]
print(results[-1][1][row], np.array(results[-1][3][row])[arg_list[:50]], results[-1][4][row])
for i, v in enumerate(arg_list[:50]):
    if v not in indices[0]:
        print(i, v, vocab.itos[results[-1][2][row][v]])

(array([  0,  30,  60,  90, 120, 150, 180, 210, 240, 270, 300, 330, 360,
       390, 420, 450, 480, 481, 482, 510, 540, 570, 600, 630, 660, 690,
       691, 692, 693, 694, 695, 720, 750, 780, 781, 782, 783, 784, 785,
       786, 787, 788, 789, 810, 811, 812, 813, 840, 841, 842, 843, 844,
       845, 846, 847, 848, 849, 850, 870, 871, 872]),)
1 [0.14690582 0.11748163 0.10799016 0.10100966 0.07234918 0.06466733
 0.0642831  0.06264017 0.05430687 0.05067316 0.03408067 0.02019857
 0.0186831  0.01547983 0.01340081 0.00656008 0.00517903 0.00486624
 0.00425511 0.00425511 0.00378504 0.002817   0.0013862  0.00106932
 0.0007394  0.00069025 0.0003895  0.0003895  0.0003895  0.0003895
 0.0003895  0.0003895  0.0003895  0.0003895  0.0003895  0.0003895
 0.0003895  0.0003895  0.0003895  0.0003895  0.0003895  0.0003895
 0.0003895  0.0003895  0.0003895  0.0003895  0.0003895  0.0003895
 0.0003895  0.0003895 ] 0.17865418


In [26]:
#np.argwhere(np.array(results[-1][4]) > 0.5)
print(len(results))
for i in results[-1][4]:
    if i > 0.5:
        print(i)
print(np.count_nonzero(results[-1][1]))

1
0.51421535
0.5024973
0.52611023
0.5035754
0.5024492
0.5106884
0.5038529
0.50124544
46799


**Double check to make sure all the lengths agree**

In [62]:
print(len(final_importance_results[0]), len(final_importance_results[1]), len(final_importance_results[2]), len(final_importance_results[3]), len(final_importance_results[4]))

312300 312300 312300 312300 312300


In [118]:
#torch.save(final_importance_results, 'data/explainability')
! aws s3 cp Readmit_Transformer_whole.ipynb s3://cmsai-mrk-amzn/xianzeng/ lcheong/explain_transformer.ipynb .
!aws s3 cp data/pos_vocab_late30_non3digit s3://cmsai-mrk-amzn/xianzeng/
!aws s3 cp data/explain/final_importance_results_whole s3://cmsai-mrk-amzn/xianzeng/explain/explainablity #--recursive
! aws s3 cp data/discharge_ids_test_0 s3://cmsai-mrk-amzn/xianzeng/ 


The user-provided path data/pos_vocab_late30_non3digit does not exist.


In [37]:
import metrics

In [40]:
labels, scores = np.array(results[-3][1]), np.array(results[-3][4])
df = metrics.compute_metrics(labels, scores, 0.5, target_names=None, risk_list=[0.5, 1, 2, 5])
df.head()

Unnamed: 0,auroc,avgpr,precis,precis 0.5%,precis 1%,precis 2%,precis 5%,recall,recall 0.5%,recall 1%,recall 2%,recall 5%,calibration
0,0.649582,0.23218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018027


In [3]:
! aws s3 cp Readmit_Transformer_whole_commented.ipynb s3://cmsai-mrk-amzn/xianzeng/

upload: ./Readmit_Transformer_whole_commented.ipynb to s3://cmsai-mrk-amzn/xianzeng/Readmit_Transformer_whole_commented.ipynb
