# DeepLabeler

In this notebook we will train the DeepLabeler architecture.

In [None]:
from torch.utils.data import Dataset, TensorDataset
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import torch.nn as nn
from gensim.models import Word2Vec
import gensim
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
tqdm.pandas()
from functools import partial
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
import matplotlib.pyplot as plt

from datetime import datetime

# Wall-clock reference point; later cells subtract it from datetime.now() to report elapsed time.
start_time = datetime.now()

# Let cuDNN auto-tune its convolution algorithms for the input shapes it encounters.
torch.backends.cudnn.benchmark = True
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

## Load in data and pretrained models

In [None]:
# Read the prepared dataset from disk; later cells use its 'toks' and 'ICD9_CODE' columns.
data = pd.read_parquet('prepared-data.pq')
data.head()

In [None]:
# Smallest number of tokens found in any single document.
data['toks'].apply(len).min()

Load in word2vec model

In [None]:
# Load the previously trained Word2Vec model from disk.
w2v = Word2Vec.load('word2vec.model')

Converting the word vectors to tensors once, up front, instead of inside the collate function saved a lot of time.

In [None]:
# Word -> embedding tensor lookup, built once so the collate function only has to do dict lookups.
vecs = dict(
    (word, torch.tensor(vec, device=device, dtype=torch.float))
    for word, vec in zip(w2v.wv.index_to_key, w2v.wv.vectors)
)

Define the dataset

In [None]:
class CustomDataset(Dataset):
    """Pairs each document's token list with its multi-hot label vector."""

    def __init__(self, data):
        """
        Arguments:
            data: frame-like object with an 'ICD9_CODE' column (one collection of codes per row)
                  and a 'toks' column (one token sequence per row)
        """
        self.mlb = MultiLabelBinarizer()
        code_lists = data['ICD9_CODE'].to_list()
        label_matrix = np.vstack(self.mlb.fit_transform(code_lists))
        # The label matrix is materialised once, directly on the module-level `device`.
        self.y = torch.tensor(label_matrix, dtype=torch.float, device=device)
        self.W = data['toks'].to_list()

    def __len__(self):
        """Number of samples, i.e. the number of rows of the label matrix."""
        return self.y.size(0)

    def __getitem__(self, index):
        """Return the (tokens, label vector) pair stored at `index`."""
        tokens = self.W[index]
        labels = self.y[index]
        return tokens, labels

dataset = CustomDataset(data)

In [None]:
# Sanity check: the label vector of the first sample has one entry per class known to the binarizer.
assert dataset[0][-1].shape[0] == len(dataset.mlb.classes_)

Define the collate function

In [None]:
def collate(w2v, max_token_length, data):
    """
    Build the padded word-vector matrix and the stacked label matrix for one batch.

    Word vectors are looked up in the module-level `vecs` dict; words missing from it
    are skipped. Each document keeps at most `max_token_length` known words and is
    zero-padded at the end. Tensors are created on the module-level `device`.

    Arguments:
        w2v: Pre-trained Word2Vec model (only `vector_size` is read here)
        max_token_length: Maximum number of word vectors to keep per document
        data: batch data from the loader, a sequence of (tokens, label vector) pairs

    Returns:
        word_matrix (batch_size, max_token_length, w2v.vector_size), y (batch_size, number of labels)
    """
    W, y = zip(*data)
    y = torch.vstack(y)

    word_matrix = torch.zeros((len(W), max_token_length, w2v.vector_size), device=device)
    for i, sentence in enumerate(W):
        rows = []
        for word in sentence:
            # Honour the caller's limit instead of a hard-coded 700, so the slice
            # assignment below always fits inside word_matrix.
            if len(rows) >= max_token_length:
                break
            vec = vecs.get(word)
            if vec is not None:
                rows.append(vec)
        # torch.vstack rejects an empty list; a document with no known words
        # simply stays all-zero.
        if rows:
            word_matrix[i, :len(rows)] = torch.vstack(rows)

    return word_matrix, y

In [None]:
# I used this cell to test my preprocessing script
# I found that minimizing copying tensors over (even in gpu) saves time

# # %%timeit
# W, D, y = dataset[:10]

# word_matrix = torch.zeros((len(W), max_token_length, w2v.vector_size), device=device)
# # D = torch.vstack(D)
# for i, sentence in enumerate(W):
#     j = 0
#     l = []
#     for word in sentence:
#         if j < 700:
#             if vecs.get(word) is not None:
#                 # word_matrix[i, j] = vecs[word]
#                 l.append(vecs[word])
#                 j += 1
#         else:
#             break
#     word_matrix[i, :len(l)] = torch.vstack(l)

Create the train/validation split and the data loaders

In [None]:
# 80/20 train/validation split. The generator is seeded so the same documents land in
# the validation set on every run, which keeps reported scores comparable across runs.
split_seed = 42
split = int(len(dataset)*0.8)
lengths = [split, len(dataset) - split]
train_dataset, val_dataset = random_split(
    dataset, lengths, generator=torch.Generator().manual_seed(split_seed)
)

# Hyperparameters
batch_size = 64
max_token_length = 700
# Taken from the loaded Word2Vec model so the network's input width always matches
# the width of the matrices produced by the collate function.
embedding_size = w2v.vector_size
output_size = len(dataset.mlb.classes_)

# Bind the model and length limit so the loaders can call collate_fn(batch).
collate_fn = partial(collate, w2v, max_token_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=0)

In [None]:
# I used this cell for testing the time to loop, it eventually took 30s to loop through the whole dataset
# for W, D, y in tqdm(train_loader):
#     pass

Calculate the convolution output size (used below as the pooling kernel size)

In [None]:
def conv_output_volume(W, F, S, P):
    """
    Output size along one dimension after sliding a kernel over a padded input.

    Arguments:
        W: input size along the dimension
        F: kernel/filter size
        S: stride
        P: amount of zero padding added on each border

    Returns:
        (W - F + 2P) floor-divided by S, plus 1
    """
    padded_extent = W + 2 * P
    full_strides = (padded_extent - F) // S
    return full_strides + 1

## Define the Model

In [None]:
class DeepLabeler(nn.Module):
    """
    Three parallel convolutions (3x3, 4x4, 5x5) over the word-vector matrix, each
    max-pooled down to one value per channel, concatenated and fed to a linear layer
    with a sigmoid output (one probability per label).
    """

    def __init__(self, embedding_size, max_token_length, output_size):
        """
        Arguments:
            embedding_size: width of each word vector (last input dimension)
            max_token_length: number of word vectors per document (second input dimension)
            output_size: number of labels to predict
        """
        super().__init__()
        self.embedding_size = embedding_size
        self.max_token_length = max_token_length
        self.output_size = output_size

        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(3,3))
        self.cnn2 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(4,4))
        self.cnn3 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(5,5))
        # Each pooling window spans the whole feature map of its convolution, so every
        # channel is reduced to a single value.
        self.pool1 = nn.MaxPool2d((conv_output_volume(self.max_token_length, 3, 1, 0), conv_output_volume(self.embedding_size, 3, 1, 0)))
        self.pool2 = nn.MaxPool2d((conv_output_volume(self.max_token_length, 4, 1, 0), conv_output_volume(self.embedding_size, 4, 1, 0)))
        self.pool3 = nn.MaxPool2d((conv_output_volume(self.max_token_length, 5, 1, 0), conv_output_volume(self.embedding_size, 5, 1, 0)))
        self.dropout = nn.Dropout(p=0.75)
        self.fc1 = nn.Linear(64*3, output_size)


    def forward(self, W: torch.Tensor):
        """
        Arguments:
            W: word-vector matrices, shape (batch, max_token_length, embedding_size)

        Returns:
            Per-label probabilities, shape (batch, output_size)
        """
        # Add the single input channel expected by Conv2d.
        W = W.unsqueeze(dim=1)
        out1 = self.pool1(self.dropout(torch.relu(self.cnn1(W))))
        out2 = self.pool2(self.dropout(torch.relu(self.cnn2(W))))
        out3 = self.pool3(self.dropout(torch.relu(self.cnn3(W))))

        # flatten(start_dim=1) drops only the trailing 1x1 spatial dims. A bare squeeze()
        # would also drop the batch dim when the batch holds a single sample, which
        # breaks the concatenation along dim=1.
        W_embeddings = torch.cat(
            (out1.flatten(start_dim=1), out2.flatten(start_dim=1), out3.flatten(start_dim=1)),
            dim=1,
        )

        return torch.sigmoid(self.fc1(W_embeddings))

# Instantiate the network from the notebook-level hyperparameters and display its layers.
model = DeepLabeler(
    embedding_size=embedding_size,
    max_token_length=max_token_length,
    output_size=output_size,
)
model

In [None]:
# Total number of scalar weights that will receive gradients during training.
pytorch_total_params = sum(
    map(lambda p: p.numel(), filter(lambda p: p.requires_grad, model.parameters()))
)
pytorch_total_params

## Setup Loss and Optimizer

In [None]:
# Move the model to the selected device before building the optimizer, as the PyTorch
# optimizer docs recommend. Using `device` rather than .cuda() keeps the CPU fallback
# chosen at the top of the notebook working.
model.to(device)

# Binary cross-entropy on the sigmoid outputs: each label is an independent yes/no decision.
criterion = nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model


## Train Model

In [None]:
n_epoch = 5
epoch_loss = []  # mean training loss of each completed epoch

for epoch in range(n_epoch):
    # Ensure dropout is active even if the model was put in eval mode earlier in the session.
    model.train()
    curr_epoch_loss = []
    for W, y in tqdm(train_loader):
        # clear gradients accumulated by the previous step
        optimizer.zero_grad()

        # forward pass
        y_hat = model(W)

        # calculate loss
        loss = criterion(y_hat, y)

        # backward pass
        loss.backward()

        # optimizer step
        optimizer.step()

        # .item() yields a plain Python float, detached from the autograd graph
        curr_epoch_loss.append(loss.item())

    mean_epoch_loss = np.mean(curr_epoch_loss)
    print(f"Epoch {epoch}: curr_epoch_loss={mean_epoch_loss}")
    epoch_loss.append(mean_epoch_loss)

In [None]:
# Training-loss curve, one point per epoch.
plt.figure()
plt.title("Epoch Loss")
# The label gives plt.legend() an artist to show; without one it warns and draws nothing.
plt.plot(epoch_loss, label='Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.show()

In [None]:
# Persist only the learned weights (state dict); the evaluation section reloads them into a fresh model.
torch.save(model.state_dict(), 'model-no-d2v.pt')

In [None]:
# Elapsed wall-clock time since start_time was recorded in the first code cell.
end = datetime.now()
total_time = end - start_time
total_time

# Evaluate Model

In [None]:
# Rebuild the architecture and restore the saved weights. `device` and map_location
# (instead of a hard-coded .cuda()) let the checkpoint load on a CPU-only machine too.
model2 = DeepLabeler(embedding_size, max_token_length, output_size)
model2.to(device)
model2.load_state_dict(torch.load('model-no-d2v.pt', map_location=device))

In [None]:
# Run the reloaded model over the validation loader with dropout disabled, and gather
# ground truth (Y) and predicted probabilities (Y_hat) as NumPy matrices, one row per sample.
model2.eval()
y_list, y_hat_list = [], []
with torch.no_grad():
    for W, y in tqdm(val_loader):
        y_hat = model2(W).detach().cpu().numpy()
        y_hat_list.append(y_hat)
        y_list.append(y.detach().cpu().numpy())
Y, Y_hat = np.vstack(y_list), np.vstack(y_hat_list)

## Evaluate at different thresholds

In [None]:
# Sweep the decision threshold from 0.0 to 0.9 and record micro-averaged metrics for each value.
# Note: this rebinds `data`, which held the input DataFrame earlier in the notebook.
thresholds = np.arange(0, 1, 0.1)
data = []
for threshold in tqdm(thresholds):
    # 1 where the predicted probability exceeds the threshold, else 0
    y_pred_cls = np.where(Y_hat > threshold, 1, 0)
    precision, recall, fscore, support = precision_recall_fscore_support(
        Y, y_pred_cls, average='micro'
    )
    row = (threshold, precision, recall, fscore, support)
    data.append(row)

## Store Results and Create Plots

In [None]:
# One row per threshold; `data` here is the list of result tuples built in the threshold sweep.
df = pd.DataFrame(data, columns=['threshold', 'precision', 'recall', 'fscore', 'support'])
df.head()

In [None]:
# Row of the results table with the highest F-score.
df.iloc[df['fscore'].argmax()]

In [None]:
# Plot F-score, recall and precision against the threshold, then save the figure to disk and show it.
plt.figure()
plt.title("DeepLabeler Minus Doc2Vec F-score Curve")
for _column, _label in (('fscore', 'F-Score'), ('recall', 'Recall'), ('precision', 'Precision')):
    plt.plot(df['threshold'], df[_column], label=_label)
plt.xlabel('Threshold Cutoff')
plt.ylabel('Metric')
plt.legend(loc='upper right')
plt.savefig('deeplabeler-fscore-no-d2v.png')
plt.show()

In [None]:
# Write the per-threshold metrics table to CSV.
df.to_csv('deeplabeler-scores-no-d2v.csv')

In [None]:
# Debugging aid: report the Python / PyTorch / CUDA / cuDNN versions and the visible CUDA devices.
# NOTE(review): the torch.cuda.current_device() calls below presumably need a working CUDA setup — confirm before running on a CPU-only machine.
import torch
import sys
print('__Python VERSION:', sys.version)
print('__pyTorch VERSION:', torch.__version__)
# Header only; the version text itself is printed by the nvcc shell command below.
print('__CUDA VERSION', )
from subprocess import call
# call(["nvcc", "--version"]) does not work
! nvcc --version
print('__CUDNN VERSION:', torch.backends.cudnn.version())
print('__Number CUDA Devices:', torch.cuda.device_count())
print('__Devices')
# call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
print('Active CUDA Device: GPU', torch.cuda.current_device())
print ('Available devices ', torch.cuda.device_count())
print ('Current cuda device ', torch.cuda.current_device())

In [None]:
# Elapsed wall-clock time for the whole notebook run, measured from start_time.
end = datetime.now()
total_time = end - start_time
total_time