In [1]:
from __future__ import print_function
from __future__ import division
import os
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gram_helpers import train_model, calculate_dimSize, get_rootCode, pad_matrix, build_tree
from gram_module import GRAM as gram_model

In [2]:
# --- Input artefacts -------------------------------------------------------
# Common prefix of the per-level hierarchy pickles ('<prefix>.level<N>.pk').
tree_file = 'outputs/mimic'
# Pickled input sequences and their label sequences.
seq_file = 'outputs/mimic.seqs'
label_file = 'outputs/mimic.3digitICD9.seqs'

# --- Model dimensions ------------------------------------------------------
embd_dim_size = 100
attn_dim_size = 100
rnn_dim_size = 100

# --- Device selection ------------------------------------------------------
# Prefer GPU index 2 (the original hard-coded choice), but only when that
# device actually exists. On hosts with fewer GPUs fall back to cuda:0, and on
# hosts without CUDA fall back to the CPU, instead of failing later at .to().
preferred_gpu_index = 2
if torch.cuda.is_available():
    if torch.cuda.device_count() > preferred_gpu_index:
        device = torch.device('cuda:{}'.format(preferred_gpu_index))
    else:
        device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# --- Training schedule -----------------------------------------------------
# Batch size for training (change depending on how much memory you have)
batch_size = 100

# Number of epochs to train for
num_epochs = 10

In [3]:
# Size of the input code space and of the label space, as reported by
# calculate_dimSize for the sequence file and the label file respectively.
num_leaves, num_classes = (
    calculate_dimSize(path) for path in (seq_file, label_file)
)

# Number of ancestor codes: root code from the level-2 hierarchy file, minus
# the leaf count, plus one.
# NOTE(review): presumably ancestor ids follow the leaf ids and the root has
# the largest id -- confirm against gram_helpers.get_rootCode.
num_ancestors = get_rootCode('{}.level2.pk'.format(tree_file)) - num_leaves + 1

In [4]:
# Collect the (leaves, ancestors) pairs of every hierarchy level, walking the
# level files from 5 down to 1 and concatenating them in that order.
leaves_list = []
ancestors_list = []
for i in range(5, 0, -1):
    leaves, ancestors = build_tree(tree_file + '.level' + str(i) + '.pk')
    leaves_list.extend(leaves)
    ancestors_list.extend(ancestors)

# Load the pickled sequences and labels. Context managers make sure the file
# handles are closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(seq_file, 'rb') as seq_handle:
    seqs = pickle.load(seq_handle)
with open(label_file, 'rb') as label_handle:
    labels = pickle.load(label_handle)

In [5]:
# Build the GRAM network from the hierarchy lists and dimension settings, then
# move it onto the selected device in the same expression.
model = gram_model(
    leaves_list, ancestors_list,
    num_leaves, num_ancestors,
    embd_dim_size, attn_dim_size, rnn_dim_size,
    num_classes, device,
).to(device)

In [6]:
# Bare expression: the notebook displays the module's repr (its layer tree).
model

GRAM(
  (embed_init): Embedding(5623, 100)
  (dag_attention): DAGAttention(
    (linear1): Linear(in_features=200, out_features=100, bias=True)
    (linear2): Linear(in_features=100, out_features=1, bias=True)
  )
  (gru): GRUNet(
    (gru): GRU(100, 100, num_layers=2, batch_first=True, dropout=0.2)
    (fc): Linear(in_features=100, out_features=942, bias=True)
  )
)

In [7]:
# model.parameters() returns a one-shot generator. Materialise it as a list so
# the optimizer cell can be re-run on its own without being handed an already
# exhausted iterator.
params_to_update = list(model.parameters())

# List every parameter that will receive gradients.
for name, param in model.named_parameters():
    if param.requires_grad:
        print("\t",name)

	 embed_init.weight
	 dag_attention.linear1.weight
	 dag_attention.linear1.bias
	 dag_attention.linear2.weight
	 dag_attention.linear2.bias
	 gru.gru.weight_ih_l0
	 gru.gru.weight_hh_l0
	 gru.gru.bias_ih_l0
	 gru.gru.bias_hh_l0
	 gru.gru.weight_ih_l1
	 gru.gru.weight_hh_l1
	 gru.gru.bias_ih_l1
	 gru.gru.bias_hh_l1
	 gru.fc.weight
	 gru.fc.bias


In [8]:
# Observe that all parameters are being optimized
optimizer = optim.Adadelta(params_to_update, lr=0.0001, momentum=0.9)
# # Setup the loss fxn
# criterion = nn.CrossEntropyLoss()
# Setup the loss fxn
# criterion = nn.BCELoss()

In [9]:
# Loss object handed to train_model(...) in the last cell. The inline training
# loop in this notebook does not use it; it computes its own cross-entropy.
criterion = nn.MSELoss()

In [10]:
import copy
from gram_helpers import load_data
import random
import time

In [14]:
# Inline training loop: trains `model` for `num_epochs`, evaluates on the
# validation and test splits after every epoch, keeps the weights with the
# lowest validation loss and restores them at the end.
sequences = seqs
val_loss_history = []
best_model_wts = copy.deepcopy(model.state_dict())
# Start from +inf so the first validation epoch always becomes the best so far.
# With the previous initial value of 0.0 the test `epoch_loss < best_loss`
# could never succeed for this non-negative loss, so the final
# load_state_dict() restored the *untrained* initial weights.
best_loss = float('inf')
epoch_duration = 0.0

print('Loading data ... ')
train_set, valid_set, test_set = load_data(sequences, labels)
data_dict = dict()
data_dict['train'] = train_set
data_dict['val'] = valid_set
data_dict['test'] = test_set

print('done!!')

for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)

    # Each epoch has a training phase followed by validation and test
    # evaluation phases.
    for phase in ['train', 'val', 'test']:
        if phase == 'train':
            model.train()  # Set model to training mode
        else:
            model.eval()  # Set model to evaluate mode

        running_loss = 0.0

        data_set = data_dict[phase]
        n_batches = int(np.ceil(float(len(data_set[0])) / float(batch_size)))

        start_time = time.time()
        # Visit the batches in a random order (a shuffled permutation of the
        # batch indices; the samples inside a batch keep their order).
        for index in random.sample(range(n_batches), n_batches):
            batchX = data_set[0][index * batch_size:(index + 1) * batch_size]
            batchY = data_set[1][index * batch_size:(index + 1) * batch_size]
            x, y, mask, lengths = pad_matrix(batchX, batchY, num_leaves, num_classes)

            batchX = torch.from_numpy(x).to(device)
            batchY = torch.from_numpy(y).to(device)
            lengths = torch.from_numpy(lengths).to(device)

            # zero the parameter gradients (only needed when we will step)
            if phase == 'train':
                optimizer.zero_grad()

            # forward
            # track history if only in train
            with torch.set_grad_enabled(phase == 'train'):
                # Get model outputs and calculate loss.
                # `mask` is passed exactly as returned by pad_matrix.
                outputs = model(batchX, mask)

                # Custom loss: element-wise binary cross-entropy (logEps keeps
                # log() away from zero), summed over axes 2 and 1, divided by
                # `lengths`, then averaged over the batch.
                # NOTE(review): assumes outputs/batchY are 3-D with one row per
                # sample after the two sums -- confirm against pad_matrix.
                logEps = 1e-8
                cross_entropy = -(batchY * torch.log(outputs + logEps) +
                                  (1. - batchY) * torch.log(1. - outputs + logEps))
                loglikelihood = cross_entropy.sum(axis=2).sum(axis=1) / lengths
                loss = torch.mean(loglikelihood)

                # backward + optimize only if in training phase
                if phase == 'train':
                    # NOTE(review): retain_graph=True keeps the autograd graph
                    # alive after backward; kept as in the original because the
                    # model may reuse graph nodes across batches -- confirm
                    # whether a plain loss.backward() works.
                    loss.backward(retain_graph=True)
                    optimizer.step()

            # statistics
            running_loss += loss.item()
        duration = time.time() - start_time
        # max(..., 1) avoids a ZeroDivisionError when a split is empty.
        epoch_loss = running_loss / max(n_batches, 1)
        print('{} Loss: {:.4f}, Duration: {}'.format(phase, epoch_loss, duration))

        if phase == 'val':
            val_loss_history.append(epoch_loss)
            # deep copy the model whenever validation loss improves
            if epoch_loss < best_loss:
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
        if phase == 'train':
            epoch_duration += duration

print('Training complete in {:.0f}m {:.0f}s'.format(epoch_duration // 60, epoch_duration % 60))
print('Best val Loss: {:4f}'.format(best_loss))

# load best model weights
model.load_state_dict(best_model_wts)

Loading data ... 
done!!
Epoch 0/9
----------
train Loss: 479.9710, Duration: 69.00863933563232
val Loss: 367.9665, Duration: 0.051615238189697266
test Loss: 364.6855, Duration: 0.07424521446228027
Epoch 1/9
----------
train Loss: 269.6667, Duration: 68.89263582229614
val Loss: 188.2605, Duration: 0.04938030242919922
test Loss: 186.2393, Duration: 0.08147525787353516
Epoch 2/9
----------
train Loss: 150.5180, Duration: 68.90287613868713
val Loss: 120.8479, Duration: 0.05173969268798828
test Loss: 119.5378, Duration: 0.0677804946899414
Epoch 3/9
----------
train Loss: 106.1804, Duration: 69.24955630302429
val Loss: 93.8826, Duration: 0.04978537559509277
test Loss: 92.8466, Duration: 0.0735173225402832
Epoch 4/9
----------
train Loss: 86.2953, Duration: 69.3009614944458
val Loss: 80.0565, Duration: 0.0503997802734375
test Loss: 79.1619, Duration: 0.07654929161071777
Epoch 5/9
----------
train Loss: 75.4458, Duration: 69.01796674728394
val Loss: 71.9194, Duration: 0.049913644790649414
tes

<All keys matched successfully>

In [None]:
# Run the packaged training routine from gram_helpers with the objects
# configured above; it returns the trained model and a history object.
model_ft, hist = train_model(
    model, seqs, labels,
    criterion, optimizer, device,
    batch_size, num_epochs,
    num_leaves, num_classes,
)