# Model training all folds d60_s30_vall_mf10

In [1]:
import os
import pickle
import gc
import copy

import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader

from utils import get_cuda
from data_proc import read_data, remove_death, build_vocab
from dataset_func import build_dataset, BuildDataset, get_dataloader, read_pickled_ds
from transformer_model import TransformerCNNModel
from model_func import training_process, epoch_val, epoch_train

In [2]:
# Select the "file_system" strategy for sharing tensors between processes.
# NOTE(review): presumably chosen to avoid "too many open files" errors from
# the multi-worker DataLoaders created further down — confirm.
torch.multiprocessing.set_sharing_strategy('file_system') 

### Input

In [3]:
# Options
x_lst = [str(x) for x in range(365, -1, -1)]  # "365", "364", ..., "0"
n_days = 60  # passed as seq_length to the dataset reader and the model
seq_per_day = 30  # passed as event_length / num_events
y_target = "unplanned_readmission"
uid = "discharge_id"
batch_size = 140

# Filepaths
num_folds = 5

# Tag shared by every pickled data list and vocab file. It is derived from
# n_days / seq_per_day so the file names cannot drift from the options above.
ds_tag = f"d{n_days}_s{seq_per_day}_vall_mf10"

main_dir = "../../../data/readmission/"
train_dirs = [os.path.join(main_dir, f"fold_{idx}/train/") for idx in range(num_folds)]
test_dirs = [os.path.join(main_dir, f"fold_{idx}/test/") for idx in range(num_folds)]
vocab_dirs = [os.path.join(main_dir, f"fold_{idx}/vocab/") for idx in range(num_folds)]

# One entry per fold, in fold order.
train_dl_fps = [
    os.path.join(train_dir, f"train_datalist_{ds_tag}.pkl") for train_dir in train_dirs
]
test_dl_fps = [
    os.path.join(test_dir, f"test_datalist_{ds_tag}.pkl") for test_dir in test_dirs
]
vocab_fps = [os.path.join(vocab_dir, f"vocab_{ds_tag}") for vocab_dir in vocab_dirs]

### Model hyperparameters; read in data, create data loaders and train: all folds

In [4]:
# Hyperparameters handed to TransformerCNNModel for every fold.
n_class = 2    # number of output classes
dropout = 0.4  # dropout value
nhead = 4      # attention heads in the multi-head attention layers
nlayers = 1    # count of nn.TransformerEncoderLayer inside nn.TransformerEncoder
nhid = 32      # width of the feedforward network in nn.TransformerEncoder
emsize = 8     # embedding dimension

In [None]:
# Settings shared by every fold.
n_epochs = 6  # value passed as `epoch` to training_process
num_workers = 4  # worker processes per DataLoader


def _load_dataset(file_dir):
    """Read one pickled data list and wrap it in a BuildDataset.

    Both the reader and the dataset get the notebook-level ``n_days`` and
    ``seq_per_day``, so their sequence/event lengths always agree.
    """
    ids, data, labels, mask = read_pickled_ds(
        file_dir=file_dir, seq_length=n_days, event_length=seq_per_day
    )
    return BuildDataset(
        seq_length=n_days,
        event_length=seq_per_day,
        data_list=[ids, data, labels, mask],
    )


# Maps "fold_<k>" to whatever training_process returns for that fold.
train_metric = {}
for fold, (train_fp, test_fp, vocab_fp) in enumerate(
    zip(train_dl_fps, test_dl_fps, vocab_fps)
):
    print(f"\n{'*' * 30}Fold {fold}{'*' * 50}")
    print(
        f"Processing \n \
        \t train: {train_fp} \n \
        \t test: {test_fp} \n \
        \t vocab: {vocab_fp} \n"
    )

    # Training data is shuffled; evaluation data keeps its stored order.
    train_dataset = _load_dataset(train_fp)
    train_dataloader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )

    test_dataset = _load_dataset(test_fp)
    test_dataloader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )

    # NOTE(review): torch.load unpickles the file, so only load vocab files
    # from a trusted location.
    vocab = torch.load(vocab_fp)
    ntokens = len(vocab.stoi)
    print(f"Nb tokens: {ntokens}")

    # A fresh model is built for every fold.
    model = TransformerCNNModel(
        ntokens,
        emsize,
        nhead,
        nhid,
        nlayers,
        n_class,
        device="gpu",
        seq_length=n_days,
        num_events=seq_per_day,
        dropout=dropout,
    )

    # NOTE(review): the fold's test split is used as the "val" loader here, so
    # any per-epoch selection on these metrics is made on the test data.
    train_metric[f"fold_{fold}"] = training_process(
        model=model,
        epoch=n_epochs,
        dataloaders={"train": train_dataloader, "val": test_dataloader},
        save_model=None,
        test=False,
    )


******************************Fold 0**************************************************
Processing 
         	 train: ../../../data/readmission/fold_0/train/train_datalist_d60_s30_vall_mf10.pkl 
         	 test: ../../../data/readmission/fold_0/test/test_datalist_d60_s30_vall_mf10.pkl 
         	 vocab: ../../../data/readmission/fold_0/vocab/vocab_d60_s30_vall_mf10 

Nb tokens: 20440
parameters: embsize:8, nhead:4, nhid:32, nlayers:1, dropout:0.4
available device: cuda
number of GPUS available: 4
device: gpu
----------Epoch 1/6------------------------------
epoch_train_loss: 0.4054347395896912 epoch train AUC: 0.6464787897025881
epoch_val_loss: 0.4038284572913589 epoch val AUC: 0.6637537158247352
----------Epoch 2/6------------------------------
epoch_train_loss: 0.40014958649990473 epoch train AUC: 0.6651080082782889
epoch_val_loss: 0.40214353180161094 epoch val AUC: 0.6658956928194959
----------Epoch 3/6------------------------------
epoch_train_loss: 0.3984028331864527 epoch train A

In [None]:
# Collect each fold's best (maximum) validation metric, report it, then
# summarise the spread across folds.
overall_auc = []
print("individual fold results")
for idx in range(num_folds):
    fold_key = f"fold_{idx}"
    best_val = max(train_metric[fold_key]["val_metric"])
    print(f"{fold_key}: {best_val}")
    overall_auc.append(best_val)

mean_auc = np.round(np.mean(overall_auc), 3)
std_auc = np.round(np.std(overall_auc), 3)
print(f"Average AUC: {mean_auc}")
print(f"Std AUC: {std_auc}")

In [None]:
import matplotlib.pyplot as plt

# Overlay every fold's metric curves on one figure: validation in red,
# training in purple. Only the first fold's lines carry a label, so the
# legend shows one entry per colour instead of one per fold.
plt.figure(figsize=(10, 10))
for idx in range(num_folds):
    fold_metric = train_metric[f"fold_{idx}"]
    is_first = idx == 0
    plt.plot(
        fold_metric["val_metric"],
        color="red",
        label="validation" if is_first else None,
    )
    plt.plot(
        fold_metric["train_metric"],
        color="purple",
        label="train" if is_first else None,
    )

# NOTE(review): assumes each metric list holds one AUC value per epoch, as the
# training log and the AUC summary suggest — confirm against training_process.
plt.xlabel("Epoch")
plt.ylabel("AUC")
plt.legend()
plt.grid(color="gray")
