# Create vocabulary and dataset for readmissions (all folds)

**Author: Lin Lee Cheong <br> Last updated: 11/1/20**

**Notebook to create vocabulary and dataset for readmissions for all 5 folds**
- training data & vocabulary
- test data

**Required:**
- input file: raw_train_data.csv, raw_test_data.csv
- outputs: pickle file of datalist (whole_ids, whole_data, whole_labels, whole_mask)

**Nomenclature:**
- d60: **60** days
- s30: maximum sequence length of 30 per day
- vpos: vocabulary positive only

In [1]:
import os
import pickle

import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader

from utils import get_cuda
from data_proc import read_data, remove_death, build_vocab
from dataset_func import build_dataset, BuildDataset, get_dataloader

### Input filepaths for training, test, vocabulary

In [2]:
# Filepaths
# Each fold keeps its own train/, test/ and vocab/ sub-directory under main_dir.
num_folds = 5

main_dir = "../../../data/readmission/"
train_dirs = [os.path.join(main_dir, f"fold_{idx}/train/") for idx in range(num_folds)]
test_dirs = [os.path.join(main_dir, f"fold_{idx}/test/") for idx in range(num_folds)]
vocab_dirs = [os.path.join(main_dir, f"fold_{idx}/vocab/") for idx in range(num_folds)]

# Raw input CSVs (one per fold)
train_fps = [os.path.join(train_dir, "raw_train_data.csv") for train_dir in train_dirs]
test_fps = [os.path.join(test_dir, "raw_test_data.csv") for test_dir in test_dirs]

# Output pickles holding the processed datalists (one per fold)
train_dl_fps = [os.path.join(train_dir, "train_datalist_d60_s30_vpos.pkl") for train_dir in train_dirs]
test_dl_fps = [os.path.join(test_dir, "test_datalist_d60_s30_vpos.pkl") for test_dir in test_dirs]

# Make sure every vocabulary directory exists before anything is written to it.
# exist_ok=True avoids the check-then-create race of a separate isdir() test.
for vocab_dir in vocab_dirs:
    os.makedirs(vocab_dir, exist_ok=True)
vocab_fps = [os.path.join(vocab_dir, "vocab_d60_s30_vpos") for vocab_dir in vocab_dirs]

# Options
# Column names "365", "364", ..., "0" as strings, in descending order.
x_lst = [str(x) for x in range(365, -1, -1)]
n_days = 60  # passed to create_dataset as day_length
seq_per_day = 30  # passed to create_dataset as seq_length
y_target = "unplanned_readmission"
uid = "discharge_id"

### Data processing and DataLoader creation

In [3]:
def create_dataset(data_fp, x_lst, day_length, seq_length, y_target, uid, train=True, vocab_fp=None, datalist_fp=None):
    """Turn a raw CSV into a processed datalist and optionally persist it.

    When ``train`` is true a vocabulary is built from the data (and saved to
    ``vocab_fp`` if one is given); otherwise a previously saved vocabulary is
    loaded from ``vocab_fp``. The datalist produced by ``build_dataset`` is
    pickled to ``datalist_fp`` when that path is given.

    Args:
        data_fp: Path of the raw CSV handed to ``read_data``.
        x_lst: Feature column names. The last ``day_length`` entries are the
            ones used for vocabulary building.
        day_length: Forwarded to ``build_dataset`` as ``day_length``.
        seq_length: Forwarded to ``build_dataset`` as ``max_length``.
        y_target: Name of the label column.
        uid: Name of the unique-id column (used by ``read_data``).
        train: Build a new vocabulary (True) or load an existing one (False).
        vocab_fp: Where to save the vocabulary (train) or load it from (test).
        datalist_fp: Where to pickle the resulting datalist; skipped if None.

    Returns:
        None. Results are only written to disk.

    Raises:
        ValueError: If ``train`` is false and no ``vocab_fp`` is supplied.
    """
    # Fail fast: without a vocabulary path there is nothing to load for a
    # non-training dataset, so do not spend time reading the raw data first.
    if not train and vocab_fp is None:
        raise ValueError("vocab_fp is required when train=False")

    # read in raw dataset, remove deaths
    raw_df = read_data(
        data_fp=data_fp,
        check=True,
        y_target=y_target,
        uid=uid,
        test=0
    )
    raw_df = remove_death(raw_df, y_target, x_lst)

    # build vocabulary and save if training dataset
    if train:
        print('Vocab generation required')
        vocab = build_vocab(raw_df, x_lst[-day_length:])
        if vocab_fp is not None:
            torch.save(vocab, vocab_fp)
    else:
        vocab = torch.load(vocab_fp)

    # build dataset and save
    whole_data = build_dataset(
        raw_df, vocab, x_lst, [y_target], day_length=day_length, max_length=seq_length
    )

    if datalist_fp is not None:
        # Context manager guarantees the file is flushed and closed, even if
        # pickling raises part-way through.
        with open(datalist_fp, 'wb') as fout:
            pickle.dump(whole_data, fout, protocol=4)


### Generate training data and vocabulary

In [4]:
# Build the vocabulary and the training datalist for every fold in turn.
banner = "*" * 100
fold_inputs = zip(train_fps, train_dl_fps, vocab_fps)
for idx, (train_fp, train_dl_fp, vocab_fp) in enumerate(fold_inputs):
    # Fold header
    print("\n\n" + banner)
    print(f"Processing fold {idx}\n" + banner)

    # train=True: a fresh vocabulary is built from this fold and saved to vocab_fp.
    create_dataset(
        train_fp,
        x_lst,
        n_days,
        seq_per_day,
        y_target,
        uid,
        train=True,
        vocab_fp=vocab_fp,
        datalist_fp=train_dl_fp,
    )

    print(f"Completed, wrote to vocab: {vocab_fp}, \n train data:{train_dl_fp}")



****************************************************************************************************
Processing fold 0
****************************************************************************************************
Read data from ../../../data/readmission/fold_0/train/raw_train_data.csv


Data size: (1295326, 370)

Label ratio for unplanned_readmission
False    0.855436
True     0.144564
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 44492 rows contain the word death
Vocab generation required


start word number:  (11235060,)
exact word number:  25062090
Completed vocabulary

Used days:  59 0
Total size before building dataset:  (1250834, 60)
New dataset created
Sequence length:  1250834
Completed, wrote to vocab: ../../../data/readmission/fold_0/vocab/vocab_d60_s30_vpos, 
 train data:../../../data/readmission/fold_0/train/train_datalist_d60_s30_vpos.pkl


*****************************************************************************

### Generate test data

In [5]:
# Build the test datalist for every fold, reusing that fold's saved vocabulary.
banner = "*" * 100
fold_inputs = zip(test_fps, test_dl_fps, vocab_fps)
for idx, (test_fp, test_dl_fp, vocab_fp) in enumerate(fold_inputs):
    # Fold header
    print("\n\n" + banner)
    print(f"Processing fold {idx}\n" + banner)

    # train=False: the vocabulary is loaded from vocab_fp instead of rebuilt.
    create_dataset(
        test_fp,
        x_lst,
        n_days,
        seq_per_day,
        y_target,
        uid,
        train=False,
        vocab_fp=vocab_fp,
        datalist_fp=test_dl_fp,
    )

    print(f"Completed, read from vocab: {vocab_fp}, \n wrote to {test_dl_fp}")



****************************************************************************************************
Processing fold 0
****************************************************************************************************
Read data from ../../../data/readmission/fold_0/test/raw_test_data.csv


Data size: (323832, 370)

Label ratio for unplanned_readmission
False    0.855434
True     0.144566
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 11077 rows contain the word death

Used days:  59 0
Total size before building dataset:  (312755, 60)
New dataset created
Sequence length:  312755
Completed, read from vocab: ../../../data/readmission/fold_0/vocab/vocab_d60_s30_vpos, 
 wrote to ../../../data/readmission/fold_0/test/test_datalist_d60_s30_vpos.pkl


****************************************************************************************************
Processing fold 1
****************************************************************************