# Output dataset and create vocabulary for readmissions (all folds)

**Author: Lin Lee Cheong <br> Last updated: 11/9/20**

**Notebook to convert the 365-day version of the dataset to the 1000-event version, and to save the CSV files and vocabulary for readmissions for all 5 folds**
- training data & vocabulary
- test data
- up to 1000 events, from full 365 day dataset

**Required:**
- input file: raw_train_data.csv, raw_test_data.csv
- outputs: csv files in 1000 format, and vocabulary

**Nomenclature:**
- d30: **30** days (matches `ndays = 30` and the `30days` suffix in the file names below)
- s30: maximum of 30 sequences per day
- vpos: vocabulary positive only

In [1]:
import torch
torch.__version__

'1.4.0'

In [2]:
import os
import pickle

import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader

from utils import get_cuda
from data_proc import read_data, remove_death, build_vocab
from dataset_func import build_dataset, BuildDataset, get_dataloader

torch.multiprocessing.set_sharing_strategy('file_system') 

### Input filepaths for training, test, vocabulary

In [3]:
# Filepaths
num_folds = 5

# Every fold lives under <main_dir>/fold_<k>/ with train/, test/ and vocab/
# sub-directories.
main_dir = "../../../data/readmission/"
train_dirs = [os.path.join(main_dir, f"fold_{idx}/train/") for idx in range(num_folds)]
test_dirs = [os.path.join(main_dir, f"fold_{idx}/test/") for idx in range(num_folds)]
vocab_dirs = [os.path.join(main_dir, f"fold_{idx}/vocab/") for idx in range(num_folds)]

# Raw per-fold input CSVs.
train_fps = [os.path.join(train_dir, "raw_train_data.csv") for train_dir in train_dirs]
test_fps = [os.path.join(test_dir, "raw_test_data.csv") for test_dir in test_dirs]

# Per-fold destinations for the pickled data lists.
train_dl_fps = [os.path.join(train_dir, "train_datalist_1000_vall_30days.pkl") for train_dir in train_dirs]
test_dl_fps = [os.path.join(test_dir, "test_datalist_1000_vall_30days.pkl") for test_dir in test_dirs]

# Create any missing vocabulary directory. exist_ok=True replaces the separate
# isdir() check, so there is no window between the check and the creation.
for vocab_dir in vocab_dirs:
    os.makedirs(vocab_dir, exist_ok=True)
vocab_fps = [os.path.join(vocab_dir, "vocab_1000_vall_30days") for vocab_dir in vocab_dirs]

# Options
ndays = 30
# Column names "30", "29", ..., "0" (ndays + 1 columns, descending).
x_lst = [str(x) for x in range(ndays, -1, -1)]
# Column names "999", "998", ..., "0" of the flattened 1000-event table.
x_flat_lst = [str(x) for x in range(999, -1, -1)]
y_target = "unplanned_readmission"
uid = "discharge_id"

### Data flattening

In [4]:
def flatten(x, n_events=1000):
    """Flatten one row of per-day event strings into a fixed-length token list.

    Each entry of ``x`` is either a comma-separated string of events or a
    missing value. Entries are processed in iteration order:

    * a run of ``k`` consecutive missing entries that is followed by a
      non-missing entry becomes the single gap token ``"<k>_days"``;
    * missing entries after the last non-missing entry produce no token;
    * every non-missing entry is converted with ``str``, stripped of spaces
      and split on ``","`` into individual event tokens.

    Args:
        x: Iterable of per-day entries (e.g. one DataFrame row).
        n_events: Length of the returned list.

    Returns:
        A list of ``n_events`` strings: the last ``n_events`` tokens when there
        are at least that many, otherwise the tokens left-padded with
        ``"<pad>"``.
    """
    tokens = []
    gap = 0  # length of the current run of missing entries
    for entry in x:
        # pd.isna instead of `entry is np.nan`: the identity test only matches
        # the np.nan singleton, so any other NaN object (float("nan"), values
        # boxed from a float column) or None would leak through as a literal
        # "nan"/"None" event token.
        if pd.isna(entry):
            gap += 1
            continue
        if gap:
            tokens.append(f"{gap}_days")
            gap = 0
        tokens.extend(str(entry).replace(" ", "").split(","))

    n_missing = n_events - len(tokens)
    if n_missing <= 0:
        # Slice from an explicit start index: `tokens[-n_events:]` would return
        # the whole list for n_events == 0.
        return tokens[len(tokens) - n_events:]

    return ["<pad>"] * n_missing + tokens

In [5]:
def get_flat_df(raw_df, x_lst, copy_lst, n_events=1000):
    """Flatten the per-day columns of ``raw_df`` into fixed-length sequences.

    Args:
        raw_df: DataFrame holding the per-day event columns and the columns to
            carry over.
        x_lst: Names of the per-day event columns, in the order they should be
            flattened.
        copy_lst: Names of columns copied unchanged into the result.
        n_events: Length of every flattened sequence (default 1000, the value
            that was previously hard-coded).

    Returns:
        A DataFrame with a default 0..n-1 index, one column per sequence
        position named ``str(n_events - 1)`` down to ``"0"``, followed by the
        columns listed in ``copy_lst``.
    """
    flat_rows = raw_df[x_lst].apply(flatten, axis=1, args=(n_events,)).tolist()
    flat_df = pd.DataFrame(
        flat_rows,
        columns=[str(pos) for pos in range(n_events - 1, -1, -1)],
    )
    for colname in copy_lst:
        # Copy by position, not by label: flat_df has a fresh 0..n-1 index,
        # while raw_df may have gaps in its index (e.g. after rows were
        # filtered out). A plain Series assignment aligns on index labels and
        # would then attach values to the wrong rows or insert NaN.
        flat_df[colname] = raw_df[colname].reset_index(drop=True)

    return flat_df

In [6]:
def _flattened_csv_path(data_fp):
    """Return the output path for the flattened CSV derived from ``data_fp``.

    ``_flatten_30days`` is inserted before the last ``.csv`` of the file name
    (``raw.csv`` -> ``raw_flatten_30days.csv``). Directory names are never
    modified, and a file name without ``.csv`` gets ``_flatten_30days.csv``
    appended, so the result always differs from ``data_fp``.
    """
    directory, filename = os.path.split(data_fp)
    stem, sep, rest = filename.rpartition(".csv")
    if sep:
        filename = f"{stem}_flatten_30days.csv{rest}"
    else:
        filename = f"{filename}_flatten_30days.csv"
    return os.path.join(directory, filename)


def create_flat_dataset(
    data_fp,
    x_lst,
    x_flat_lst,
    y_target,
    uid,
    train=True,
    vocab_fp=None,
    datalist_fp=None,
    min_freq=10,
    save_csv=False,
):
    """Read a raw CSV, flatten it, and optionally save the CSV and vocabulary.

    Args:
        data_fp: Path of the raw input CSV.
        x_lst: Names of the per-day event columns in the raw data.
        x_flat_lst: Names of the flattened sequence columns, passed to
            ``build_vocab``.
        y_target: Name of the label column.
        uid: Name of the unique-id column, forwarded to ``read_data``.
        train: When True, build a vocabulary from the flattened data. When
            False no vocabulary is built or loaded (the loading code is
            commented out below).
        vocab_fp: Where to save the vocabulary with ``torch.save``; only used
            when ``train`` is True and the value is not None.
        datalist_fp: Currently unused: the dataset-building step that would
            write to it is commented out below.
        min_freq: Forwarded to ``build_vocab``.
        save_csv: When True, write the flattened data next to ``data_fp`` with
            ``_flatten_30days`` inserted before the ``.csv`` extension.

    Returns:
        None.
    """
    # read in raw dataset, remove deaths
    raw_df = read_data(data_fp=data_fp, check=True, y_target=y_target, uid=uid, test=0)
    raw_df = remove_death(raw_df, y_target, x_lst)

    # NOTE(review): the id column is hard-coded as "discharge_id" here rather
    # than taken from `uid` - confirm whether the two are meant to differ.
    raw_df = get_flat_df(
        raw_df,
        x_lst=x_lst,
        copy_lst=[y_target, "discharge_id", "discharge_dt", "patient_id"],
    )

    if save_csv:
        # str.replace(".csv", ...) would rewrite every ".csv" in the path
        # (directory names included) and, for a path without ".csv", return it
        # unchanged so that to_csv overwrote the raw input file.
        data_fp_write = _flattened_csv_path(data_fp)
        raw_df.to_csv(data_fp_write, index=False)

    # build vocabulary and save if training dataset
    if train:
        print("Vocab generation required")
        vocab = build_vocab(raw_df, x_flat_lst, min_freq=min_freq, pos_labs_vocab=False)

        print(f"Nb of tokens: {len(vocab.stoi)}")
        if vocab_fp is not None:
            torch.save(vocab, vocab_fp)
    # else:
    #    vocab = torch.load(vocab_fp)

    # build dataset and save
    # whole_data = build_dataset(
    #    raw_df, vocab, x_flat_lst, [y_target], day_length=1000, max_length=1
    # )

    # print('Whole data created"')
    # if datalist_fp is not None:
    #    pickle.dump(whole_data, open(datalist_fp, "wb"), protocol=4)

### Generate training data and vocabulary

In [7]:
# For every fold: flatten the raw training CSV, save it, and build and save
# that fold's vocabulary.
banner = "*" * 100
for idx, (train_fp, train_dl_fp, vocab_fp) in enumerate(
    zip(train_fps, train_dl_fps, vocab_fps)
):
    print("\n\n" + banner)
    print(f"Processing fold {idx}\n" + banner)

    # train=True makes create_flat_dataset build the vocabulary;
    # min_freq=1 overrides its default of 10.
    train_kwargs = dict(
        data_fp=train_fp,
        x_lst=x_lst,
        x_flat_lst=x_flat_lst,
        y_target=y_target,
        uid=uid,
        train=True,
        vocab_fp=vocab_fp,
        datalist_fp=train_dl_fp,
        min_freq=1,
        save_csv=True,
    )
    create_flat_dataset(**train_kwargs)

    print(f"Completed, wrote to vocab: {vocab_fp}, \n train data:{train_dl_fp}")



****************************************************************************************************
Processing fold 0
****************************************************************************************************
Read data from ../../../data/readmission/fold_0/train/raw_train_data.csv


Data size: (1295326, 370)

Label ratio for unplanned_readmission
False    0.855436
True     0.144564
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 44465 rows contain the word death
Vocab generation required


start word number:  (1250861000,)
exact word number:  1250861000
Completed vocabulary: 28833 vocabs
Nb of tokens: 28833
Completed, wrote to vocab: ../../../data/readmission/fold_0/vocab/vocab_1000_vall_30days, 
 train data:../../../data/readmission/fold_0/train/train_datalist_1000_vall_30days.pkl


****************************************************************************************************
Processing fold 1
***************************

### Generate test data

In [8]:
# For every fold: flatten the raw test CSV and save it. train=False, so
# create_flat_dataset builds no vocabulary.
for idx, (test_fp, test_dl_fp, vocab_fp) in enumerate(
    zip(test_fps, test_dl_fps, vocab_fps)
):
    print("\n\n" + "*" * 100)
    print(f"Processing fold {idx}\n" + "*" * 100)
    create_flat_dataset(
        data_fp=test_fp,
        x_lst=x_lst,
        x_flat_lst=x_flat_lst,
        y_target=y_target,
        uid=uid,
        train=False,
        vocab_fp=vocab_fp,
        # Pass this fold's *test* datalist path. The previous value,
        # `train_dl_fp`, was a stale variable left over from the training loop
        # (the last fold's training path).
        datalist_fp=test_dl_fp,
        save_csv=True
    )

    print(f"Completed, read from vocab: {vocab_fp}, \n wrote to {test_dl_fp}")



****************************************************************************************************
Processing fold 0
****************************************************************************************************
Read data from ../../../data/readmission/fold_0/test/raw_test_data.csv


Data size: (323832, 370)

Label ratio for unplanned_readmission
False    0.855434
True     0.144566
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 11075 rows contain the word death
Completed, read from vocab: ../../../data/readmission/fold_0/vocab/vocab_1000_vall_30days, 
 wrote to ../../../data/readmission/fold_0/test/test_datalist_1000_vall_30days.pkl


****************************************************************************************************
Processing fold 1
****************************************************************************************************
Read data from ../../../data/readmission/fold_1/test/raw_test_data.csv


Data si