# Output dataset and create vocabulary for readmissions (all folds)

**Author: Lin Lee Cheong <br> Last updated: 11/23/20** updated with fixes identified by Xiangyu

**Notebook to convert the 365-day version of the data to the 1000-event version, and save the CSV and vocabulary for readmissions for all 5 folds**
- training data & vocabulary
- test data
- up to 1000 events, drawn from the 30-day (d30) window of the full 365-day dataset

**Required:**
- input file: raw_train_data.csv, raw_test_data.csv
- outputs: csv files in 1000 format, and vocabulary

**Nomenclature:**
- d30: **30** days
- vpos: vocabulary positive only

In [1]:
import torch
torch.__version__

'1.7.0'

In [2]:
import os
import pickle
import math

import torch
import pandas as pd
import numpy as np
from more_itertools import unique_everseen
from torch.utils.data import Dataset, DataLoader

from utils import get_cuda
from data_proc import read_data, remove_death, build_vocab
from dataset_func import build_dataset, BuildDataset, get_dataloader

# Select the 'file_system' strategy for sharing tensors between processes.
# NOTE(review): presumably chosen to avoid "too many open files" errors when
# DataLoader workers are used downstream — confirm this is still needed here.
torch.multiprocessing.set_sharing_strategy('file_system') 

### Input filepaths for training, test, vocabulary

In [10]:
# Options
num_folds = 5
ndays = 30  # number of days to keep

# Day columns read from the raw data: "30", "29", ..., "0" (ndays + 1 columns).
# The raw dataset usually holds 365 day columns; only the columns listed here
# are flattened and checked for death events.
x_lst = [str(x) for x in range(ndays, -1, -1)]
# Column names of the flattened output: "999", "998", ..., "0" (up to 1000 events).
x_flat_lst = [str(x) for x in range(999, -1, -1)]
y_target = "unplanned_readmission"
uid = "discharge_id"

# Filepaths (one entry per fold)
main_dir = "../../../data/readmission/"
train_dirs = [os.path.join(main_dir, f"fold_{idx}/train/") for idx in range(num_folds)]
test_dirs = [os.path.join(main_dir, f"fold_{idx}/test/") for idx in range(num_folds)]
vocab_dirs = [os.path.join(main_dir, f"fold_{idx}/vocab/") for idx in range(num_folds)]

train_fps = [os.path.join(train_dir, "raw_train_data.csv") for train_dir in train_dirs]
test_fps = [os.path.join(test_dir, "raw_test_data.csv") for test_dir in test_dirs]

# Output names carry the day window so that changing `ndays` cannot silently
# overwrite files produced for a different window.
out_train_fps = [
    os.path.join(train_dir, f"raw_train_data_1000_{ndays}days.csv")
    for train_dir in train_dirs
]
out_test_fps = [
    os.path.join(test_dir, f"raw_test_data_1000_{ndays}days.csv")
    for test_dir in test_dirs
]

# Create the vocabulary directories up front; exist_ok avoids the
# check-then-create race of a separate os.path.isdir test.
for vocab_dir in vocab_dirs:
    os.makedirs(vocab_dir, exist_ok=True)
vocab_fps = [
    os.path.join(vocab_dir, f"vocab_1000_vall_{ndays}days") for vocab_dir in vocab_dirs
]

### Data flattening

In [4]:
def flatten(x, n_events=1000):
    """
    Flatten one row of per-day event strings into a fixed-length token list.

    Every non-empty day (a comma-separated string of events) is prefixed with
    an "<N>_days" token, where N is the number of days since the previous
    non-empty day ("1_days" for consecutive days). Each day is then split
    into tokens, de-duplicated and reordered by move_ad_dis, and all days are
    concatenated in input order. The very first "_days" token is dropped
    because there is no earlier event to measure the gap from.

    Arguments:
    ----------
    x (iterable) : per-day entries in column order; NaN marks a day
                   without events
    n_events (int) : length of the returned token list, default 1000

    Returns:
    ---------
    list of n_events tokens: the last n_events tokens when the row is longer,
        otherwise the tokens left-padded with "<pad>". A row without any
        events yields only "<pad>" tokens.
    """

    def get_days(days):
        """Prefix each non-empty day with the gap (in days) to the previous one."""
        prefixed = []
        # Number of empty days seen since the last non-empty day. Starting at
        # 0 (and resetting to 0) keeps the gap label consistent for every gap;
        # the previous start value of 1 over-counted the first gap by a day.
        empty_days = 0
        for event in days:
            # isinstance also catches float subclasses such as numpy.float64
            if event is np.nan or (isinstance(event, float) and math.isnan(event)):
                empty_days += 1
                continue
            # Format instead of concatenating so a non-string entry is
            # prefixed like any other day rather than raising or being
            # appended without its gap token.
            prefixed.append(f"{empty_days + 1}_days,{event}")
            empty_days = 0
        return prefixed

    # count days with no events
    days = get_days(x)
    if not days:
        # No events at all in this row: nothing to flatten, return pure padding
        return ["<pad>"] * n_events

    # move admission/discharge to the end of the day, dedupe events per day,
    # then flatten all days into one token list
    tokens = [
        token
        for day in days
        for token in move_ad_dis(str(day).replace(" ", "").split(","))
    ]

    # clean up corner case: the leading gap token has no preceding event
    if "_days" in tokens[0]:
        tokens = tokens[1:]
    if len(tokens) >= n_events:
        return tokens[-n_events:]

    return ["<pad>"] * (n_events - len(tokens)) + tokens

In [5]:
def move_ad_dis(events_in_day):
    """
    De-duplicate a day's events and push admission/discharge to the end.

    Anything that is not a list is handed back untouched. For a list, the
    first occurrence of each event is kept in order; "admission" and
    "discharge", when present, are placed last (admission before discharge).
    """
    if not isinstance(events_in_day, list):
        return events_in_day

    deduped = list(unique_everseen(events_in_day))

    # Markers that must close the day, in the order they are appended
    closing = [marker for marker in ("admission", "discharge") if marker in deduped]
    others = [event for event in deduped if event not in closing]

    return others + closing

In [6]:
def get_flat_df(raw_df, x_lst, copy_lst):
    """
    Function to flatten dataframe into 1000 long sequence.

    Calls function flatten, which in turn calls move_ad_dis

    Arguments:
    ----------
    raw_df (DataFrame) : raw data, one row per record
    x_lst (list) : column names (days) passed row-wise to flatten
    copy_lst (list) : column names copied unchanged from raw_df

    Returns:
    ---------
    DataFrame with columns "999" ... "0" holding the flattened tokens,
        followed by the columns named in copy_lst, in raw_df row order
    """
    flat_rows = raw_df[x_lst].apply(flatten, axis=1).tolist()
    flat_df = pd.DataFrame(
        flat_rows,
        columns=[str(x) for x in range(999, -1, -1)],
    )

    for colname in copy_lst:
        # Copy by position, not by index label: flat_df has a fresh 0..n-1
        # index, so assigning a Series would misalign (or produce NaN) whenever
        # raw_df's index has gaps, e.g. after rows were filtered out upstream.
        flat_df[colname] = raw_df[colname].to_numpy(copy=True)

    return flat_df

In [7]:
def create_flat_dataset(
    data_fp,
    x_lst,
    x_flat_lst,
    y_target,
    uid,
    vocab_fp=None,
    output_fp=None,
    min_freq=500,
    return_csv=False,
    nrows=0
):
    """
    Main function to create flattened dataset: Reads in raw data, 
    removes death events, and flattens and saves to output CSV.
    
    Arguments:
    ----------
    data_fp (str) : input filepath, csv
    x_lst (list) : list of column names (days) to use for flattening
    x_flat_lst (list) : list of column names to use for writing out the 
                        flattened file (1000 events)
    y_target (str) : column name of target
    uid (str) : column name of unique identifier; copied into the
                flattened output
    vocab_fp (str) : path to write out vocab
        default None (will not generate vocabulary)
    output_fp (str or os.PathLike) : path to write out flattened CSV file
        default None (will not save)
    min_freq (int) : minimum frequency associated with vocabulary generation
    return_csv (bool) : default False, returns flattened dataframe if True
    nrows (int) : default 0 to read and process all, otherwise
                  will read in nrows in CSV only (for testing)
    
    Returns:
    ---------
    default None unless return_csv is True, then return
        dataframe containing flattened data
    """

    # read in raw dataset, remove deaths
    raw_df = read_data(
        data_fp=data_fp, check=True, y_target=y_target, uid=uid, test=nrows
    )
    raw_df = remove_death(raw_df, y_target, x_lst)

    # flatten; carry over the target, the identifier named by `uid` (instead
    # of a hard-coded "discharge_id") and the remaining bookkeeping columns
    flat_df = get_flat_df(
        raw_df,
        x_lst=x_lst,
        copy_lst=[y_target, uid, "discharge_dt", "patient_id"],
    )

    # accept pathlib-style paths too, rather than silently skipping the save
    if output_fp is not None and isinstance(output_fp, (str, os.PathLike)):
        flat_df.to_csv(output_fp, index=False)

    # build vocabulary
    if vocab_fp is not None:
        print("Vocab generation required")
        vocab = build_vocab(flat_df, x_flat_lst, min_freq=min_freq, pos_labs_vocab=False)

        print(f"Nb of tokens: {len(vocab.stoi)}")
        torch.save(vocab, vocab_fp)

    if return_csv:
        return flat_df
    return None

### Generate training data and vocabulary

In [8]:
# Settings shared by every fold; only the three filepaths change per fold.
train_options = dict(
    x_lst=x_lst,
    x_flat_lst=x_flat_lst,
    y_target=y_target,
    uid=uid,
    min_freq=500,
    return_csv=False,
    nrows=0,
)

# For each fold: flatten the raw training CSV, save it, and build the
# fold's vocabulary from the flattened data.
fold_train_inputs = zip(train_fps, out_train_fps, vocab_fps)
for idx, (train_fp, output_fp, vocab_fp) in enumerate(fold_train_inputs):
    print("\n\n" + "*" * 100)
    print(f"Processing fold {idx}\n" + "*" * 100)

    create_flat_dataset(
        data_fp=train_fp,
        vocab_fp=vocab_fp,
        output_fp=output_fp,
        **train_options,
    )

    print(f"Completed, wrote to vocab: {vocab_fp}, \n train data:{output_fp}")



****************************************************************************************************
Processing fold 0
****************************************************************************************************
Read data from ../../../data/readmission/fold_0/train/raw_train_data.csv


Data size: (5000, 370)

Label ratio for unplanned_readmission
False    0.846
True     0.154
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 155 rows contain the word death
Vocab generation required


start word number:  (4845000,)
exact word number:  4845000
Completed vocabulary: 99 vocabs
Nb of tokens: 99
Completed, wrote to vocab: ../../../data/readmission/fold_0/vocab/vocab_1000_vall_30days, 
 train data:../../../data/readmission/fold_0/train/raw_train_data_1000_30days.csv


****************************************************************************************************
Processing fold 1
*******************************************************

### Generate test data

In [11]:
# For each fold: flatten the raw test CSV and save it. No vocabulary is
# generated here (vocab_fp=None), so the vocabulary paths are not iterated.
for idx, (test_fp, output_fp) in enumerate(zip(test_fps, out_test_fps)):
    print("\n\n" + "*" * 100)
    print(f"Processing fold {idx}\n" + "*" * 100)
    create_flat_dataset(
        data_fp=test_fp,
        x_lst=x_lst,
        x_flat_lst=x_flat_lst,
        y_target=y_target,
        uid=uid,
        vocab_fp=None,
        output_fp=output_fp,
        min_freq=500,
        return_csv=False,
        nrows=0,
    )

    print(f"Completed, wrote to {output_fp}")



****************************************************************************************************
Processing fold 0
****************************************************************************************************
Read data from ../../../data/readmission/fold_0/test/raw_test_data.csv


Data size: (5000, 370)

Label ratio for unplanned_readmission
False    0.8532
True     0.1468
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 136 rows contain the word death
Completed, wrote to ../../../data/readmission/fold_0/test/raw_test_data_1000_30days.csv


****************************************************************************************************
Processing fold 1
****************************************************************************************************
Read data from ../../../data/readmission/fold_1/test/raw_test_data.csv


Data size: (5000, 370)

Label ratio for unplanned_readmission
False    0.8534
True     0.1466
Name: un