# Output dataset and create vocabulary for C.Diff Adverse Event

**Author: Tesfagabir Meharizghi *(Adapted from Lin Lee's notebook)* <br> Last updated: 01/06/21**

**Notebook to convert the 365-day dataset (one column per day) into the flattened 1000-event version, and to save the CSV files and vocabulary for the CDiff (event_id=d_00845) adverse event**
- training data & vocabulary
- valid data
- test data
- up to 1000 events, from full 365 day dataset

**Required:**
- input files: train.csv, val.csv, test.csv (unflattened)
    - Run [this ipynb](01_data_prepare_CDiff_d00845.ipynb) to generate these data splits
- outputs: CSV files in the flattened 1000-event format (the event count is configurable via `n_events`), and the vocabulary

**Nomenclature:**
- d30: **30** days
- vpos: vocabulary positive only

In [17]:
#! pip install torchtext

#!pip install nb-black

In [18]:
# Load the lab_black extension (installed with the nb-black package).
%load_ext lab_black

# Load autoreload so that edits to imported local modules are picked up
# without restarting the kernel.
%load_ext autoreload

# Mode 2: reload all modules before executing each cell.
%autoreload 2

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
import os
import pickle
import math

import torch
import pandas as pd
import numpy as np
from more_itertools import unique_everseen
from torch.utils.data import Dataset, DataLoader

from data_proc import read_data, remove_death, build_vocab
from dataset_func import build_dataset, BuildDataset, get_dataloader

# Use the "file_system" strategy when sharing tensors between processes.
torch.multiprocessing.set_sharing_strategy("file_system")

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Input filepaths for training, validation, test, vocabulary

In [20]:
# Run configuration
n_events = 1000  # length of each flattened event sequence
n_rows = 1e9  # row limit handed to create_flat_dataset as `nrows`
ndays = 365  # number of days to keep

# Directory layout: the raw splits are read from <main_dir>/split and the
# flattened outputs are written to <main_dir>/<n_events>_<ndays>days
main_dir = "/home/ec2-user/SageMaker/CMSAI/modeling/tes/data/anonymize/AE_CDiff_d00845/"
input_dir = os.path.join(main_dir, "split")
output_dir = os.path.join(main_dir, f"{n_events}_{ndays}days")

# The flattened files keep the same names as the raw splits
in_fnames = ["train.csv", "val.csv", "test.csv"]
out_fnames = list(in_fnames)
input_fps = [os.path.join(input_dir, name) for name in in_fnames]
output_fps = [os.path.join(output_dir, name) for name in out_fnames]
vocab_fp = os.path.join(output_dir, "vocab")

# Make sure the output directory exists before anything is written to it.
# exist_ok=True keeps this cell safe to re-run when the directory is
# already there.
os.makedirs(output_dir, exist_ok=True)

# Options
# Raw day columns, counting down from `ndays` to 0 (usually 365 days in the
# datasets). All of them are checked for death events.
x_lst = list(map(str, range(ndays, -1, -1)))
# Flattened event columns, counting down from `n_events - 1` to 0
# (up to 1000 events in the flattened list).
x_flat_lst = list(map(str, range(n_events - 1, -1, -1)))
y_target = "d_00845"  # target column
uid = "patient_id"  # unique identifier column

### Data flattening

In [21]:
def flatten(x, n_events=1000):
    """Flatten one row of per-day event columns into an N-long event list.

    Every day that has events is prefixed with an ``"<N>_days"`` token, where
    N is one more than the number of event-free days since the previous day
    with events (so consecutive days give ``"1_days"``). Each day is then
    stripped of spaces, split on commas, and passed through ``move_ad_dis``.
    The days are concatenated in input order and the very first ``"_days"``
    token is dropped, since there is no earlier event to measure from.

    Arguments:
    ----------
    x (iterable) : one entry per day, in column order; each entry is a
                   comma-separated string of events, or NaN/None when the
                   day has no events
    n_events (int) : length of the returned list, default 1000
                     (assumed to be >= 1)

    Returns:
    ---------
    list of str : the last ``n_events`` tokens, left-padded with ``"<pad>"``
                  when fewer tokens are available
    """

    def is_missing(event):
        """Return True for None and for float NaN (built-in or numpy)"""
        return event is None or (
            isinstance(event, (float, np.floating)) and math.isnan(event)
        )

    def get_days(x):
        """Prefix each day that has events with the days since the previous one"""
        new_lst = []
        # Event-free days seen since the last day with events. It is reset
        # after EVERY day with events, so the same gap always yields the same
        # token no matter where in the row it occurs.
        gap = 0
        for event in x:
            if is_missing(event):
                gap += 1
                continue
            # Formatting (rather than "+") also accepts non-string events,
            # e.g. a purely numeric code read from the CSV as a number.
            new_lst.append(f"{gap + 1}_days,{event}")
            gap = 0

        return new_lst

    # count days with no events, move admission/discharge to the end of the day, dedupe events per day
    lst = [move_ad_dis(str(day).replace(" ", "").split(",")) for day in get_days(x)]

    # flatten, clean up corner cases
    lst = [event for day in lst for event in day]
    if not lst:
        return ["<pad>"] * n_events

    # The leading "<N>_days" token only measures the distance from the start
    # of the row, not from a previous event, so it is dropped.
    if "_days" in lst[0]:
        lst = lst[1:]

    if len(lst) >= n_events:
        return lst[-n_events:]

    return ["<pad>"] * (n_events - len(lst)) + lst

In [22]:
def move_ad_dis(events_in_day):
    """Dedupe a day's events and push admission/discharge to the end.

    Anything that is not a list is returned untouched. For a list, duplicates
    are removed while keeping first-seen order, then "admission" (if present)
    and "discharge" (if present) are moved to the end, in that order.
    """
    if not isinstance(events_in_day, list):
        return events_in_day

    deduped = list(unique_everseen(events_in_day))

    # Markers present on this day, already in their final trailing order
    trailing = [marker for marker in ("admission", "discharge") if marker in deduped]
    remaining = [event for event in deduped if event not in trailing]

    return remaining + trailing

In [23]:
def get_flat_df(raw_df, x_lst, copy_lst, n_events):
    """
    Build the flattened dataframe: one row per input row, one column per event slot.

    Each row of ``raw_df[x_lst]`` is passed through ``flatten`` (which in turn
    calls ``move_ad_dis``). The resulting lists become columns labelled
    ``n_events - 1`` down to ``0`` (as strings), and every column named in
    ``copy_lst`` is copied over from ``raw_df`` by row position.
    """
    flattened_rows = raw_df[x_lst].apply(flatten, args=(n_events,), axis=1)
    flat_df = pd.DataFrame(
        flattened_rows.tolist(),
        columns=list(map(str, reversed(range(n_events)))),
    )

    # Carry the label / identifier columns across unchanged
    for colname in copy_lst:
        flat_df[colname] = raw_df[colname].tolist()

    return flat_df

In [24]:
def create_flat_dataset(
    data_fp,
    x_lst,
    x_flat_lst,
    y_target,
    uid,
    vocab_fp=None,
    output_fp=None,
    min_freq=500,
    n_events=1000,
    return_csv=False,
    nrows=0,
):
    """
    Main function to create flattened dataset: Reads in raw data,
    removes death events, and flattens and saves to output CSV.

    Arguments:
    ----------
    data_fp (str) : input filepath, csv
    x_lst (list) : list of column names (days) to use for flattening
    x_flat_lst (list) : list of column names of the flattened file that are
                        used to build the vocabulary; expected to match the
                        event columns produced for ``n_events``
    y_target (str) : column name of target
    uid (str) : column name of unique identifier
    vocab_fp (str) : path to write out vocab
        default None (will not generate vocabulary)
    output_fp (str) : path to write out flattened CSV file
        default None (will not save)
    min_freq (int) : minimum frequency associated with vocabulary generation
    n_events (int) : number of events per flattened row, default 1000
    return_csv (bool) : default False, returns flattened dataframe if True
    nrows (int) : default 0 to read and process all, otherwise
                  will read in nrows in CSV only (for testing)

    Returns:
    ---------
    default None unless return_csv is True, then return
        dataframe containing flattened data
    """

    # read in raw dataset, remove deaths
    raw_df = read_data(
        data_fp=data_fp, check=True, y_target=y_target, uid=uid, test=nrows
    )
    raw_df = remove_death(raw_df, y_target, x_lst)

    # Carry the target and the caller-specified identifier column through
    # the flattening step.
    flat_df = get_flat_df(
        raw_df,
        x_lst=x_lst,
        copy_lst=[y_target, uid],
        n_events=n_events,
    )

    # Only touch the filesystem when an output path was actually given;
    # output_fp=None is the documented "do not save" default.
    if output_fp is not None and isinstance(output_fp, str):
        output_dir = os.path.dirname(output_fp)
        # A bare filename has an empty dirname; there is nothing to create then.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        flat_df.to_csv(output_fp, index=False)

    # build vocabulary
    if vocab_fp is not None:
        print("Vocab generation required")
        vocab = build_vocab(flat_df, x_flat_lst, min_freq=min_freq, pos_labs_vocab=False)

        print(f"Nb of tokens: {len(vocab.stoi)}")
        # The vocabulary may be saved without a CSV, so make sure its own
        # target directory exists as well.
        vocab_dir = os.path.dirname(vocab_fp)
        if vocab_dir:
            os.makedirs(vocab_dir, exist_ok=True)
        torch.save(vocab, vocab_fp)

    if return_csv:
        return flat_df

    return None

### Generate training data and vocabulary

In [25]:
# Flatten the training split, write it out, and build the vocabulary from it
train_fp, output_fp = input_fps[0], output_fps[0]
return_csv = True  # keep the flattened frame so it can be inspected afterwards

df = create_flat_dataset(
    train_fp,
    x_lst,
    x_flat_lst,
    y_target,
    uid,
    vocab_fp=vocab_fp,
    output_fp=output_fp,
    min_freq=500,
    n_events=n_events,
    return_csv=return_csv,
    nrows=n_rows,
)

Read data from /home/ec2-user/SageMaker/CMSAI/modeling/tes/data/anonymize/AE_CDiff_d00845/split/train.csv


Data size: (1522738, 368)

Label ratio for d_00845
0    0.999433
1    0.000567
Name: d_00845, dtype: float64

patient_id duplicates: 0


Removing bad words: 0 rows contain the word death
Vocab generation required


start word number:  (1522738000,)
exact word number:  1522738000
Completed vocabulary: 6290 vocabs
Nb of tokens: 6290


In [26]:
# Preview the first rows of the flattened training frame
df.head()

Unnamed: 0,999,998,997,996,995,994,993,992,991,990,...,7,6,5,4,3,2,1,0,d_00845,patient_id
0,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,8_days,d_s3320,discharge,1_days,h_1CHK1,h_G0154,2_days,h_G0154,0,IXD7U0Z74
1,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,h_92014,h_92015,172_days,d_sV066,h_90658,h_90732,h_G0008,h_G0009,0,4TJJ3BGPT
2,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,h_85027,h_99308,h_P9604,3_days,d_s72981,h_73130,h_Q0092,h_R0070,0,QIW5R08DN
3,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,d_s2724,d_sV0481,h_90658,h_99213,h_G0008,9_days,d_s37033,h_92014,0,WDPIV5G4M
4,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,36_days,d_sV762,h_81000,h_G0101,h_Q0091,19_days,d_s4660,h_99213,0,JI24AUR24


## Generate Val and Test Data

In [27]:
# Flatten the validation split and write it out (no vocabulary is built)
val_fp, output_fp = input_fps[1], output_fps[1]

create_flat_dataset(
    val_fp,
    x_lst,
    x_flat_lst,
    y_target,
    uid,
    vocab_fp=None,
    output_fp=output_fp,
    min_freq=500,
    n_events=n_events,
    return_csv=False,
    nrows=n_rows,
)

Read data from /home/ec2-user/SageMaker/CMSAI/modeling/tes/data/anonymize/AE_CDiff_d00845/split/val.csv


Data size: (190342, 368)

Label ratio for d_00845
0    0.999475
1    0.000525
Name: d_00845, dtype: float64

patient_id duplicates: 0


Removing bad words: 0 rows contain the word death


In [28]:
# Flatten the test split and write it out (no vocabulary is built)
test_fp, output_fp = input_fps[2], output_fps[2]

create_flat_dataset(
    test_fp,
    x_lst,
    x_flat_lst,
    y_target,
    uid,
    vocab_fp=None,
    output_fp=output_fp,
    min_freq=500,
    n_events=n_events,
    return_csv=False,
    nrows=n_rows,
)

Read data from /home/ec2-user/SageMaker/CMSAI/modeling/tes/data/anonymize/AE_CDiff_d00845/split/test.csv


Data size: (190343, 368)

Label ratio for d_00845
0    0.999391
1    0.000609
Name: d_00845, dtype: float64

patient_id duplicates: 0


Removing bad words: 0 rows contain the word death
