# Output dataset for readmissions with full raw data

**Author: Lin Lee Cheong <br> Last updated: 11/20/20**

**Notebook to convert 365 version to 1000 version, and save CSV for readmission**
- from full raw data
- up to 1000 events, from full 365 day dataset

**Required:**
- input file: raw_data.csv
- outputs: csv files in 1000 format, and vocabulary

**Nomenclature:**
- d30: **30** days
- s30: max 30 sequence a day

In [1]:
import torch
torch.__version__

'1.7.0'

In [25]:
import math
import os
import pickle

import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader

from utils import get_cuda
from data_proc import read_data, remove_death, build_vocab
from dataset_func import build_dataset, BuildDataset, get_dataloader

torch.multiprocessing.set_sharing_strategy('file_system') 

### Input filepaths for training, test, vocabulary

In [3]:
!ls  ../../../data/readmission/raw_data/

readmission_input_targets_365_v2.csv  readmission_targets_with_date.csv


In [4]:
train_365_fp = '../../../data/readmission/fold_0/train/raw_train_data.csv'
val_365_fp = '../../../data/readmission/fold_0/test/raw_test_data.csv'

# Options
ndays =30
x_lst = [str(x) for x in range(ndays, -1, -1)]
x_flat_lst = [str(x) for x in range(999, -1, -1)]
y_target = "unplanned_readmission"
uid = "discharge_id"

### Functions

In [5]:
from more_itertools import unique_everseen
def move_ad_dis(events_in_day):
    """Move admission and discharge to the end of the list"""
    if not isinstance(events_in_day, list):
        return events_in_day
    
    events_in_day = list(unique_everseen(events_in_day))
    has_admission = False
    has_discharge = False
    if "admission" in events_in_day:
        has_admission = True
        events_in_day.remove("admission")
        
    if "discharge" in events_in_day:
        has_discharge = True
        events_in_day.remove("discharge")
    
    if has_admission:
        events_in_day.append('admission')
    
    if has_discharge:
        events_in_day.append('discharge')
    
    return events_in_day
    

In [9]:
def get_days(x):
    """Calculate number of days between events"""
    new_lst = []
    counter = 1
    counting = False
    
    try:
        for event in x:
            nan_event = (
                (event == np.nan)
                or (type(event) == float and math.isnan(event))
                or (str(event) == "nan")
            )
            
            if nan_event:
                if not counting:
                    counting = True
                counter += 1
                
            if not nan_event:
                if counting:
                    counting = False
                    event = f"{counter + 1}_days," + event
                    new_lst.append(event)
                    counter = 0
                else:
                    event = "1_days," + event
                    new_lst.append(event)
    except:
        print(f"error: {event}")
        print(f"counter: {counter}")
        print(f"new lst: {new_lst}")
        print(f"org lst: {x}")

    return new_lst

In [10]:
def create_flat_dataset_loopy(
    data_fp, x_lst, x_flat_lst, y_target, uid, output_fp, return_csv=False, test=0
):
    '''
    Arguments:
    -----------
        data_fp : input filepath to CSV containing 365 dataset
        x_lst : list of column names to use in 365 dataset (each day is a col)
        x_flat_lst : length of columns in flattened dataset, usually 1000
        y_target : label name
        uid : unique tag for each observations, used for dedupe
        output_fp : path to write out flattened CSV
        return_csv : bool returns DF if enabled
        test : 0 if read all, otherwise reads test number of rows
    '''
    
    # read in raw dataset, remove deaths
    raw_df = read_data(
        data_fp=data_fp, check=True, y_target=y_target, uid=uid, test=test
    )
    raw_df = remove_death(raw_df, y_target, x_lst)

    # loopy instead of apply
    patient_id, discharge_dt, discharge_id, label = [], [], [], []
    data = []
    n_events = len(x_flat_lst)
    
    for _, row in raw_df.iterrows():
        patient_id.append(row['patient_id'])
        discharge_dt.append(row['discharge_dt'])
        discharge_id.append(row[uid])
        label.append(row[y_target])
        
        #print(row[uid])
        events_by_day = row[x_lst].values.tolist()
        events_day_adjusted = np.array(get_days(events_by_day)) # counted days in between no events and inserted
        lst = [move_ad_dis(str(day).replace(" ", "").split(",")) for day in events_day_adjusted.ravel("K")]
        lst = [event for day in lst for event in day]

        if '_days' in lst[0]:
            lst = lst[1:]
        if len(lst) >= n_events:
            lst = lst[-n_events:]

        data.append(["<pad>"] * (n_events - len(lst)) + lst)

    loopy = pd.DataFrame(data)
    loopy['patient_id'] = patient_id
    loopy['discharge_dt'] = discharge_dt
    loopy['discharge_id'] = discharge_id
    loopy['unplanned_readmission'] = label        
            
    loopy.to_csv(output_fp, index=False)
    
    print(f'Completed: {loopy.shape}')
    if return_csv:
        return loopy
    


In [12]:
    org_df = read_data(
        data_fp=train_365_fp, check=True, y_target=y_target, uid=uid, test=0
    )
    raw_df = remove_death(org_df, y_target, x_lst)

    # loopy instead of apply
    patient_id, discharge_dt, discharge_id, label = [], [], [], []
    data = []
    n_events = len(x_flat_lst)
    

Read data from ../../../data/readmission/fold_0/train/raw_train_data.csv


Data size: (1295326, 370)

Label ratio for unplanned_readmission
False    0.855436
True     0.144564
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 44465 rows contain the word death


In [28]:
    for _, row in raw_df.iterrows():
        patient_id.append(row['patient_id'])
        discharge_dt.append(row['discharge_dt'])
        discharge_id.append(row[uid])
        label.append(row[y_target])
        
        #print(row[uid])
        events_by_day = row[x_lst].values.tolist()
        events_day_adjusted = np.array(get_days(events_by_day)) # counted days in between no events and inserted
        lst = [move_ad_dis(str(day).replace(" ", "").split(",")) for day in events_day_adjusted.ravel("K")]
        lst = [event for day in lst for event in day]

        if '_days' in lst[0]:
            lst = lst[1:]
        if len(lst) >= n_events:
            lst = lst[-n_events:]

        data.append(["<pad>"] * (n_events - len(lst)) + lst)

In [14]:
row

patient_id                                                       100002085
discharge_dt                                                      20110922
discharge_id                                            100002085_20110922
365                                                                    NaN
364                                                                    NaN
                                               ...                        
3                        d_5119, d_51919, d_7931, d_80709, h_00520, h_7...
2                        d_5119, d_5128, d_5180, d_7931, d_80709, h_710...
1                        d_5119, d_5183, d_80709, d_8600, d_V5399, h_32...
0                        admission, d_496, d_72887, d_78605, d_78650, d...
unplanned_readmission                                                False
Name: 0, Length: 370, dtype: object

In [15]:
        events_by_day = row[x_lst].values.tolist()

In [16]:
events_by_day

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'd_43310, d_4414, h_99204',
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 'admission, d_486, d_496, d_5128, d_78605, d_78650, d_80704, d_80709, d_8600, d_8604, d_E8859, d_V5399, h_3120F, h_32551, h_71010, h_71101, h_71250, h_99222, h_99285, p_3404',
 'd_2722, d_2930, d_3310, d_3312, d_33183, d_4019, d_4928, d_5128, d_78097, d_80709, d_8600, h_70450, h_71010, h_99221, h_99223, h_99232, h_99233',
 'd_2722, d_4019, d_43310, d_51189, d_5119, d_5128, d_80709, d_8600, h_71010, h_93880, h_99231, h_99232, h_99233',
 'd_2722, d_4019, d_49121, d_51189, d_5128, d_80700, d_80709, d_8600, h_71010, h_99231, h_99232, h_99233',
 'd_2722, d_4019, d_49121, d_51189, d_5119, d_5128, d_80700, d_80709, d_8600, h_71010, h_99232, h_99233',
 'd_2722, d_4019, d_5128, d_80709, d_8600, d_V5882, h_71010, h_99231, h_99232, h_99233',
 'd_29410, d_5128, d_80709, h_99222, h_99231, h_99233',
 'd_486, d_5119, d_5128, d_5183, d_80709, h_71010, h_71250, h_99231, h_99233',
 'd_496, d

In [27]:
events_day_adjusted = np.array(get_days(events_by_day)) # counted days in between no events and inserted

In [18]:
x = events_by_day

In [26]:
    new_lst = []
    counter = 1
    counting = False
    
    try:
        for event in x:
            nan_event = (
                (event == np.nan)
                or (type(event) == float and math.isnan(event))
                or (str(event) == "nan")
            )
            
            if nan_event:
                if not counting:
                    counting = True
                counter += 1
                
            if not nan_event:
                if counting:
                    counting = False
                    event = f"{counter + 1}_days," + event
                    new_lst.append(event)
                    counter = 0
                else:
                    event = "1_days," + event
                    new_lst.append(event)
    except:
        print(f"error: {event}")
        print(f"counter: {counter}")
        print(f"new lst: {new_lst}")
        print(f"org lst: {x}")

In [24]:
import math
(type(event) == float and math.isnan(event))

True

In [38]:
    loopy = pd.DataFrame(data)
    loopy['patient_id'] = patient_id[1:]
    loopy['discharge_dt'] = discharge_dt[1:]
    loopy['discharge_id'] = discharge_id[1:]
    loopy['unplanned_readmission'] = label [1:]       
            
    loopy.to_csv('../lstm/loopy_train_all_fold0_30days.csv', index=False)

In [None]:
loopy.shape

In [8]:
train_df = create_flat_dataset_loopy(
    data_fp=train_365_fp, 
    x_lst=x_lst,
    x_flat_lst=x_flat_lst, 
    y_target=y_target, 
    uid=uid, 
    output_fp='../lstm/loopy_train_all_fold0_30days.csv',
    return_csv=True, 
    test=0
)

Read data from ../../../data/readmission/fold_0/train/raw_train_data.csv


Data size: (1295326, 370)

Label ratio for unplanned_readmission
False    0.855436
True     0.144564
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 44465 rows contain the word death
error: nan
counter: 1
new lst: []


IndexError: list index out of range

In [39]:
val_df = create_flat_dataset_loopy(
    data_fp=val_365_fp, 
    x_lst=x_lst,
    x_flat_lst=x_flat_lst, 
    y_target=y_target, 
    uid=uid, 
    output_fp='../lstm/loopy_val_all_fold0_30days.csv',
    return_csv=True, 
    test=0
)

Read data from ../../../data/readmission/fold_0/test/raw_test_data.csv


Data size: (323832, 370)

Label ratio for unplanned_readmission
False    0.855434
True     0.144566
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 11075 rows contain the word death
Completed: (312757, 1004)


In [25]:
did = '102878613_20111115'
[x for x in raw_df[raw_df.discharge_id == did].values.tolist()[0] if x != '<pad>']

['d_V560',
 'h_90960',
 '5_days',
 'd_9961',
 'h_93990',
 '26_days',
 'd_V560',
 'h_90960',
 '1_days',
 'd_36511',
 'h_92012',
 '22_days',
 'd_4260',
 'd_42781',
 'h_93288',
 'h_99214',
 '5_days',
 'd_V560',
 'h_90960',
 '2_days',
 'd_5856',
 'd_72981',
 'd_V4511',
 'h_1000F',
 'h_1036F',
 'h_99212',
 '7_days',
 'd_61172',
 'd_6119',
 'd_79389',
 'h_76645',
 'h_77051',
 'h_G0204',
 '13_days',
 'd_25052',
 'h_A4253',
 'p_D1E',
 '1_days',
 'd_25000',
 'd_2512',
 'd_2893',
 'd_51889',
 'd_6111',
 'd_61172',
 'd_6119',
 'd_78009',
 'd_7808',
 'd_79381',
 'h_19102',
 'h_19103',
 'h_38505',
 'h_71010',
 'h_88305',
 'h_99285',
 'h_A0425',
 'h_A0427',
 '7_days',
 'd_79389',
 'h_99212',
 '1_days',
 'd_5856',
 'd_99673',
 'd_V560',
 'h_36145',
 'h_75790',
 'h_75962',
 'h_75978',
 'h_90960',
 'h_G0392',
 'h_G0393',
 'h_J2997',
 '8_days',
 'd_44020',
 'h_73660',
 '7_days',
 'd_79389',
 'h_76645',
 '6_days',
 'd_6110',
 'h_99243',
 '8_days',
 'd_8930',
 'h_99213',
 '1_days',
 'd_V560',
 'h_90960',


In [26]:
for idx in range(0, 30):
    print([x for x in org_df.loc[org_df.discharge_id==did, str(idx)]])

['admission, d_486, d_486, d_5990, d_7197, d_7282, discharge, h_99239, h_A0425, h_A0428']
[nan]
[nan]
['d_78079, h_93010']
['admission, d_340, d_490, d_51889, d_5990, d_7850, h_71020, h_71020, h_96361, h_96374, h_99285, h_A0425, h_A0429, h_J7030']
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
[nan]
['d_5950, h_81002, h_99213']
[nan]
[nan]
[nan]


In [27]:
org_obs = org_df[org_df.discharge_id == did]
org_obs

Unnamed: 0,patient_id,discharge_dt,discharge_id,365,364,363,362,361,360,359,...,8,7,6,5,4,3,2,1,0,unplanned_readmission
48314,102878613,20111115,102878613_20111115,,,,,,,,...,,,,,"admission, d_340, d_490, d_51889, d_5990, d_78...","d_78079, h_93010",,,"admission, d_486, d_486, d_5990, d_7197, d_728...",True


In [31]:
    
    x = np.array(get_days(org_obs[x_lst].values[0]))
    lst = [move_ad_dis(str(day).replace(" ", "").split(",")) for day in x.ravel("K")]
    lst = [event for day in lst for event in day]

In [32]:
len(lst)

98

In [33]:
lst

['21_days',
 'discharge',
 '1_days',
 'h_1BGL1',
 '4_days',
 'h_G0151',
 '14_days',
 'h_G0151',
 '14_days',
 'd_340',
 'h_99214',
 '5_days',
 'd_340',
 'h_G0180',
 '1_days',
 'd_7859',
 'h_93880',
 '26_days',
 'd_43311',
 'h_99213',
 '79_days',
 'd_340',
 'h_99213',
 'h_G8553',
 '92_days',
 'd_7823',
 'h_36415',
 'h_80051',
 'h_82565',
 'h_84520',
 'h_99213',
 '6_days',
 'd_7852',
 'h_93306',
 '22_days',
 'd_5854',
 'h_99205',
 '4_days',
 'h_36415',
 'h_80069',
 'h_81001',
 'h_82306',
 'h_82570',
 'h_83540',
 'h_83550',
 'h_83970',
 'h_84156',
 'h_85025',
 'h_86334',
 'h_86335',
 '2_days',
 'd_5854',
 'h_76775',
 '15_days',
 'h_36415',
 'h_80053',
 'h_80061',
 'h_84443',
 'h_85025',
 'h_86141',
 '7_days',
 'd_5854',
 'h_99214',
 '18_days',
 'd_5854',
 'h_99214',
 'h_G0420',
 '10_days',
 'd_5950',
 'h_81002',
 'h_99213',
 '22_days',
 'd_340',
 'd_490',
 'd_51889',
 'd_5990',
 'd_7850',
 'h_71020',
 'h_96361',
 'h_96374',
 'h_99285',
 'h_A0425',
 'h_A0429',
 'h_J7030',
 'admission',
 '1_

In [34]:
no_death_df = remove_death(org_df, y_target, x_lst)



Removing bad words: 44492 rows contain the word death


In [35]:
nodeath_obs = no_death_df[no_death_df.discharge_id == did]
nodeath_obs

Unnamed: 0,patient_id,discharge_dt,discharge_id,365,364,363,362,361,360,359,...,8,7,6,5,4,3,2,1,0,unplanned_readmission
48314,102878613,20111115,102878613_20111115,,,,,,,,...,,,,,"admission, d_340, d_490, d_51889, d_5990, d_78...","d_78079, h_93010",,,"admission, d_486, d_486, d_5990, d_7197, d_728...",True


In [36]:
    nodeath_x = np.array(get_days(nodeath_obs[x_lst].values[0]))
    nodeath_lst = [move_ad_dis(str(day).replace(" ", "").split(",")) for day in nodeath_x.ravel("K")]
    nodeath_lst = [event for day in nodeath_lst for event in day]

In [37]:
len(nodeath_lst)

98

In [41]:
nodeath_obs[x_lst].apply(flatten, axis=1).tolist()

[['<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',

In [62]:
patient_id, discharge_dt, discharge_id, label = [], [], [], []
data = []
n_events = 1000
counter = 0
lim = 1e9
for _, row in no_death_df.iterrows():
    patient_id.append(row['patient_id'])
    discharge_dt.append(row['discharge_dt'])
    discharge_id.append(row['discharge_id'])
    label.append(row['unplanned_readmission'])
    
    x = row[x_lst].values.tolist()
    x2 = np.array(get_days(x))
    lst = [move_ad_dis(str(day).replace(" ", "").split(",")) for day in x2.ravel("K")]
    lst = [event for day in lst for event in day]
    
    if '_days' in lst[0]:
        lst = lst[1:]
    if len(lst) >= n_events:
        lst = lst[-n_events:]

    data.append(["<pad>"] * (n_events - len(lst)) + lst)
    
    counter += 1
    if counter > lim:
        break

In [63]:
loopy = pd.DataFrame(data)
loopy['patient_id'] = patient_id
loopy['discharge_dt'] = discharge_dt
loopy['discharge_id'] = discharge_id
loopy['unplanned_readmission'] = label

In [64]:
loopy.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,994,995,996,997,998,999,patient_id,discharge_dt,discharge_id,unplanned_readmission
0,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,h_99238,h_99306,h_A0425,h_A0428,admission,discharge,100002085,20110922,100002085_20110922,False
1,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,h_90732,h_99232,h_99233,h_G0009,p_9955,discharge,100002829,20111013,100002829_20111013,False
2,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,d_V4989,h_99231,h_A0425,h_A0428,admission,discharge,100003379,20091207,100003379_20091207,True
3,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,d_78650,h_99231,2_days,d_78650,h_99238,discharge,100008869,20101116,100008869_20101116,False
4,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,<pad>,...,d_5849,d_78904,h_99232,h_99239,admission,discharge,100009927,20090617,100009927_20090617,False


In [65]:
import os
os.makedirs('../lstm/tmp_loopy/')

FileExistsError: [Errno 17] File exists: '../lstm/tmp_loopy/'

In [None]:
loopy.iloc[:-10000].to_csv('../lstm/tmp_loopy/loppy_train_all.csv', index=False)

loopy.iloc[-10000:].to_csv('../lstm/tmp_loopy/loppy_test_all.csv', index=False)

### Generate training data and vocabulary

In [19]:
for idx, (train_fp, train_dl_fp, vocab_fp) in enumerate(zip(train_fps, train_dl_fps, vocab_fps)):
    print("\n\n" + "*" * 100)
    print(f"Processing fold {idx}\n" + "*" * 100)
    
    create_flat_dataset(
        data_fp=train_fp,
        x_lst=x_lst,
        x_flat_lst=x_flat_lst,
        y_target=y_target,
        uid=uid,
        train=True,
        vocab_fp=vocab_fp,
        datalist_fp=train_dl_fp,
        min_freq=1,
        save_csv=True,
    )
    
    print(f"Completed, wrote to vocab: {vocab_fp}, \n train data:{train_dl_fp}")



****************************************************************************************************
Processing fold 0
****************************************************************************************************
Read data from ../../../data/readmission/fold_0/train/raw_train_data.csv


Data size: (50000, 370)

Label ratio for unplanned_readmission
False    0.8539
True     0.1461
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 1637 rows contain the word death
Vocab generation required


start word number:  (48363000,)
exact word number:  48363000
Completed vocabulary: 20058 vocabs
Nb of tokens: 20058
Completed, wrote to vocab: ../../../data/readmission/fold_0/vocab/vocab_1000_vall_30days, 
 train data:../../../data/readmission/fold_0/train/train_datalist_1000_vall_30days.pkl


****************************************************************************************************
Processing fold 1
**************************************

### Generate test data

In [20]:
for idx, (test_fp, test_dl_fp, vocab_fp) in enumerate(
    zip(test_fps, test_dl_fps, vocab_fps)
):
    print("\n\n" + "*" * 100)
    print(f"Processing fold {idx}\n" + "*" * 100)
    create_flat_dataset(
        data_fp=test_fp,
        x_lst=x_lst,
        x_flat_lst=x_flat_lst,
        y_target=y_target,
        uid=uid,
        train=False,
        vocab_fp=vocab_fp,
        datalist_fp=train_dl_fp,
        save_csv=True
    )

    print(f"Completed, read from vocab: {vocab_fp}, \n wrote to {test_dl_fp}")



****************************************************************************************************
Processing fold 0
****************************************************************************************************
Read data from ../../../data/readmission/fold_0/test/raw_test_data.csv


Data size: (50000, 370)

Label ratio for unplanned_readmission
False    0.8579
True     0.1421
Name: unplanned_readmission, dtype: float64

Discharge_id duplicates: 0


Removing bad words: 1807 rows contain the word death
Completed, read from vocab: ../../../data/readmission/fold_0/vocab/vocab_1000_vall_30days, 
 wrote to ../../../data/readmission/fold_0/test/test_datalist_1000_vall_30days.pkl


****************************************************************************************************
Processing fold 1
****************************************************************************************************
Read data from ../../../data/readmission/fold_1/test/raw_test_data.csv


Data size: (5