In [None]:
import numpy as np
import pandas as pd
from tqdm.autonotebook import tqdm
import pickle
from sklearn.model_selection import train_test_split

In [None]:
# Path of the JSON events file that is loaded into `df` with pd.read_json
mypath_input = '/home/jupyter/datasets/training_data/data_before_24hrs_icu/data_grouped_HADM_ID/all_events.json'

In [None]:
# Directory prefix (note the trailing slash) for the JSON files this notebook writes:
# the SUBJECT_ID split lists and the pad-length dictionaries
mypath_output = "/home/jupyter/datasets/training_data/data_before_24hrs_icu/data_grouped_HADM_ID/padded_arrays/"

In [None]:
# Load the events file, then replace whatever index read_json produced with 0..n-1
df = pd.read_json(mypath_input)
df = df.reset_index(drop=True)

In [None]:
# Later cells address this column as 'event' (e.g. tokenizer_dict['event'])
df = df.rename({'events': 'event'}, axis='columns')

In [None]:
# Preview the first two rows of the loaded frame
df.head(2)

In [None]:
# Preview two rows whose HOSPITAL_EXPIRE_FLAG equals 1
expired_mask = df['HOSPITAL_EXPIRE_FLAG'] == 1
df[expired_mask].head(2)

In [None]:
# Read ICUSTAYS.csv; its HADM_ID and LOS columns are used by the following cells
df_icu =  pd.read_csv('/home/jupyter/datasets/raw/ICUSTAYS.csv')

In [None]:
# Preview the first rows of the ICU stays table
df_icu.head()

In [None]:
# Keep only ICU stays with LOS strictly greater than 1;
# rows with LOS <= 1 or a missing LOS fail the comparison and are dropped
los_above_one = df_icu['LOS'] > 1
df_icu = df_icu[los_above_one]

In [None]:
# Number of distinct HADM_ID values in df before filtering against df_icu
len(df['HADM_ID'].unique())

In [None]:
# Keep only rows whose HADM_ID still appears in the filtered df_icu, then renumber rows from 0
icu_hadm_ids = df_icu['HADM_ID'].unique()
df = df[df['HADM_ID'].isin(icu_hadm_ids)].reset_index(drop=True)

In [None]:
# Attach the LOS column from df_icu to df with a left join on HADM_ID.
# NOTE(review): a left merge emits one output row per matching df_icu row, so a HADM_ID
# that occurs several times in df_icu would be repeated in df — confirm this is intended.
icu_los = df_icu[['HADM_ID', 'LOS']]
df = df.merge(icu_los, how='left', left_on='HADM_ID', right_on='HADM_ID')

In [None]:
# Preview df after LOS has been merged in
df.head()

In [None]:
# Drop rows for which the merge produced no LOS value
has_los = df['LOS'].notnull()
df = df[has_los]

In [None]:
# Binarise LOS: 0 when LOS is below 7, otherwise 1.
# The column is overwritten, so re-running this cell on the already binarised
# values maps everything to 0 — re-run from the merge cell instead.
def _los_to_flag(los_value):
    if los_value < 7:
        return 0
    return 1

df['LOS'] = df['LOS'].map(_los_to_flag)

In [None]:
# Cohort summary: distinct patients / admissions and the totals of both binary labels
n_patients = len(df['SUBJECT_ID'].unique())
n_admissions = len(df['HADM_ID'].unique())
n_expired = sum(df['HOSPITAL_EXPIRE_FLAG'])
n_long_stay = sum(df['LOS'])

print("number of patients {}".format(n_patients))
print("number of admissions {}".format(n_admissions))
print("number of in-hospital mortality {}".format(n_expired))
print('patient with atleast 7 days icu stay {}'.format(n_long_stay))

In [None]:
# Frequency table of SUBJECT_ID: one row per subject, 'freq' = number of rows in df.
# rename_axis + reset_index(name=...) gives the columns ['SUBJECT_ID', 'freq'] on every
# pandas version. Renaming the default 'index' column only works before pandas 2.0;
# from 2.0 on, value_counts() names the index after the column, so the old rename
# produced ['freq', 'count'] and the later `df_subject_frq.SUBJECT_ID` lookup failed.
df_subject_frq = (
    df['SUBJECT_ID']
    .value_counts()
    .rename_axis('SUBJECT_ID')
    .reset_index(name='freq')
)

In [None]:
# Split on SUBJECT_ID rather than on rows, so each subject lands in exactly one split.
# First hold out 15% of the subjects as test, then 15% of the remainder as validation.
subject_ids = df_subject_frq.SUBJECT_ID
X_train, X_test = train_test_split(subject_ids, test_size=0.15, random_state=1234)
X_train, X_valid = train_test_split(X_train, test_size=0.15, random_state=1234)

# Report the number of subjects, then the number of df rows, in each split
split_ids = (X_train, X_valid, X_test)
print(*(len(ids) for ids in split_ids))
print(*(len(df[df.SUBJECT_ID.isin(ids)]) for ids in split_ids))

In [None]:
# Export the SUBJECT_ID lists that define the train / valid / test splits as JSON
import json
import os

# open(..., 'w') does not create missing directories, so make sure the output
# directory exists first (the other output folders in this notebook get the same call).
os.makedirs(mypath_output, exist_ok=True)

for list_filename, subject_ids in (
    ('trainlist_SUBJECT_ID.json', X_train),
    ('validlist_SUBJECT_ID.json', X_valid),
    ('testlist_SUBJECT_ID.json', X_test),
):
    with open(mypath_output + list_filename, 'w') as filehandle:
        # .tolist() turns the pandas Series into a plain list that json can serialise
        json.dump(subject_ids.tolist(), filehandle)

In [None]:
# Fraction of rows with HOSPITAL_EXPIRE_FLAG set, computed separately for each split
mortality_ratios = []
for split_ids in (X_train, X_valid, X_test):
    split_rows = df[df.SUBJECT_ID.isin(split_ids)]
    mortality_ratios.append(sum(split_rows['HOSPITAL_EXPIRE_FLAG'])/len(split_rows))

print("The ratio of in-hospital mortality in train, valid and test are {:.3} {:.3} {:.3}".format(*mortality_ratios))

In [None]:
# Fraction of rows with the binary LOS label set, computed separately for each split
long_stay_ratios = []
for split_ids in (X_train, X_valid, X_test):
    split_rows = df[df.SUBJECT_ID.isin(split_ids)]
    long_stay_ratios.append(sum(split_rows['LOS'])/len(split_rows))

print("The ratio of icustays atleast 7 days in train, valid and test are {:.4} {:.4} {:.4}".format(*long_stay_ratios))

In [None]:
def cal_max_len(x):
    """Return the longest and the 99.9th-percentile element length found in ``x``.

    Parameters
    ----------
    x : iterable of sized objects
        For example a pd.Series (or a plain list) whose elements support ``len()``.

    Returns
    -------
    (int, int)
        ``(max_len, pctl_999)``: the maximum element length and the 99.9th
        percentile of the element lengths, truncated to an integer.

    Raises
    ------
    ValueError
        If ``x`` contains no elements.
    """
    # Iterate over the values themselves. Indexing with x[i] is a label lookup on a
    # Series and raises KeyError whenever the index is not exactly 0..n-1
    # (e.g. after rows were filtered out without a reset_index).
    lengths = [len(item) for item in x]
    if not lengths:
        raise ValueError("cal_max_len() needs at least one element to measure")
    max_len = int(max(lengths))
    pctl_999 = int(np.percentile(lengths, 99.9))
    return max_len, pctl_999

In [None]:
# For every column whose name contains 'event', record the longest element length
# and the 99.9th-percentile element length
max_len_dict = {}
pctl_999_dict = {}
for col in df.columns:
    if 'event' in col:
        # cal_max_len returns both numbers at once; call it a single time per column
        # instead of scanning the whole column twice
        col_max_len, col_pctl_999 = cal_max_len(df[col])
        max_len_dict[col] = col_max_len
        pctl_999_dict[col] = col_pctl_999

In [None]:
# Display the maximum length recorded for each event column
max_len_dict

In [None]:
# Display the 99.9th-percentile length recorded for each event column
pctl_999_dict

In [None]:
import json

# Print the per-column maximum lengths and persist them as JSON.
# The lengths were measured over every row of df, not only the training rows.
print(max_len_dict)
max_len_json_path = mypath_output+'max_padlen_dict.json'
with open(max_len_json_path, 'w') as out_file:
    json.dump(max_len_dict, out_file)

In [None]:
import json

# Print the per-column 99.9th-percentile lengths and persist them as JSON.
# The lengths were measured over every row of df, not only the training rows.
print(pctl_999_dict)
pctl_json_path = mypath_output+'pctl999_padlen_dict.json'
with open(pctl_json_path, 'w') as out_file:
    json.dump(pctl_999_dict, out_file)

### Create and Fit Tokenizer for Train Data

In [None]:
# Materialise the three splits: select rows by SUBJECT_ID membership, renumber from 0
def _rows_for_subjects(subject_ids):
    return df[df.SUBJECT_ID.isin(subject_ids)].reset_index(drop = True)

df_train = _rows_for_subjects(X_train)
df_valid = _rows_for_subjects(X_valid)
df_test = _rows_for_subjects(X_test)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
import os

# Folder that receives the pickled tokenizers; created on first use
tokenizer_path = "/home/jupyter/output/tokenizer/"
os.makedirs(tokenizer_path, exist_ok=True)

In [None]:
%%time
#Training tokenizers on train dataset
# One tokenizer per column whose name contains 'event', fitted on the training rows only
tokenizer_dict = {}
event_columns = [col for col in df.columns if 'event' in col]
for col in event_columns:
    # lower-case the text, split on commas, and filter out no characters
    event_tokenizer = Tokenizer(lower=True, split=',', filters='')
    event_tokenizer.fit_on_texts(df_train[col])
    tokenizer_dict[col] = event_tokenizer
    print("Tokenizer built for {}".format(col))
print('All tokenizers are built')

In [None]:
# Persist every fitted tokenizer to disk.
# Each entry is (pickle file name, key in tokenizer_dict); the 'event' column is
# stored under the chartevents file name.
tokenizer_files = [
    ("tokenizer_chartevents.pickle", 'event'),
    ("tokenizer_inputevents_cv.pickle", 'inputevents_cv'),
    ("tokenizer_inputevents_mv.pickle", 'inputevents_mv'),
    ("tokenizer_labevents.pickle", 'labevents'),
    ("tokenizer_microbioevents.pickle", 'microbioevents'),
    ("tokenizer_noteevents.pickle", 'noteevents'),
    ("tokenizer_outputevents.pickle", 'outputevents'),
    ("tokenizer_prescriptionevents.pickle", 'prescriptionevents'),
    ("tokenizer_procedureevents.pickle", 'procedureevents'),
]
for pickle_name, column_key in tokenizer_files:
    with open(tokenizer_path+pickle_name, 'wb') as handle:
        pickle.dump(tokenizer_dict[column_key],handle)

### Integer Encode Train Data, Valid Data, Test Data

In [None]:
def tokenizer_low_count(tokenizer, max_count=1):
    """Remove rarely seen words from a tokenizer's vocabulary and register 'UNK'.

    The tokenizer is modified in place: its ``oov_token`` is set to ``'UNK'``,
    ``'UNK'`` gets an entry in ``word_index``, and every word whose count in
    ``word_counts`` is at most ``max_count`` is removed from ``word_index``.

    Parameters
    ----------
    tokenizer : object exposing ``word_index`` and ``word_counts`` mappings
        (a fitted Keras ``Tokenizer`` in this notebook).
    max_count : int, default 1
        Words counted this many times or fewer are removed. With the default
        only words that occur once are removed.

    Returns
    -------
    The same tokenizer object that was passed in.
    """
    t = tokenizer
    # initialise the unknown token
    t.oov_token = 'UNK'
    # Give 'UNK' an id one past the current vocabulary size. Only do this the first
    # time: on a repeated call the vocabulary has already shrunk, and recomputing
    # the id could hand 'UNK' a number that a surviving word still uses.
    if 'UNK' not in t.word_index:
        t.word_index['UNK'] = len(t.word_index) + 1
    # Collect the low-count words first so the vocabulary is not mutated while
    # word_counts is being iterated. 'UNK' itself is never a removal candidate.
    low_count_words = [
        word for word, count in t.word_counts.items()
        if count <= max_count and word != 'UNK'
    ]
    # pop() instead of del: a word that is already gone (second call on the same
    # tokenizer) must not raise KeyError.
    for word in low_count_words:
        t.word_index.pop(word, None)
    return t

In [None]:
# Reload the procedure-events tokenizer that this notebook pickled earlier
with open(tokenizer_path+"tokenizer_procedureevents.pickle", 'rb') as pickle_file:
    t = pickle.load(pickle_file)

# Remove single-occurrence tokens from the vocabulary and set 'UNK' as the OOV token
tokenizer_low_count(t)

# Replace the tokens of every row by their integer codes, in all three splits
for split_df in (df_train, df_valid, df_test):
    split_df['procedureevents'] = split_df['procedureevents'].map(lambda y: t.texts_to_sequences([y])[0])

In [None]:
# Reload the chart-events tokenizer (fitted on the 'event' column) pickled earlier
with open(tokenizer_path+"tokenizer_chartevents.pickle", 'rb') as pickle_file:
    t = pickle.load(pickle_file)

# Remove single-occurrence tokens from the vocabulary and set 'UNK' as the OOV token
tokenizer_low_count(t)

# Replace the tokens of every row by their integer codes, in all three splits
for split_df in (df_train, df_valid, df_test):
    split_df['event'] = split_df['event'].map(lambda y: t.texts_to_sequences([y])[0])

In [None]:
# Reload the inputevents_cv tokenizer that this notebook pickled earlier
with open(tokenizer_path+"tokenizer_inputevents_cv.pickle", 'rb') as pickle_file:
    t = pickle.load(pickle_file)

# Remove single-occurrence tokens from the vocabulary and set 'UNK' as the OOV token
tokenizer_low_count(t)

# Replace the tokens of every row by their integer codes, in all three splits
for split_df in (df_train, df_valid, df_test):
    split_df['inputevents_cv'] = split_df['inputevents_cv'].map(lambda y: t.texts_to_sequences([y])[0])

In [None]:
# Reload the inputevents_mv tokenizer that this notebook pickled earlier
with open(tokenizer_path+"tokenizer_inputevents_mv.pickle", 'rb') as pickle_file:
    t = pickle.load(pickle_file)

# Remove single-occurrence tokens from the vocabulary and set 'UNK' as the OOV token
tokenizer_low_count(t)

# Replace the tokens of every row by their integer codes, in all three splits
for split_df in (df_train, df_valid, df_test):
    split_df['inputevents_mv'] = split_df['inputevents_mv'].map(lambda y: t.texts_to_sequences([y])[0])

In [None]:
# Reload the labevents tokenizer that this notebook pickled earlier
with open(tokenizer_path+"tokenizer_labevents.pickle", 'rb') as pickle_file:
    t = pickle.load(pickle_file)

# Remove single-occurrence tokens from the vocabulary and set 'UNK' as the OOV token
tokenizer_low_count(t)

# Replace the tokens of every row by their integer codes, in all three splits
for split_df in (df_train, df_valid, df_test):
    split_df['labevents'] = split_df['labevents'].map(lambda y: t.texts_to_sequences([y])[0])

In [None]:
# Reload the microbioevents tokenizer that this notebook pickled earlier
with open(tokenizer_path+"tokenizer_microbioevents.pickle", 'rb') as pickle_file:
    t = pickle.load(pickle_file)

# Remove single-occurrence tokens from the vocabulary and set 'UNK' as the OOV token
tokenizer_low_count(t)

# Replace the tokens of every row by their integer codes, in all three splits
for split_df in (df_train, df_valid, df_test):
    split_df['microbioevents'] = split_df['microbioevents'].map(lambda y: t.texts_to_sequences([y])[0])

In [None]:
# Reload the noteevents tokenizer that this notebook pickled earlier
with open(tokenizer_path+"tokenizer_noteevents.pickle", 'rb') as pickle_file:
    t = pickle.load(pickle_file)

# Remove single-occurrence tokens from the vocabulary and set 'UNK' as the OOV token
tokenizer_low_count(t)

# Replace the tokens of every row by their integer codes, in all three splits
for split_df in (df_train, df_valid, df_test):
    split_df['noteevents'] = split_df['noteevents'].map(lambda y: t.texts_to_sequences([y])[0])

In [None]:
# Reload the outputevents tokenizer that this notebook pickled earlier
with open(tokenizer_path+"tokenizer_outputevents.pickle", 'rb') as pickle_file:
    t = pickle.load(pickle_file)

# Remove single-occurrence tokens from the vocabulary and set 'UNK' as the OOV token
tokenizer_low_count(t)

# Replace the tokens of every row by their integer codes, in all three splits
for split_df in (df_train, df_valid, df_test):
    split_df['outputevents'] = split_df['outputevents'].map(lambda y: t.texts_to_sequences([y])[0])

In [None]:
# Reload the prescriptionevents tokenizer that this notebook pickled earlier
with open(tokenizer_path+"tokenizer_prescriptionevents.pickle", 'rb') as pickle_file:
    t = pickle.load(pickle_file)

# Remove single-occurrence tokens from the vocabulary and set 'UNK' as the OOV token
tokenizer_low_count(t)

# Replace the tokens of every row by their integer codes, in all three splits
for split_df in (df_train, df_valid, df_test):
    split_df['prescriptionevents'] = split_df['prescriptionevents'].map(lambda y: t.texts_to_sequences([y])[0])

In [None]:
import os

# Write the integer-encoded splits to disk as record-oriented JSON
file_save_path = '/home/jupyter/datasets/training_data/data_before_24hrs_icu/data_grouped_HADM_ID/train_test_valid/'
os.makedirs(file_save_path, exist_ok=True)

for encoded_df, json_name in (
    (df_train, "all_events_train.json"),
    (df_valid, "all_events_valid.json"),
    (df_test, "all_events_test.json"),
):
    encoded_df.to_json(file_save_path+json_name, orient = 'records')