## Data preparation

* All models should be run on the same datasets so that their results are directly comparable
* Preparing the datasets here ensures that the train, validation and test sets are identical across models

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Loading all the datasets

In [2]:
# Load every source table from the relative datasets/ directory.
demo_df = pd.read_csv('datasets/Demographics.csv')
ICD_df = pd.read_csv('datasets/ICD.csv')
INP_MED_df = pd.read_csv('datasets/INP_MED.csv')
OUT_MED_df = pd.read_csv('datasets/OUT_MED.csv')
LABS_df = pd.read_csv('datasets/LABS.csv')
# Vision.csv is read with its first CSV column as the row index (index_col=0);
# the other tables keep the default integer index.
vision_df = pd.read_csv('datasets/Vision.csv', index_col=0)
vitals_df = pd.read_csv('datasets/Vitals.csv')

#### International Classification of Diseases ICD - Remove binary indicators (redundant)

In [3]:
# Keep only the columns whose name contains "frequency", plus the "idx" key column.
ICD_columns = ICD_df.filter(like="frequency").columns.tolist() + ["idx"]
ICD_df = ICD_df.loc[:, ICD_columns]
ICD_df.head()

Unnamed: 0,MATERNAL CAUSES OF PERINATAL MORBIDITY AND MORTALITY:frequency,OTHER CONDITIONS ORIGINATING IN THE PERINATAL PERIOD:frequency,COMPLICATIONS MAINLY RELATED TO PREGNANCY:frequency,COMPLICATIONS OCCURRING MAINLY IN THE COURSE OF LABOR AND DELIVERY:frequency,COMPLICATIONS OF THE PUERPERIUM:frequency,ECTOPIC AND MOLAR PREGNANCY:frequency,"NORMAL DELIVERY, AND OTHER INDICATIONS FOR CARE IN PREGNANCY, LABOR, AND DELIVERY:frequency",OTHER MATERNAL AND FETAL COMPLICATIONS:frequency,OTHER PREGNANCY WITH ABORTIVE OUTCOME:frequency,Anencephalus and similar anomalies:frequency,...,Symptoms involving cardiovascular system:frequency,Symptoms involving digestive system:frequency,Symptoms involving head and neck:frequency,Symptoms involving nervous and musculoskeletal systems:frequency,Symptoms involving respiratory system and other chest symptoms:frequency,Symptoms involving skin and other integumentary tissue:frequency,Symptoms involving urinary system:frequency,SUPPLEMENTARY CLASSIFICATION OF EXTERNAL CAUSES OF INJURY AND POISONING:frequency,SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS AND CONTACT WITH HEALTH SERVICES:frequency,idx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,84
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,2248
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,0,0,0,0,2271
3,0,0,0,0,0,0,0,0,0,0,...,1,3,0,0,2,0,0,0,0,1691
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3286


#### Inpatient medications - Remove binary indicators (redundant)

In [4]:
# Keep only the columns whose name contains "Frequeny" (spelling as in the data),
# plus the "idx" key column.
INP_MED_columns = INP_MED_df.filter(like="Frequeny").columns.tolist() + ["idx"]
INP_MED_df = INP_MED_df.loc[:, INP_MED_columns]
INP_MED_df.head()

Unnamed: 0,"LAXATIVES, LOCAL/RECTAL:Frequeny",PLATELET AGGREGATION INHIBITORS:Frequeny,nan:Frequeny,"NOSE PREPARATIONS, VASOCONSTRICTORS(OTC):Frequeny","ANALGESIC/ANTIPYRETICS,NON-SALICYLATE:Frequeny",ANTIHYPERLIPIDEMIC - HMG COA REDUCTASE INHIBITORS:Frequeny,SELECTIVE SEROTONIN REUPTAKE INHIBITOR (SSRIS):Frequeny,"VASODILATORS,CORONARY:Frequeny",ANTIEMETIC/ANTIVERTIGO AGENTS:Frequeny,BETA-ADRENERGIC BLOCKING AGENTS:Frequeny,...,THROMBOLYTIC - NUCLEOTIDE TYPE:Frequeny,SELECTIVE SEROTONIN 5-HT2A INVERSE AGONISTS (SSIA):Frequeny,ANTINEOPLASTIC - HEDGEHOG PATHWAY INHIBITOR:Frequeny,ORGAN TRANSPLANTATION PRESERVATION SOLUTIONS:Frequeny,FEEDING DEVICES:Frequeny,"DRUGS TO TX GAUCHER DX-TYPE 1, SUBSTRATE REDUCING:Frequeny","TOPICAL PREPARATIONS,NON-MEDICINAL:Frequeny","ANTI-INFLAMMATORY, INTERLEUKIN-1 BETA BLOCKERS:Frequeny","ACNE AGENTS,SYSTEMIC:Frequeny",idx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,84
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2248
2,0,0,0,0,2,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2271
3,1,2,0,0,4,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,1691
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3286


#### Outpatient medications - Remove binary indicators (redundant)

In [5]:
# Keep only the columns whose name contains "Frequeny" (spelling as in the data),
# plus the "idx" key column.
OUT_MED_columns = OUT_MED_df.filter(like="Frequeny").columns.tolist() + ["idx"]
OUT_MED_df = OUT_MED_df.loc[:, OUT_MED_columns]
OUT_MED_df.head()

Unnamed: 0,"Outpatient_LAXATIVES, LOCAL/RECTAL:Frequeny",Outpatient_PLATELET AGGREGATION INHIBITORS:Frequeny,Outpatient_nan:Frequeny,"Outpatient_NOSE PREPARATIONS, VASOCONSTRICTORS(OTC):Frequeny","Outpatient_ANALGESIC/ANTIPYRETICS,NON-SALICYLATE:Frequeny",Outpatient_ANTIHYPERLIPIDEMIC - HMG COA REDUCTASE INHIBITORS:Frequeny,Outpatient_SELECTIVE SEROTONIN REUPTAKE INHIBITOR (SSRIS):Frequeny,"Outpatient_VASODILATORS,CORONARY:Frequeny",Outpatient_ANTIEMETIC/ANTIVERTIGO AGENTS:Frequeny,Outpatient_BETA-ADRENERGIC BLOCKING AGENTS:Frequeny,...,Outpatient_THROMBOLYTIC - NUCLEOTIDE TYPE:Frequeny,Outpatient_SELECTIVE SEROTONIN 5-HT2A INVERSE AGONISTS (SSIA):Frequeny,Outpatient_ANTINEOPLASTIC - HEDGEHOG PATHWAY INHIBITOR:Frequeny,Outpatient_ORGAN TRANSPLANTATION PRESERVATION SOLUTIONS:Frequeny,Outpatient_FEEDING DEVICES:Frequeny,"Outpatient_DRUGS TO TX GAUCHER DX-TYPE 1, SUBSTRATE REDUCING:Frequeny","Outpatient_TOPICAL PREPARATIONS,NON-MEDICINAL:Frequeny","Outpatient_ANTI-INFLAMMATORY, INTERLEUKIN-1 BETA BLOCKERS:Frequeny","Outpatient_ACNE AGENTS,SYSTEMIC:Frequeny",idx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,84
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2248
2,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2271
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1691
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3286


#### Lab values - Remove binary indicators (redundant)

In [6]:
# Keep only the columns whose name contains "Value", plus the "idx" key column.
LABS_columns = LABS_df.filter(like="Value").columns.tolist() + ["idx"]
LABS_df = LABS_df.loc[:, LABS_columns]
LABS_df.head()

Unnamed: 0,albumin:Value,alk:Value,ast:Value,anion:Value,bilirubin:Value,bun:Value,bun_cre:Value,calcium:Value,creatinine:Value,d-dimer:Value,...,a1c:Value,hgb:Value,inr:Value,lactate:Value,platelet:Value,potassium:Value,ptt:Value,sodium:Value,wbc:Value,idx
0,2.6,51.0,15.0,6.0,0.8,12.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,116.0,0.0,0.0,0.0,0.3,84
1,4.4,60.0,24.0,7.0,1.3,11.0,0.0,0.0,0.93,0.0,...,0.0,0.0,0.0,0.0,655.0,0.0,0.0,0.0,13.8,2248
2,2.6,75.0,22.0,7.0,0.5,14.0,0.0,0.0,1.2,0.0,...,0.0,0.0,1.3,0.0,356.0,0.0,15.9,0.0,6.8,2271
3,3.0,117.0,71.0,12.0,0.6,23.0,0.0,0.0,1.3,0.0,...,0.0,0.0,0.0,1.8,426.0,0.0,0.0,0.0,10.0,1691
4,3.2,64.0,28.0,8.0,0.5,12.0,0.0,0.0,0.7,0.0,...,0.0,0.0,2.0,0.0,131.0,0.0,21.7,0.0,4.0,3286


#### Combine all datasets - Remove constant features (columns with a single unique value)

In [7]:
# Combine the EHR tables column-wise, aligned on the "idx" key.
ehr_sources = [demo_df, ICD_df, INP_MED_df, OUT_MED_df, LABS_df, vitals_df]

# For each table: drop constant columns (exactly one distinct value, so they carry
# no information), then use "idx" as the row index so the tables align on concat.
processed_ehr_dfs = [
    table.loc[:, table.nunique() != 1].set_index('idx')
    for table in ehr_sources
]

ehr_df = pd.concat(processed_ehr_dfs, axis=1)
# Names of all EHR-derived feature columns, recorded before the vision columns are added.
EMR_FEATURE_COLS = list(ehr_df.columns)

# Join vision information with the EHR dataframe.
# NOTE: this rebinds vision_df to its "idx"-indexed form, so re-running this cell
# without re-loading Vision.csv raises a KeyError.
vision_df = vision_df.set_index('idx')
df = pd.concat([vision_df, ehr_df], axis=1)
df.head()

Unnamed: 0_level_0,label,pred,pe_type,split,current_age_yrs,Female,Male,Asian,Black,Native American,...,wbc:Value,SBP,DBP,height_inch,weight_kg,bmi,tempf,respirations,spO2,pulse
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1436,0,0.405236,,train,68.54,0,1,0,0,0,...,13.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1880,1,0.836337,segmental,train,74.35,0,1,0,0,0,...,10.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2738,0,0.596504,,val,78.95,1,0,0,0,0,...,12.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2883,0,0.108968,,train,55.08,1,0,0,0,0,...,5.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2302,1,0.808755,segmental,train,67.76,1,0,0,0,0,...,19.8,0.333333,-3.333333,-10000.0,3.033333,-10000.0,28.4,-17.0,1.333333,10.333333


#### Remove test set for comparison with the previous study

In [8]:
# Hold out the rows whose "split" is "test"; drop the vision-model columns and the
# split marker so only the label and the EHR features remain.
is_test = df['split'] == "test"
test_set = df.loc[is_test].drop(columns=["pred", "pe_type", "split"])

#### Prepare the remaining dataset for train and validation split

In [9]:
# All rows that are not in the "test" split, with the vision-model columns and the
# split marker dropped. NOTE: this rebinds ehr_df, which previously held the
# concatenated EHR feature tables.
not_test = df['split'] != "test"
ehr_df = df.loc[not_test].drop(columns=["pred", "pe_type", "split"])

In [10]:
# Feature matrix and label vector for the non-test rows.
# NOTE(review): X and y do not appear to be used by the later cells shown here — confirm.
y = ehr_df['label']
X = ehr_df.drop(columns='label')

In [11]:
def train_valid_split(df):
    """Split ``df`` into standardized train and validation frames.

    The rows are split 80/20, stratified on the ``label`` column, with a fixed
    ``random_state`` of 10. Features are then z-scored using the mean and
    standard deviation of the training part only; a standard deviation of 0 is
    replaced by 1 so constant columns do not cause a division by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``label`` column; every other column is treated as
        a feature.

    Returns
    -------
    tuple
        ``(train_frame, val_frame, train_mean, train_sd)`` where the two frames
        hold the standardized features followed by the ``label`` column, and
        ``train_mean`` / ``train_sd`` are the per-feature statistics used for
        the scaling.
    """
    features = df.drop('label', axis=1)
    target = df['label']
    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, stratify=target, random_state=10
    )

    # Scaling statistics come from the training part only.
    train_mean = X_train.mean()
    train_sd = X_train.std().replace(0, 1)

    def _scale_and_attach_label(X_part, y_part):
        # z-score with the training statistics, then append the label column.
        return pd.concat([(X_part - train_mean) / train_sd, y_part], axis=1)

    return (
        _scale_and_attach_label(X_train, y_train),
        _scale_and_attach_label(X_val, y_val),
        train_mean,
        train_sd,
    )

In [12]:
# Split the non-test rows into standardized train/validation frames. train_mean and
# train_sd are kept because the test set is scaled with the same statistics below.
df_X_train, df_X_val, train_mean, train_sd = train_valid_split(ehr_df)

In [13]:
y_test = test_set['label']
X_test = test_set.drop(columns='label')

# Scale the held-out test features with the training-split mean and standard
# deviation (train_mean / train_sd), not with statistics computed on the test set
# itself, then re-attach the label as the last column.
df_X_test = pd.concat([(X_test - train_mean) / train_sd, y_test], axis=1)

In [14]:
# Display the standardized test frame (features followed by the label column).
df_X_test

Unnamed: 0_level_0,current_age_yrs,Female,Male,Asian,Black,Native American,Other,Pacific Islander,Unknown_race,White,...,SBP,DBP,height_inch,weight_kg,bmi,tempf,respirations,spO2,pulse,label
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2506,0.324501,-1.200965,1.200965,-0.301771,-0.292699,-0.038984,-0.369081,-0.078147,2.697682,-1.197202,...,0.119548,0.108374,-1.677850,-1.996396,-1.771743,0.262369,0.294943,0.345002,0.282748,0
2832,-0.067414,-1.200965,1.200965,-0.301771,-0.292699,-0.038984,2.707374,-0.078147,-0.370407,-1.197202,...,0.131406,0.118773,0.575844,0.507628,0.550683,0.285998,0.281087,0.345758,0.285682,1
1487,0.238958,0.832031,-0.832031,-0.301771,-0.292699,-0.038984,-0.369081,-0.078147,-0.370407,0.834647,...,0.071952,0.066130,0.576299,0.500463,0.547864,0.285541,0.301871,0.337975,0.295247,1
15,0.415157,-1.200965,1.200965,-0.301771,-0.292699,-0.038984,-0.369081,-0.078147,-0.370407,0.834647,...,0.102166,0.106100,0.577210,0.503184,0.548084,0.286455,0.281087,0.346083,0.278795,1
1763,0.558813,0.832031,-0.832031,-0.301771,3.413884,-0.038984,-0.369081,-0.078147,-0.370407,-1.197202,...,0.102375,0.104463,-1.677850,-1.996396,-1.771743,0.253054,0.300439,0.344542,0.281664,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2178,0.446771,-1.200965,1.200965,-0.301771,-0.292699,-0.038984,-0.369081,-0.078147,-0.370407,0.834647,...,0.103141,0.105125,0.597460,0.500064,0.565572,0.250472,0.301871,0.344461,0.281090,1
1047,0.533243,0.832031,-0.832031,-0.301771,-0.292699,-0.038984,-0.369081,-0.078147,-0.370407,0.834647,...,0.103141,0.105125,0.597460,0.500064,0.565572,0.250472,0.301871,0.344461,0.281090,0
897,0.722925,0.832031,-0.832031,-0.301771,-0.292699,-0.038984,-0.369081,-0.078147,-0.370407,0.834647,...,0.103141,0.105125,0.597460,0.500064,0.565572,0.250472,0.301871,0.344461,0.281090,1
2311,1.367748,0.832031,-0.832031,-0.301771,-0.292699,-0.038984,-0.369081,-0.078147,-0.370407,0.834647,...,0.098593,0.096676,-1.677850,-1.996396,-1.771743,-3.904609,-3.344477,-2.898405,0.285682,1


In [15]:
#df_X_test.to_csv("datasets/test_set_preprocessed.csv")

In [14]:
# df_X_train.to_csv("datasets/training_set.csv")
# df_X_val.to_csv("datasets/validation_set.csv")

#### Sample train size n = 1000, n = 500, n = 300

In [15]:
# Work on a copy of the non-test rows, with "idx" moved from the index back into
# a regular column.
df_sampling = ehr_df.copy().reset_index()

In [16]:
# Class proportions in df_sampling; used to keep the label ratio in each subsample.
imbalance = df_sampling['label'].value_counts(normalize=True)


def _stratified_subsample(frame, n_total, class_share, seed=42):
    """Draw about ``n_total`` rows from ``frame`` while preserving the label ratio.

    Each label group contributes ``int(n_total * class_share[label])`` rows,
    sampled without replacement using ``seed``. Because the per-class size is
    truncated with ``int``, the result can hold slightly fewer than ``n_total``
    rows.

    The groups are iterated explicitly instead of going through
    ``groupby(...).apply``: applying over the grouping column is deprecated in
    recent pandas and would otherwise risk losing the ``label`` column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a ``label`` column to stratify on.
    n_total : int
        Target number of rows across all labels.
    class_share : pandas.Series
        Fraction of rows per label, indexed by label value.
    seed : int, default 42
        ``random_state`` passed to ``DataFrame.sample`` for every label group.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, label groups in sorted label order, with a fresh
        0..n-1 index.
    """
    sampled_groups = [
        group.sample(n=int(n_total * class_share[label]), replace=False, random_state=seed)
        for label, group in frame.groupby('label')
    ]
    return pd.concat(sampled_groups).reset_index(drop=True)


# Randomly sample about 1000, 500 and 300 observations from df_sampling
# (all non-test rows, i.e. before the train/validation split).
df_1000 = _stratified_subsample(df_sampling, 1000, imbalance)
df_500 = _stratified_subsample(df_sampling, 500, imbalance)
df_300 = _stratified_subsample(df_sampling, 300, imbalance)

In [17]:
# Restore "idx" as the row index of every subsample.
df_1000, df_500, df_300 = (
    sampled.set_index('idx') for sampled in (df_1000, df_500, df_300)
)

In [18]:
# train_valid_split returns (train_frame, val_frame, train_mean, train_sd).
# Only the two frames are needed here, so take the first two items; unpacking the
# full 4-tuple into two names raises "ValueError: too many values to unpack".
# This also leaves train_mean / train_sd from the full split untouched.
df_X_train_1000, df_X_val_1000 = train_valid_split(df_1000)[:2]
df_X_train_500, df_X_val_500 = train_valid_split(df_500)[:2]
df_X_train_300, df_X_val_300 = train_valid_split(df_300)[:2]

In [19]:
# df_X_train_1000.to_csv("datasets/training_set_1000.csv")
# df_X_val_1000.to_csv("datasets/validation_set_1000.csv")
# df_X_train_500.to_csv("datasets/training_set_500.csv")
# df_X_val_500.to_csv("datasets/validation_set_500.csv")
# df_X_train_300.to_csv("datasets/training_set_300.csv")
# df_X_val_300.to_csv("datasets/validation_set_300.csv")