In [53]:
#This code compresses PROCEDURES_ICD9, and DIAGNOSES_ICD9 into single entries per admission (sorted by SEQ_NUM)
#and merges this data with the ADMISSIONS dataframe, providing two separate dataframes with this merged structure.
#One dataframe corresponds to patients diagnosed with specified ICD-9 codes and the other contains the control patients.

#Updates made:
# 1. Separated lab events from the merged dataframes into separate dataframes to support easier unpacking later.
# 2. Fixed logic for pulling SUBJECT_ID for controls patients to prevent leakage of diseased patients into controls DF
# 3. Used chunking while processing the large LABEVENTS CSV to prevent memory overload and crashes
# 4. Specified labs specific to aortic dissection and pulled corresponding ITEMNAMES from Lab_Item_Codes.txt.
#    Only patients with these labs exist in the controls LABEVENTS DF and the diseased patients LABEVENTS DF
# 5. Patient admissions for diseased group from AFTER first diagnosis are not included in either controls nor diseased final DF

import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import matplotlib.pyplot as plt

ADMISSIONS = pd.read_csv("./ADMISSIONS.csv")
DIAGNOSES_ICD = pd.read_csv("./DIAGNOSES_ICD.csv")
PATIENTS = pd.read_csv("./PATIENTS.csv")
PROCEDURES_ICD = pd.read_csv("./PROCEDURES_ICD.csv")

#Input ICD9 code that you want to look at here:
my_icd9_code = ["44100", "44101", "44102", "44103"] #441 is arotic dissection. Change to 421 for bacterial endocarditis
#check what any following numbers would be in the ICD9 code
#need to update to be 441.00, 441.01, 441.02, 441.03

#Returns patients with aortic dissection
AD_SUBJECT_ID = DIAGNOSES_ICD.loc[
    DIAGNOSES_ICD["ICD9_CODE"].astype(str).isin(my_icd9_code),
    "SUBJECT_ID"
].unique() 

#Returns all admissions for patients never diagnosed with AD
CONTROL_SUBJECT_ID = DIAGNOSES_ICD.loc[
    ~DIAGNOSES_ICD["SUBJECT_ID"].isin(AD_SUBJECT_ID), 
    "SUBJECT_ID"
].unique()

#Returns the specific admissions where aortic dissection was diagnosed
AD_HADM_ID = DIAGNOSES_ICD.loc[
    DIAGNOSES_ICD["ICD9_CODE"].astype(str).isin(my_icd9_code),
    "HADM_ID"
].unique()

"""
Question for Daniel: do we need to filter this again keeping only the first AD diagnosis?
"""

#Identify all diagnoses for patients diagnosed with aortic dissection, including for admissions where they were not diagnosed with AD
PATIENT_DIAGNOSES = DIAGNOSES_ICD[DIAGNOSES_ICD['SUBJECT_ID'].isin(AD_SUBJECT_ID)]

CONTROL_DIAGNOSES = DIAGNOSES_ICD[DIAGNOSES_ICD['SUBJECT_ID'].isin(CONTROL_SUBJECT_ID)]

#Return a new dataframe with all the ICD9 codes for each admission condensed into a single row,col val as a compressed list
PATIENT_DIAGNOSES = (
    PATIENT_DIAGNOSES
    .sort_values(['HADM_ID','SEQ_NUM'])
    .groupby(['SUBJECT_ID','HADM_ID'])['ICD9_CODE']
    .apply(list)
    .reset_index(name='DIAGNOSES')
)

#Return a new dataframe with all the ICD9 codes for each admission condensed into a single row,col val as a compressed list for control patients
CONTROL_DIAGNOSES = (
    CONTROL_DIAGNOSES
    .sort_values(['HADM_ID','SEQ_NUM'])
    .groupby(['SUBJECT_ID','HADM_ID'])['ICD9_CODE']
    .apply(list)
    .reset_index(name='DIAGNOSES')
)

#Remove DIAGNOSES_ICD to conserve memory since we have already filtered for the relevant data
del DIAGNOSES_ICD

#Return all procedures for patients diagnosed with AD, including for admissions where they were not diagnosed with AD
PATIENT_PROCEDURES = PROCEDURES_ICD[PROCEDURES_ICD['SUBJECT_ID'].isin(AD_SUBJECT_ID)]

#identify all procedures for control patients as well
CONTROL_PROCEDURES = PROCEDURES_ICD[PROCEDURES_ICD['SUBJECT_ID'].isin(CONTROL_SUBJECT_ID)]

#Return a new dataframe with all procedure codes for each admission compressed into a single row,col val as a compressed list
PATIENT_PROCEDURES = (
    PATIENT_PROCEDURES
    .sort_values(['HADM_ID','SEQ_NUM'])
    .groupby(['SUBJECT_ID','HADM_ID'])['ICD9_CODE']
    .apply(list)
    .reset_index(name='PROCEDURE TYPE')
)

#Return a new dataframe with all procedure codes for each admission compressed into a single row,col val as a compressed list for control patients
CONTROL_PROCEDURES = (
    CONTROL_PROCEDURES
    .sort_values(['HADM_ID','SEQ_NUM'])
    .groupby(['SUBJECT_ID','HADM_ID'])['ICD9_CODE']
    .apply(list)
    .reset_index(name='PROCEDURE TYPE')
)

#Remove PROCEDURES_ICD to conserve memory since we have already extracted the relevant rows
del PROCEDURES_ICD

#Return every admission entry for patients who were diagnosed with AD at some point
PATIENT_ADMISSIONS = ADMISSIONS[ADMISSIONS['SUBJECT_ID'].isin(AD_SUBJECT_ID)]

#pull control group admissions as well
CONTROL_ADMISSIONS = ADMISSIONS[ADMISSIONS['SUBJECT_ID'].isin(CONTROL_SUBJECT_ID)]

#Remove redundant columns from the other filtered dataframes for a cleaner merge
PATIENT_PROCEDURES.drop('SUBJECT_ID',inplace=True,axis=1)
PATIENT_DIAGNOSES.drop('SUBJECT_ID',inplace=True,axis=1)
CONTROL_PROCEDURES.drop('SUBJECT_ID',inplace=True,axis=1)
CONTROL_DIAGNOSES.drop('SUBJECT_ID',inplace=True,axis=1)

#Merge the compressed DFs engineered earlier with admissions so that each admission has lab event, diagnosis, and procedure data
PATIENT_ADMISSIONS_MERGED = PATIENT_ADMISSIONS.merge(PATIENT_DIAGNOSES, on="HADM_ID", how="left") \
            .merge(PATIENT_PROCEDURES, on="HADM_ID", how="left")

CONTROL_ADMISSIONS_MERGED = CONTROL_ADMISSIONS.merge(CONTROL_DIAGNOSES, on="HADM_ID", how="left") \
            .merge(CONTROL_PROCEDURES, on="HADM_ID", how="left")

#Rename columns for clarity since there is a text-based labeling column and the ICD-9 diagnosis column
PATIENT_ADMISSIONS_MERGED = PATIENT_ADMISSIONS_MERGED.rename(columns={"DIAGNOSIS": "DIAGNOSIS (LABEL)","DIAGNOSES": "DIAGNOSIS (ICD_9)"})
CONTROL_ADMISSIONS_MERGED = CONTROL_ADMISSIONS_MERGED.rename(columns={"DIAGNOSIS": "DIAGNOSIS (LABEL)","DIAGNOSES": "DIAGNOSIS (ICD_9)"})

#Drop redundant row
PATIENT_ADMISSIONS_MERGED.drop(['ROW_ID'],inplace=True,axis=1)
CONTROL_ADMISSIONS_MERGED.drop(['ROW_ID'],inplace=True,axis=1)

#Identify the admissions where AD was one of the diagnoses given to the patients, excluding admissions where AD was not diagnosed
#No need to do this for control group
DISEASE_ADMISSIONS = PATIENT_ADMISSIONS_MERGED[PATIENT_ADMISSIONS_MERGED['HADM_ID'].isin(AD_HADM_ID)]
DISEASE_ADMISSIONS = DISEASE_ADMISSIONS.copy()

#Convert ADMITTIME to datetime for processing
DISEASE_ADMISSIONS['ADMITTIME'] = pd.to_datetime(DISEASE_ADMISSIONS["ADMITTIME"], errors="coerce")

#convert to datetime for control group
CONTROL_ADMISSIONS_MERGED['ADMITTIME'] = pd.to_datetime(CONTROL_ADMISSIONS_MERGED["ADMITTIME"], errors="coerce")

#Sort by HADM_ID and ADMITTIME to get a sorted list for processing
DISEASE_ADMISSIONS = DISEASE_ADMISSIONS.sort_values(['HADM_ID','ADMITTIME'])

#sort control group by HADM_ID and ADMITTIME as well
CONTROL_ADMISSIONS_MERGED = CONTROL_ADMISSIONS_MERGED.sort_values(['HADM_ID','ADMITTIME'])

#Identify the earliest admission time in which patients were diagnosed with AD
DISEASE_FIRST_ADMISSIONS = DISEASE_ADMISSIONS.groupby('SUBJECT_ID',as_index=False)['ADMITTIME'].min()

#Rename this column to "Comparator" since it will be used for filtering admissions from after the patient was diagnosed with AD
DISEASE_FIRST_ADMISSIONS = DISEASE_FIRST_ADMISSIONS.rename(columns={"ADMITTIME": "Comparator"})

#Update ADMISSIONS_MERGED so it now contains all admissions for patients who were diagnosed with AD at some point
#Prior and including the admission with their first diagnosis of AD. Admissions after their first diagnosis are excluded
PATIENT_ADMISSIONS_MERGED['ADMITTIME'] = pd.to_datetime(PATIENT_ADMISSIONS_MERGED["ADMITTIME"], errors="coerce")
PATIENT_ADMISSIONS_MERGED = PATIENT_ADMISSIONS_MERGED.merge(DISEASE_FIRST_ADMISSIONS,on='SUBJECT_ID',how="left")
PATIENT_ADMISSIONS_MERGED = PATIENT_ADMISSIONS_MERGED[PATIENT_ADMISSIONS_MERGED['ADMITTIME']<=PATIENT_ADMISSIONS_MERGED['Comparator']]

#drop the comparator column now that filtering is done so that the DFs are the same
PATIENT_ADMISSIONS_MERGED = PATIENT_ADMISSIONS_MERGED.drop(['Comparator'],axis=1)

In [62]:
CONTROL_ADMISSIONS_MERGED

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS (LABEL),HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,DIAGNOSIS (ICD_9),PROCEDURE TYPE
46973,58526,100001,2117-09-11 11:46:00,2117-09-17 16:45:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Private,ENGL,PROTESTANT QUAKER,DIVORCED,WHITE,2117-09-11 08:59:00,2117-09-11 12:35:00,DIABETIC KETOACIDOSIS,0,1,"[25013, 3371, 5849, 5780, V5867, 25063, 5363, ...",
46632,54610,100003,2150-04-17 15:34:00,2150-04-21 17:30:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,ENGL,NOT SPECIFIED,SINGLE,WHITE,2150-04-17 13:10:00,2150-04-17 17:47:00,UPPER GI BLEED,0,1,"[53100, 2851, 07054, 5715, 45621, 53789, 4019,...","[4443, 9607, 9904, 3893]"
8963,9895,100006,2108-04-06 15:49:00,2108-04-18 17:18:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,NOT SPECIFIED,SINGLE,BLACK/AFRICAN AMERICAN,2108-04-06 11:39:00,2108-04-06 17:56:00,COPD FLARE,0,1,"[49320, 51881, 486, 20300, 2761, 7850, 3090, V...","[9390, 9925]"
27467,23018,100007,2145-03-31 05:33:00,2145-04-07 12:40:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,JEWISH,MARRIED,WHITE,2145-03-30 20:43:00,2145-03-31 06:08:00,BOWEL OBSTRUCTION,0,1,"[56081, 5570, 9973, 486, 4019]","[4562, 5459]"
837,533,100009,2162-05-16 15:56:00,2162-05-21 13:37:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Private,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE,0,1,"[41401, 99604, 4142, 25000, 27800, V8535, 4148...","[3613, 3615, 3795, 3961]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23816,20785,199993,2161-10-23 18:01:00,2161-11-17 08:10:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,SHORT TERM HOSPITAL,Private,,CATHOLIC,DIVORCED,UNKNOWN/NOT SPECIFIED,2161-10-23 16:23:00,2161-10-23 18:45:00,CORONARY ARTERY DISEASE,0,1,"[41031, 42821, 42731, 4271, 5180, 4240, 2760, ...","[3614, 3512, 3761, 8842, 8848, 3961, 3964, 340..."
29822,23761,199994,2188-07-07 18:47:00,2188-07-17 13:31:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,,CATHOLIC,SINGLE,WHITE,2188-07-07 15:55:00,2188-07-07 20:21:00,MENTAL STATUS CHANGES,0,1,"[486, 4280, 51881, 3970, 496, 4169, 585, 42732...","[9671, 9604, 3995, 3891]"
24990,19412,199995,2137-12-11 17:35:00,2137-12-28 12:30:00,,EMERGENCY,PHYS REFERRAL/NORMAL DELI,HOME,Private,ENGL,PROTESTANT QUAKER,SINGLE,WHITE,,,ROOT ABSCESS,0,1,"[4210, 7464, 42971, 30401, 4412, 44284, V1259,...","[3521, 3961, 3845, 3539, 8841, 8847, 9929, 887..."
33116,27200,199998,2119-02-18 16:26:00,2119-02-24 14:25:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE,0,1,"[41401, 9971, 9975, 42731, 78820, 4111, V4582,...","[3612, 3615, 3964]"


In [55]:
#Thomas's logic retained below

print(PATIENT_ADMISSIONS_MERGED.columns)
print(PATIENT_ADMISSIONS_MERGED.shape)

print(CONTROL_ADMISSIONS_MERGED.columns)
print(CONTROL_ADMISSIONS_MERGED.shape)

Index(['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS (LABEL)', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'DIAGNOSIS (ICD_9)', 'PROCEDURE TYPE'],
      dtype='str')
(378, 20)
Index(['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS (LABEL)', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'DIAGNOSIS (ICD_9)', 'PROCEDURE TYPE'],
      dtype='str')
(58462, 20)


In [65]:
chunksize = 50000
LAB_EVENTS = pd.read_csv('./LABEVENTS.csv',chunksize=chunksize)
LAB_IDS = pd.read_csv("Lab_Item_Codes.txt", sep="\t")

PATIENT_LAB_EVENTS2 = []
CONTROL_LAB_EVENTS2 = []

labs_of_interest = [
    'TROPONIN', 'D-DIMER', 'CREATININE', 'CK-MB', 'BUN', 
    'UREA', 'C-REACTIVE', 'LDH', 'LACTATE DEHYDROGENASE', 
    'BILIRUBIN', 'AST', 'ALT', 
    'WHITE BLOOD', 'WBC', 'LYMPHOCYTE', 'NEUTROPHIL'
]

#create one large string which the string matcher will search through
pattern = '|'.join(labs_of_interest)

#check for any of the strings listed above in the pattern mega-string
lab_ids = LAB_IDS[
    LAB_IDS['Display'].str.contains(pattern, case=False, na=False)
]
lab_ids = lab_ids['Code']

for EVENT in LAB_EVENTS:

    unique_admission_count = len(EVENT['HADM_ID'].unique())
    unique_admission_count += unique_admission_count

    EVENT = EVENT.loc[
        EVENT['ITEMID'].isin(lab_ids)
    ]
    
    #Identify all patient labs for patients who were diagnosed with AD at any point. Do not include labs taken during admissions following their first diagnosis
    PATIENT_LAB_EVENTS = EVENT[EVENT['HADM_ID'].isin(PATIENT_ADMISSIONS_MERGED['HADM_ID'])]
    
    #pull control group lab events as well
    CONTROL_LAB_EVENTS = EVENT[EVENT['HADM_ID'].isin(CONTROL_ADMISSIONS_MERGED['HADM_ID'])]
    
    #Remove redundant rows
    PATIENT_LAB_EVENTS = PATIENT_LAB_EVENTS.drop(['ROW_ID','VALUE'],axis=1)
    CONTROL_LAB_EVENTS = CONTROL_LAB_EVENTS.drop(['ROW_ID','VALUE'],axis=1)
    
    #Convert CHARTTIME to a datetime for sorting
    PATIENT_LAB_EVENTS['CHARTTIME'] = pd.to_datetime(PATIENT_LAB_EVENTS["CHARTTIME"], errors="coerce")
    CONTROL_LAB_EVENTS['CHARTTIME'] = pd.to_datetime(CONTROL_LAB_EVENTS["CHARTTIME"], errors="coerce")
    
    #Only return values that are not na.
    PATIENT_LAB_EVENTS = PATIENT_LAB_EVENTS[PATIENT_LAB_EVENTS['HADM_ID'].notna()]
    CONTROL_LAB_EVENTS = CONTROL_LAB_EVENTS[CONTROL_LAB_EVENTS['HADM_ID'].notna()]

    PATIENT_LAB_EVENTS2.append(PATIENT_LAB_EVENTS)
    CONTROL_LAB_EVENTS2.append(CONTROL_LAB_EVENTS)
    
#Remove LAB_EVENTS to conserve lots of memory since we already have extracted the necessary data
del LAB_EVENTS

#Construct final labs DFs through concatenation
FINAL_PATIENT_LABS = pd.concat(PATIENT_LAB_EVENTS2)
FINAL_CONTROL_LABS = pd.concat(CONTROL_LAB_EVENTS2)

print(unique_admission_count)

136


In [90]:
print("Number of admissions captured for AD patients (patient admissions merged DF): ",len(PATIENT_ADMISSIONS_MERGED['HADM_ID'].unique()))
print("Number of admissions captured in lab events for AD patients (control admissions merged DF): ",len(FINAL_PATIENT_LABS['HADM_ID'].unique()))
print("Verify all lab events have been captured using HAS_CHARTEVENTS_DATA col in admissions merged DF: ",len(PATIENT_ADMISSIONS_MERGED[PATIENT_ADMISSIONS_MERGED['HAS_CHARTEVENTS_DATA']==1]),"\n")

print("Number of admissions captured for control patients (control admissions merged DF): ",len(CONTROL_ADMISSIONS_MERGED['HADM_ID'].unique()))
print("Number of admissions captured in lab events for control patients (control admissions merged DF): ",len(FINAL_CONTROL_LABS['HADM_ID'].unique()))
print("Verify all lab events have been captured using HAS_CHARTEVENTS_DATA col in admissions merged DF: ",len(CONTROL_ADMISSIONS_MERGED[CONTROL_ADMISSIONS_MERGED['HAS_CHARTEVENTS_DATA']==1]),"\n")

print("Based on the analysis it seems that the HAS_CHARTEVENTS_DATA may not be a reliable column, since more admissions are captured in LABEVENTS than HAS_CHARTEVENTS indicates")

Number of admissions captured for AD patients (patient admissions merged DF):  378
Number of admissions captured in lab events for AD patients (control admissions merged DF):  375
Verify all lab events have been captured using HAS_CHARTEVENTS_DATA col in admissions merged DF:  372 

Number of admissions captured for control patients (control admissions merged DF):  58462
Number of admissions captured in lab events for control patients (control admissions merged DF):  57539
Verify all lab events have been captured using HAS_CHARTEVENTS_DATA col in admissions merged DF:  56883 



In [57]:
FINAL_PATIENT_LABS.to_csv('RELEVANT_PATIENT_LABS.csv', index=False)
FINAL_CONTROL_LABS.to_csv('RELEVANT_CONTROL_LABS.csv', index=False)
PATIENT_ADMISSIONS_MERGED.to_csv('PATIENT_ADMISSIONS_MERGED.csv', index=False)
CONTROL_ADMISSIONS_MERGED.to_csv('CONTROL_ADMISSIONS_MERGED.csv', index=False)

In [58]:
#display unique ICD9 codes in the filtered patient admissions
print(PATIENT_ADMISSIONS_MERGED['DIAGNOSIS (ICD_9)'].explode().unique())
print(CONTROL_ADMISSIONS_MERGED['DIAGNOSIS (ICD_9)'].explode().unique())

'''
for i in range(len(PATIENT_ADMISSIONS_MERGED['DIAGNOSIS (ICD_9)'].explode().unique())):
    print(str(PATIENT_ADMISSIONS_MERGED['DIAGNOSIS (ICD_9)'].explode().unique()[i]) + ",")

for i in range(len(CONTROL_ADMISSIONS_MERGED['DIAGNOSIS (ICD_9)'].explode().unique())):
    print(str(CONTROL_ADMISSIONS_MERGED['DIAGNOSIS (ICD_9)'].explode().unique()[i]) + ",")

Thomas's Note Retained Below:

We need to add another Column having a hospital amdission ID value for each event further back from the event.

NOTE that we need to label the most recent event as 0 and count backwards in order to have all patients match.

Need to search in the literature what to do with repeated lab values. For example, if a patient across multiple admissions has their SpO2 value taken 6 times, what do we do with those values? For example, take average? Take most recent? Not sure

Search lab events that are important for the diagnoses and use those.

Also need to randomly sample from the non patient cohort rather than take them all.

'''

<StringArray>
['44101',  '5185', '99674', '56962',  '9971', '42731', '99859', '70703',
  '5793',  '2874',
 ...
 'V8542', '45184', 'V2652',  '2409',  '1124',  '7801', '43401', '78009',
 '44029', '34989']
Length: 844, dtype: str
<StringArray>
['25013',  '3371',  '5849',  '5780', 'V5867', '25063',  '5363',  '4580',
 '25043', '40390',
 ...
 '82534', 'V6889', 'E9238', '74921',  '9063', '72672', '37882', '38832',
 '40400', '44382']
Length: 6973, dtype: str


'\nfor i in range(len(PATIENT_ADMISSIONS_MERGED[\'DIAGNOSIS (ICD_9)\'].explode().unique())):\n    print(str(PATIENT_ADMISSIONS_MERGED[\'DIAGNOSIS (ICD_9)\'].explode().unique()[i]) + ",")\n\nfor i in range(len(CONTROL_ADMISSIONS_MERGED[\'DIAGNOSIS (ICD_9)\'].explode().unique())):\n    print(str(CONTROL_ADMISSIONS_MERGED[\'DIAGNOSIS (ICD_9)\'].explode().unique()[i]) + ",")\n\nThomas\'s Note Retained Below:\n\nWe need to add another Column having a hospital amdission ID value for each event further back from the event.\n\nNOTE that we need to label the most recent event as 0 and count backwards in order to have all patients match.\n\nNeed to search in the literature what to do with repeated lab values. For example, if a patient across multiple admissions has their SpO2 value taken 6 times, what do we do with those values? For example, take average? Take most recent? Not sure\n\nSearch lab events that are important for the diagnoses and use those.\n\nAlso need to randomly sample from the n