In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import datetime
import matplotlib.pyplot as plt

In [2]:
ADMISSIONS = pd.read_csv(r"C:\Users\kappl\OneDrive\Documents\BIOE5860 Precision Medicine\ADMISSIONS.csv") #pd.read_csv("./ADMISSIONS.csv")
DIAGNOSES_ICD = pd.read_csv(r"C:\Users\kappl\OneDrive\Documents\BIOE5860 Precision Medicine\DIAGNOSES_ICD.csv") #pd.read_csv("./DIAGNOSES_ICD.csv")
PATIENTS =  pd.read_csv(r"C:\Users\kappl\OneDrive\Documents\BIOE5860 Precision Medicine\PATIENTS.csv") #pd.read_csv("./PATIENTS.csv")
PROCEDURES_ICD = pd.read_csv(r"C:\Users\kappl\OneDrive\Documents\BIOE5860 Precision Medicine\PROCEDURES_ICD.csv")  #pd.read_csv("./PROCEDURES_ICD.csv")
LAB_EVENTS = pd.read_csv(r"C:\Users\kappl\OneDrive\Documents\BIOE5860 Precision Medicine\LABEVENTS.csv") #pd.read_csv("./LABEVENTS.csv")

In [3]:
#Input ICD9 code that you want to look at here:
my_icd9_code = "441" #441 is arotic dissection. Change to 421 for bacterial endocarditis

#Returns patients with aortic dissection
AD_SUBJECT_ID = DIAGNOSES_ICD.loc[
    DIAGNOSES_ICD["ICD9_CODE"].astype(str).str.startswith(my_icd9_code),
    "SUBJECT_ID"
].unique() 

#Place all other patients in a control group
CONTROL_SUBJECT_ID = DIAGNOSES_ICD.loc[
    ~DIAGNOSES_ICD["ICD9_CODE"].astype(str).str.startswith(my_icd9_code),
    "SUBJECT_ID"
].unique()

#Returns the specific admissions where aortic dissection was diagnosed
AD_HADM_ID = DIAGNOSES_ICD.loc[
    DIAGNOSES_ICD["ICD9_CODE"].astype(str).str.startswith(my_icd9_code),
    "HADM_ID"
].unique()

"""
Question for Daniel: do we need to filter this again keeping only the first AD diagnosis?
"""

#Identify all diagnoses for patients diagnosed with aortic dissection, including for admissions where they were not diagnosed with AD
PATIENT_DIAGNOSES = DIAGNOSES_ICD[DIAGNOSES_ICD['SUBJECT_ID'].isin(AD_SUBJECT_ID)]

#add diagnoses for control patients as well
CONTROL_DIAGNOSES = DIAGNOSES_ICD[DIAGNOSES_ICD['SUBJECT_ID'].isin(CONTROL_SUBJECT_ID)]

#Return a new dataframe with all the ICD9 codes for each admission condensed into a single row,col val as a compressed list
PATIENT_DIAGNOSES = (
    PATIENT_DIAGNOSES
    .sort_values(['HADM_ID','SEQ_NUM'])
    .groupby(['SUBJECT_ID','HADM_ID'])['ICD9_CODE']
    .apply(list)
    .reset_index(name='DIAGNOSES')
)

#Return a new dataframe with all the ICD9 codes for each admission condensed into a single row,col val as a compressed list for control patients
CONTROL_DIAGNOSES = (
    CONTROL_DIAGNOSES
    .sort_values(['HADM_ID','SEQ_NUM'])
    .groupby(['SUBJECT_ID','HADM_ID'])['ICD9_CODE']
    .apply(list)
    .reset_index(name='DIAGNOSES')
)

#Remove DIAGNOSES_ICD to conserve memory since we have already filtered for the relevant data
del DIAGNOSES_ICD

In [4]:
#Return all procedures for patients diagnosed with AD, including for admissions where they were not diagnosed with AD
PATIENT_PROCEDURES = PROCEDURES_ICD[PROCEDURES_ICD['SUBJECT_ID'].isin(AD_SUBJECT_ID)]

#identify all procedures for control patients as well
CONTROL_PROCEDURES = PROCEDURES_ICD[PROCEDURES_ICD['SUBJECT_ID'].isin(CONTROL_SUBJECT_ID)]

#Return a new dataframe with all procedure codes for each admission compressed into a single row,col val as a compressed list
PATIENT_PROCEDURES = (
    PATIENT_PROCEDURES
    .sort_values(['HADM_ID','SEQ_NUM'])
    .groupby(['SUBJECT_ID','HADM_ID'])['ICD9_CODE']
    .apply(list)
    .reset_index(name='PROCEDURE TYPE')
)

#Return a new dataframe with all procedure codes for each admission compressed into a single row,col val as a compressed list for control patients
CONTROL_PROCEDURES = (
    CONTROL_PROCEDURES
    .sort_values(['HADM_ID','SEQ_NUM'])
    .groupby(['SUBJECT_ID','HADM_ID'])['ICD9_CODE']
    .apply(list)
    .reset_index(name='PROCEDURE TYPE')
)

#Remove PROCEDURES_ICD to conserve memory since we have already extracted the relevant rows
del PROCEDURES_ICD

In [5]:
#Identify all patient labs for patients who were diagnosed with AD at any point
PATIENT_LAB_EVENTS = LAB_EVENTS[LAB_EVENTS['SUBJECT_ID'].isin(AD_SUBJECT_ID)]

#pull control group lab events as well
CONTROL_LAB_EVENTS = LAB_EVENTS[LAB_EVENTS['SUBJECT_ID'].isin(CONTROL_SUBJECT_ID)]

#Remove redundant rows
PATIENT_LAB_EVENTS = PATIENT_LAB_EVENTS.drop(['ROW_ID','VALUE'],axis=1)
CONTROL_LAB_EVENTS = CONTROL_LAB_EVENTS.drop(['ROW_ID','VALUE'],axis=1)

#Convert CHARTTIME to a datetime for sorting
PATIENT_LAB_EVENTS['CHARTTIME'] = pd.to_datetime(PATIENT_LAB_EVENTS["CHARTTIME"], errors="coerce")
CONTROL_LAB_EVENTS['CHARTTIME'] = pd.to_datetime(CONTROL_LAB_EVENTS["CHARTTIME"], errors="coerce")

#Only return values that are not na.
PATIENT_LAB_EVENTS = PATIENT_LAB_EVENTS[PATIENT_LAB_EVENTS['HADM_ID'].notna()]
CONTROL_LAB_EVENTS = CONTROL_LAB_EVENTS[CONTROL_LAB_EVENTS['HADM_ID'].notna()]

#Remove LAB_EVENTS to conserve lots of memory since we already have extracted the necessary data
del LAB_EVENTS

#Return a condensed lab events DF with CHARTTIME, ITEMID, VALUENUM, and FLAG for analytics and ML later.
PATIENT_LAB_EVENTS = (
    PATIENT_LAB_EVENTS
    .sort_values(['HADM_ID','CHARTTIME'])
    .groupby('HADM_ID')
    .apply(lambda df: list(zip(df.CHARTTIME, df.ITEMID, df.VALUENUM, df.FLAG)))
    .reset_index(name='Lab Events')
)

#Return a condensed lab events DF for control patients as well
CONTROL_LAB_EVENTS = (
    CONTROL_LAB_EVENTS
    .sort_values(['HADM_ID','CHARTTIME'])
    .groupby('HADM_ID')
    .apply(lambda df: list(zip(df.CHARTTIME, df.ITEMID, df.VALUENUM, df.FLAG)))
    .reset_index(name='Lab Events')
)

  .apply(lambda df: list(zip(df.CHARTTIME, df.ITEMID, df.VALUENUM, df.FLAG)))
  .apply(lambda df: list(zip(df.CHARTTIME, df.ITEMID, df.VALUENUM, df.FLAG)))


In [6]:
#Return every admission entry for patients who were diagnosed with AD at some point
PATIENT_ADMISSIONS = ADMISSIONS[ADMISSIONS['SUBJECT_ID'].isin(AD_SUBJECT_ID)]

#pull control group admissions as well
CONTROL_ADMISSIONS = ADMISSIONS[ADMISSIONS['SUBJECT_ID'].isin(CONTROL_SUBJECT_ID)]

#Remove redundant columns from the other filtered dataframes for a cleaner merge
PATIENT_PROCEDURES.drop('SUBJECT_ID',inplace=True,axis=1)
PATIENT_DIAGNOSES.drop('SUBJECT_ID',inplace=True,axis=1)
CONTROL_PROCEDURES.drop('SUBJECT_ID',inplace=True,axis=1)
CONTROL_DIAGNOSES.drop('SUBJECT_ID',inplace=True,axis=1)

In [7]:
#Merge the compressed DFs engineered earlier with admissions so that each admission has lab event, diagnosis, and procedure data
PATIENT_ADMISSIONS_MERGED = PATIENT_ADMISSIONS.merge(PATIENT_LAB_EVENTS, on="HADM_ID", how="left") \
            .merge(PATIENT_DIAGNOSES, on="HADM_ID", how="left") \
            .merge(PATIENT_PROCEDURES, on="HADM_ID", how="left")

CONTROL_ADMISSIONS_MERGED = CONTROL_ADMISSIONS.merge(CONTROL_LAB_EVENTS, on="HADM_ID", how="left") \
            .merge(CONTROL_DIAGNOSES, on="HADM_ID", how="left") \
            .merge(CONTROL_PROCEDURES, on="HADM_ID", how="left")

#Rename columns for clarity since there is a text-based labeling column and the ICD-9 diagnosis column
PATIENT_ADMISSIONS_MERGED = PATIENT_ADMISSIONS_MERGED.rename(columns={"DIAGNOSIS": "DIAGNOSIS (LABEL)","DIAGNOSES": "DIAGNOSIS (ICD_9)"})
CONTROL_ADMISSIONS_MERGED = CONTROL_ADMISSIONS_MERGED.rename(columns={"DIAGNOSIS": "DIAGNOSIS (LABEL)","DIAGNOSES": "DIAGNOSIS (ICD_9)"})

#Drop redundant row
PATIENT_ADMISSIONS_MERGED.drop(['ROW_ID'],inplace=True,axis=1)
CONTROL_ADMISSIONS_MERGED.drop(['ROW_ID'],inplace=True,axis=1)

In [8]:
#Identify the admissions where AD was one of the diagnoses given to the patients, excluding admissions where AD was not diagnosed
#No need to do this for control group
DISEASE_ADMISSIONS = PATIENT_ADMISSIONS_MERGED[PATIENT_ADMISSIONS_MERGED['HADM_ID'].isin(AD_HADM_ID)]

#Convert ADMITTIME to datetime for processing
DISEASE_ADMISSIONS['ADMITTIME'] = pd.to_datetime(DISEASE_ADMISSIONS["ADMITTIME"], errors="coerce")

#convert to datetime for control group
CONTROL_ADMISSIONS_MERGED['ADMITTIME'] = pd.to_datetime(CONTROL_ADMISSIONS_MERGED["ADMITTIME"], errors="coerce")

#Sort by HADM_ID and ADMITTIME to get a sorted list for processing
DISEASE_ADMISSIONS = DISEASE_ADMISSIONS.sort_values(['HADM_ID','ADMITTIME'])

#sort control group by HADM_ID and ADMITTIME as well
CONTROL_ADMISSIONS_MERGED = CONTROL_ADMISSIONS_MERGED.sort_values(['HADM_ID','ADMITTIME'])

#Identify the earliest admission time in which patients were diagnosed with AD
DISEASE_FIRST_ADMISSIONS = DISEASE_ADMISSIONS.groupby('SUBJECT_ID',as_index=False)['ADMITTIME'].min()

#Rename this column to "Comparator" since it will be used for filtering admissions from after the patient was diagnosed with AD
DISEASE_FIRST_ADMISSIONS = DISEASE_FIRST_ADMISSIONS.rename(columns={"ADMITTIME": "Comparator"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DISEASE_ADMISSIONS['ADMITTIME'] = pd.to_datetime(DISEASE_ADMISSIONS["ADMITTIME"], errors="coerce")


In [9]:
#Update ADMISSIONS_MERGED so it now contains all admissions for patients who were diagnosed with AD at some point
#Prior and including the admission with their first diagnosis of AD. Admissions after their first diagnosis are excluded

PATIENT_ADMISSIONS_MERGED = PATIENT_ADMISSIONS_MERGED.merge(DISEASE_FIRST_ADMISSIONS,on='SUBJECT_ID',how="left")
PATIENT_ADMISSIONS_MERGED = PATIENT_ADMISSIONS_MERGED[PATIENT_ADMISSIONS_MERGED['ADMITTIME']<=PATIENT_ADMISSIONS_MERGED['Comparator']]

#drop the comparator column now that filtering is done so that the DFs are the same
PATIENT_ADMISSIONS_MERGED = PATIENT_ADMISSIONS_MERGED.drop(['Comparator'],axis=1)

In [10]:
PATIENT_ADMISSIONS_MERGED

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,...,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS (LABEL),HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,Lab Events,DIAGNOSIS (ICD_9),PROCEDURE TYPE
0,364,136153,2130-05-22 20:10:00,2130-05-30 15:15:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Medicare,,...,MARRIED,UNKNOWN/NOT SPECIFIED,,,MASSIVE HEMOPTYSIS,0,1,"[(2130-05-22 20:38:00, 50861, 9.0, nan), (2130...","[7863, 496, 9973, 5070, 2851, 4414]","[3327, 3324, 3891, 3893, 9671, 9604]"
1,85,116630,2162-03-02 14:04:00,2162-03-10 13:15:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,REHAB/DISTINCT PART HOSP,Medicare,ENGL,...,MARRIED,WHITE,,,AORTIC STENOSIS\CARDIAC CATH,0,1,"[(2162-03-02 07:40:00, 50868, 15.0, nan), (216...","[4241, 20280, V4282, 99811, 3320, 30000, 53081...","[3521, 3722, 3611, 3615, 3845, 3403, 8856, 396..."
3,100,153952,2157-08-10 07:15:00,2157-08-18 19:54:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME,Private,,...,WIDOWED,UNKNOWN/NOT SPECIFIED,,,AORTIC INSUFFICIENCY\AORTIC VALVE REPLACEMENT;...,0,1,"[(2157-08-10 08:10:00, 50800, nan, nan), (2157...","[99602, 4241, 4411, 4260, 42731]","[3522, 3844, 3961, 3783, 3772]"
4,124,112906,2161-12-17 03:39:00,2161-12-24 15:35:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Medicare,,...,MARRIED,WHITE,2161-12-16 18:57:00,2161-12-17 12:35:00,"CHEST PAIN,R/O MI",0,1,"[(2161-12-17 06:20:00, 50868, 12.0, nan), (216...","[4412, 486, 496, 07070, 4478, 41402, 41401, 40...","[3601, 3606, 8856, 8842]"
7,145,198161,2144-03-29 01:44:00,2144-07-14 16:15:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Medicaid,,...,SINGLE,WHITE,2144-03-28 23:46:00,2144-03-29 04:00:00,THORACIC DISSECTION,0,1,"[(2144-03-29 00:01:00, 50861, 97.0, abnormal),...","[44101, 5185, 99674, 56962, 9971, 42731, 99859...","[3954, 311, 3950, 3950, 3954, 3926, 4562, 4573..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2141,96254,140723,2129-08-14 14:02:00,2129-08-14 20:03:00,2129-08-14 20:03:00,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,DEAD/EXPIRED,Medicare,ENGL,...,,UNKNOWN/NOT SPECIFIED,,,MASSIVE HEMOPTYSIS,1,1,"[(2129-08-14 13:18:00, 50868, 10.0, nan), (212...","[1628, 78630, 49121, 4271, 2875, 4414, V8741, ...","[3979, 9671, 8842, 8844, 8848, 3322, 9605, 9960]"
2142,90680,193278,2180-08-28 12:00:00,2180-09-06 18:33:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,ENGL,...,MARRIED,WHITE,,,AORTIC STENOSIS\AORTIC VALVE / ASCENDING AORTA...,0,1,"[(2180-08-28 06:47:00, 50802, -2.0, nan), (218...","[4241, 42820, 5990, 2762, 5180, 4412, 41401, 3...","[3521, 3615, 3845, 3761, 3961]"
2143,90700,155233,2126-12-02 17:50:00,2126-12-09 19:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Medicare,ENGL,...,MARRIED,UNKNOWN/NOT SPECIFIED,,,VENTRICULAR TACHYCARDIA,0,1,"[(2126-12-03 05:10:00, 50861, 17.0, nan), (212...","[4271, 45381, 42820, 41410, 45829, 25000, 5853...","[3734, 64, 3726, 3727]"
2144,95408,121497,2150-07-14 17:36:00,2150-07-21 14:02:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,ENGL,...,MARRIED,WHITE,,,THORACOABDOMINAL ANEURYSM/SDA,0,1,"[(2150-07-14 14:57:00, 51355, 24.0, nan), (215...","[4417, 4019, 53081, 2720]","[3844, 3924]"


In [11]:
CONTROL_ADMISSIONS_MERGED

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,...,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS (LABEL),HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,Lab Events,DIAGNOSIS (ICD_9),PROCEDURE TYPE
47366,58526,100001,2117-09-11 11:46:00,2117-09-17 16:45:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME,Private,ENGL,...,DIVORCED,WHITE,2117-09-11 08:59:00,2117-09-11 12:35:00,DIABETIC KETOACIDOSIS,0,1,"[(2117-09-11 08:22:00, 50861, 10.0, nan), (211...","[25013, 3371, 5849, 5780, V5867, 25063, 5363, ...",
47024,54610,100003,2150-04-17 15:34:00,2150-04-21 17:30:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,ENGL,...,SINGLE,WHITE,2150-04-17 13:10:00,2150-04-17 17:47:00,UPPER GI BLEED,0,1,"[(2150-04-17 12:30:00, 50861, 126.0, abnormal)...","[53100, 2851, 07054, 5715, 45621, 53789, 4019,...","[4443, 9607, 9904, 3893]"
9046,9895,100006,2108-04-06 15:49:00,2108-04-18 17:18:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,...,SINGLE,BLACK/AFRICAN AMERICAN,2108-04-06 11:39:00,2108-04-06 17:56:00,COPD FLARE,0,1,"[(2108-04-06 11:30:00, 50868, 19.0, nan), (210...","[49320, 51881, 486, 20300, 2761, 7850, 3090, V...","[9390, 9925]"
27709,23018,100007,2145-03-31 05:33:00,2145-04-07 12:40:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,...,MARRIED,WHITE,2145-03-30 20:43:00,2145-03-31 06:08:00,BOWEL OBSTRUCTION,0,1,"[(2145-03-30 21:15:00, 50861, 12.0, nan), (214...","[56081, 5570, 9973, 486, 4019]","[4562, 5459]"
843,533,100009,2162-05-16 15:56:00,2162-05-21 13:37:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Private,,...,MARRIED,WHITE,,,CORONARY ARTERY DISEASE,0,1,"[(2162-05-16 16:00:00, 50852, 7.9, abnormal), ...","[41401, 99604, 4142, 25000, 27800, V8535, 4148...","[3613, 3615, 3795, 3961]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24035,20785,199993,2161-10-23 18:01:00,2161-11-17 08:10:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,SHORT TERM HOSPITAL,Private,,...,DIVORCED,UNKNOWN/NOT SPECIFIED,2161-10-23 16:23:00,2161-10-23 18:45:00,CORONARY ARTERY DISEASE,0,1,"[(2161-10-23 18:51:00, 51221, 34.0, abnormal),...","[41031, 42821, 42731, 4271, 5180, 4240, 2760, ...","[3614, 3512, 3761, 8842, 8848, 3961, 3964, 340..."
30073,23761,199994,2188-07-07 18:47:00,2188-07-17 13:31:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicare,,...,SINGLE,WHITE,2188-07-07 15:55:00,2188-07-07 20:21:00,MENTAL STATUS CHANGES,0,1,"[(2188-07-07 09:48:00, 50893, 9.2, nan), (2188...","[486, 4280, 51881, 3970, 496, 4169, 585, 42732...","[9671, 9604, 3995, 3891]"
25219,19412,199995,2137-12-11 17:35:00,2137-12-28 12:30:00,,EMERGENCY,PHYS REFERRAL/NORMAL DELI,HOME,Private,ENGL,...,SINGLE,WHITE,,,ROOT ABSCESS,0,1,"[(2137-12-11 19:55:00, 50861, 12.0, nan), (213...","[4210, 7464, 42971, 30401, 4412, 44284, V1259,...","[3521, 3961, 3845, 3539, 8841, 8847, 9929, 887..."
33394,27200,199998,2119-02-18 16:26:00,2119-02-24 14:25:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,...,MARRIED,WHITE,,,CORONARY ARTERY DISEASE,0,1,"[(2119-02-18 17:58:00, 50868, 15.0, nan), (211...","[41401, 9971, 9975, 42731, 78820, 4111, V4582,...","[3612, 3615, 3964]"


In [12]:
print(PATIENT_ADMISSIONS_MERGED.columns)
print(PATIENT_ADMISSIONS_MERGED.shape)

Index(['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS (LABEL)', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'Lab Events', 'DIAGNOSIS (ICD_9)',
       'PROCEDURE TYPE'],
      dtype='object')
(1644, 21)


In [13]:
print(CONTROL_ADMISSIONS_MERGED.columns)
print(CONTROL_ADMISSIONS_MERGED.shape)

Index(['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
       'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'DISCHARGE_LOCATION',
       'INSURANCE', 'LANGUAGE', 'RELIGION', 'MARITAL_STATUS', 'ETHNICITY',
       'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS (LABEL)', 'HOSPITAL_EXPIRE_FLAG',
       'HAS_CHARTEVENTS_DATA', 'Lab Events', 'DIAGNOSIS (ICD_9)',
       'PROCEDURE TYPE'],
      dtype='object')
(58976, 21)


In [14]:
#print number of unique patients in each group after filtering
print("Number of unique patients diagnosed with AD: ", PATIENT_ADMISSIONS_MERGED['SUBJECT_ID'].nunique())
print("Number of unique control patients: ", CONTROL_ADMISSIONS_MERGED['SUBJECT_ID'].nunique())

Number of unique patients diagnosed with AD:  1486
Number of unique control patients:  46520


We need to add another Column having a hospital amdission ID value for each event further back from the event.

NOTE that we need to label the most recent event as 0 and count backwards in order to have all patients match.

Need to search in the literature what to do with repeated lab values. For example, if a patient across multiple admissions has their SpO2 value taken 6 times, what do we do with those values? For example, take average? Take most recent? Not sure