In [1]:
import pandas as pd
import numpy as np

import warnings

from select import select

warnings.filterwarnings('ignore')

In [2]:
path = "J:\\mimic-iii-clinical-database-1.4\\mimic-iii-clinical-database-1.4"

In [3]:
# load Patients and Diagnoses_icd
patients = pd.read_csv(path + '\\patients.csv')
diagnoses = pd.read_csv(path + '\\diagnoses_icd.csv')
admissions = pd.read_csv(path + '\\admissions.csv')

In [4]:
print(f"Patients shape is {patients.shape}")
print(f"Diagnoses shape is {diagnoses.shape}")
print()

# Check if ICD codes are numeric or not
icd_column_dtype = diagnoses['ICD9_CODE'].dtype
print("Checking ICD codes:")
print(f"\tICD codes column dtype is {icd_column_dtype}")

# Check if all ICD codes are numeric
if pd.api.types.is_numeric_dtype(diagnoses['ICD9_CODE']):
    print("\tAll ICD codes are numeric.")
else:
    print("\tICD codes are not entirely numeric. Attempting to clean and convert...")

    # Attempt to clean and convert to numeric, ignoring errors
    diagnoses['ICD9_CODE'] = pd.to_numeric(diagnoses['ICD9_CODE'], errors='coerce')

    # Report how many could not be converted
    null_count = diagnoses['ICD9_CODE'].isnull().sum()
    print(f"\tConverted ICD codes to numeric. {null_count} rows could not be converted and contain NaN.")


Patients shape is (46520, 8)
Diagnoses shape is (651047, 5)

Checking ICD codes:
	ICD codes column dtype is object
	ICD codes are not entirely numeric. Attempting to clean and convert...
	Converted ICD codes to numeric. 97296 rows could not be converted and contain NaN.


In [5]:
# checking why ICD9_CODE columns is Object
diagnoses['ICD9_CODE'].isna().sum()

97296

In [6]:
diagnoses_cleaned = diagnoses.dropna(subset=['ICD9_CODE'])
# Verify the result
print(f"Diagnoses shape after dropping missing ICD9_CODE: {diagnoses_cleaned.shape}")

Diagnoses shape after dropping missing ICD9_CODE: (553751, 5)


In [7]:
# Check if ICD codes are numeric or not
icd_column_dtype = diagnoses_cleaned['ICD9_CODE'].dtype
print("Checking ICD codes:")
print(f"\tICD codes column dtype is {icd_column_dtype}")

# Check if all ICD codes are numeric
if pd.api.types.is_numeric_dtype(diagnoses['ICD9_CODE']):
    print("\tAll ICD codes are numeric.")
else:
    print("\tICD codes are not entirely numeric. Attempting to clean and convert...")

    # Attempt to clean and convert to numeric, ignoring errors
    diagnoses['ICD9_CODE'] = pd.to_numeric(diagnoses['ICD9_CODE'], errors='coerce')

    # Report how many could not be converted
    null_count = diagnoses['ICD9_CODE'].isnull().sum()
    print(f"\tConverted ICD codes to numeric. {null_count} rows could not be converted and contain NaN.")


Checking ICD codes:
	ICD codes column dtype is float64
	All ICD codes are numeric.


In [8]:
# now lets extract sepsis patients ( sepsis , severe sepsis , septic shock)
sepsis_codes = {
    99591.0: 'Sepsis',
    99592.0: 'Severe Sepsis',
    78552.0: 'Septic Shock'
}

In [9]:
# Filter for sepsis-related diagnoses
sepsis_diagnoses = diagnoses_cleaned[diagnoses_cleaned['ICD9_CODE'].isin(sepsis_codes.keys())]


In [10]:
sepsis_diagnoses.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE'], dtype='object')

In [11]:
sepsis_diagnoses.shape

(7770, 5)

In [12]:
# Merge with the patients table to get demographic details
sepsis_patients = sepsis_diagnoses.merge(patients, on='SUBJECT_ID', how='inner')

In [13]:
print(sepsis_patients.head())
sepsis_patients.shape

   ROW_ID_x  SUBJECT_ID  HADM_ID  SEQ_NUM  ICD9_CODE  ROW_ID_y GENDER  \
0      1547         117   164853     16.0    99592.0       108      F   
1      1604         124   138376      6.0    99592.0       114      M   
2       505          64   172056      3.0    99591.0        57      F   
3       679          85   112077     18.0    99591.0        77      M   
4       131          21   111970      2.0    78552.0        18      M   

                   DOB                  DOD             DOD_HOSP  \
0  2083-12-28 00:00:00  2133-12-01 00:00:00  2133-12-01 00:00:00   
1  2090-11-19 00:00:00  2166-02-01 00:00:00  2166-02-01 00:00:00   
2  2116-06-27 00:00:00                  NaN                  NaN   
3  2090-09-18 00:00:00  2167-09-12 00:00:00  2167-09-12 00:00:00   
4  2047-04-04 00:00:00  2135-02-08 00:00:00  2135-02-08 00:00:00   

               DOD_SSN  EXPIRE_FLAG  
0  2133-12-01 00:00:00            1  
1  2166-02-01 00:00:00            1  
2                  NaN            0  


(7770, 12)

In [14]:
print(len(diagnoses['ICD9_CODE'].unique()))

5954


## ----------------------------------------------------

## ----------------------------------------------------

# Now exclude patients with multiple admissions except for first admissions

In [15]:
admissions = pd.read_csv(path + '\\admissions.csv')

In [16]:
admissions.shape

(58976, 19)

In [17]:
admissions.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [18]:
admissions['ADMITTIME'].isna().sum()

0

In [19]:
# Count the number of admissions per patient
# finding admission count isn't useful because we need to delete multiple sepsis admission except first
admission_counts = admissions.groupby('SUBJECT_ID').size().reset_index(name='ADMISSION_COUNT')

print(admission_counts)


       SUBJECT_ID  ADMISSION_COUNT
0               2                1
1               3                1
2               4                1
3               5                1
4               6                1
...           ...              ...
46515       99985                1
46516       99991                1
46517       99992                1
46518       99995                1
46519       99999                1

[46520 rows x 2 columns]


In [20]:
# Filter out patients with multiple admissions
# as we see here is not useful
single_admissions_patients = admission_counts[admission_counts['ADMISSION_COUNT'] == 1]

print(single_admissions_patients)

       SUBJECT_ID  ADMISSION_COUNT
0               2                1
1               3                1
2               4                1
3               5                1
4               6                1
...           ...              ...
46515       99985                1
46516       99991                1
46517       99992                1
46518       99995                1
46519       99999                1

[38983 rows x 2 columns]


In [21]:
# choose sepsis patients from admissions
filtered_admissions = admissions[admissions['SUBJECT_ID'].isin(sepsis_patients['SUBJECT_ID'])]
filtered_admissions.shape

(8703, 19)

In [22]:
filtered_admissions.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
19,40,38,185910,2166-08-10 00:28:00,2166-09-04 11:30:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,LONG TERM CARE HOSPITAL,Medicare,,CATHOLIC,WIDOWED,WHITE,,,ACUTE MYOCARDIAL INFARCTION-SEPSIS,0,1
22,456,357,174486,2197-12-06 07:13:00,2198-01-03 14:00:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,2197-12-06 06:28:00,2197-12-06 07:43:00,GASTROINTESTINAL BLEED,0,1
23,457,357,145674,2198-08-02 04:49:00,2198-10-26 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,2198-08-02 02:03:00,2198-08-02 05:37:00,SEIZURE,0,1
24,458,357,122609,2198-11-01 22:36:00,2198-11-14 14:20:00,,EMERGENCY,EMERGENCY ROOM ADMIT,REHAB/DISTINCT PART HOSP,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,2198-11-01 18:01:00,2198-11-01 23:06:00,SEPSIS,0,1
25,459,357,101651,2199-10-20 12:05:00,2199-10-23 17:30:00,,EMERGENCY,EMERGENCY ROOM ADMIT,SNF,Private,ENGL,NOT SPECIFIED,MARRIED,WHITE,2199-10-20 09:34:00,2199-10-20 13:25:00,PNEUMONIA,0,1


In [23]:
filtered_admissions = filtered_admissions[['SUBJECT_ID', 'ADMITTIME']].dropna(subset=['ADMITTIME'])
filtered_admissions.head()

Unnamed: 0,SUBJECT_ID,ADMITTIME
19,38,2166-08-10 00:28:00
22,357,2197-12-06 07:13:00
23,357,2198-08-02 04:49:00
24,357,2198-11-01 22:36:00
25,357,2199-10-20 12:05:00


In [24]:
filtered_admissions = filtered_admissions.sort_values(by='ADMITTIME')
filtered_admissions.head()

Unnamed: 0,SUBJECT_ID,ADMITTIME
7378,4521,2100-06-28 19:29:00
37595,31585,2100-07-02 19:28:00
38886,42357,2100-07-14 02:04:00
50353,73131,2100-07-14 18:14:00
54560,84585,2100-08-03 00:26:00


In [25]:
print(sepsis_patients["SUBJECT_ID"].unique().__len__())
print(sepsis_patients.shape)

4689
(7770, 12)


In [26]:
print(filtered_admissions["SUBJECT_ID"].unique().__len__())
print(filtered_admissions.shape)

4689
(8703, 2)


In [27]:
# Sort the DataFrame by ADMITTIME
filtered_admissions_sorted = filtered_admissions.sort_values(by='ADMITTIME')

# Group by `subject_id` and keep the first occurrence for duplicates
unique_or_first_admissions = filtered_admissions_sorted.groupby('SUBJECT_ID', as_index=False).first()

unique_or_first_admissions.shape

(4689, 2)

In [28]:
unique_or_first_admissions.head()

Unnamed: 0,SUBJECT_ID,ADMITTIME
0,21,2134-09-11 12:17:00
1,38,2166-08-10 00:28:00
2,61,2118-06-17 14:40:00
3,62,2113-02-15 00:19:00
4,64,2143-03-03 09:25:00


In [29]:
print(unique_or_first_admissions.SUBJECT_ID.duplicated().sum())

0


In [30]:
sepsis_patients_corrected = unique_or_first_admissions

# ----------------------------------------------

# Now we should implement filters

In [31]:
patients.DOB.sample(10)

13299    2131-11-09 00:00:00
33282    2118-06-17 00:00:00
34404    2139-12-15 00:00:00
8850     2131-05-10 00:00:00
1198     2111-07-10 00:00:00
2291     1892-10-14 00:00:00
18800    1887-04-05 00:00:00
40232    2057-11-26 00:00:00
24655    2139-06-28 00:00:00
8227     2183-06-29 00:00:00
Name: DOB, dtype: object

In [32]:
# Convert DOB to datetime format
patients['DOB'] = pd.to_datetime(patients['DOB'], errors='coerce')

# Drop rows with shifted DOBs (year < 1900)
patients_cleaned = patients[patients['DOB'].dt.year >= 1900]

# Display the number of remaining rows
print(f"Number of rows after dropping shifted DOBs: {patients_cleaned.shape[0]}")


Number of rows after dropping shifted DOBs: 44561


In [33]:
patients_cleaned.shape

(44561, 8)

In [34]:
admissions.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [35]:
admissions = admissions.dropna(subset=['ADMITTIME'])

print(admissions['ADMITTIME'].dtype)


object


In [36]:
patients_cleaned['DOB'] = patients_cleaned['DOB'].dt.to_pydatetime()

In [37]:

# Ensure the 'ADMITTIME' column is in datetime format
admissions['ADMITTIME'] = pd.to_datetime(admissions['ADMITTIME'], errors='coerce')

# Now you can safely use .dt to access datetime-related properties
admissions['ADMITTIME'] = admissions['ADMITTIME'].dt.to_pydatetime()



In [38]:
# Merge PATIENTS (cleaned) and ADMISSIONS (cleaned)
patients_with_admissions_with_age = pd.merge(
    patients_cleaned[['SUBJECT_ID', 'DOB']],
    admissions[['SUBJECT_ID', 'ADMITTIME']],
    on='SUBJECT_ID',
    how='inner'
)

# Calculate age using native Python datetime
patients_with_admissions_with_age['AGE'] = patients_with_admissions_with_age.apply(
    lambda row: (row['ADMITTIME'].to_pydatetime() - row['DOB'].to_pydatetime()).days // 365 if row['DOB'] and row['ADMITTIME'] else None,
    axis=1
)

# Display results
print(patients_with_admissions_with_age[['SUBJECT_ID', 'DOB', 'ADMITTIME', 'AGE']].head())

   SUBJECT_ID        DOB           ADMITTIME  AGE
0         249 2075-03-13 2149-12-17 20:41:00   74
1         249 2075-03-13 2155-02-03 20:16:00   79
2         249 2075-03-13 2156-04-27 15:33:00   81
3         250 2164-12-27 2188-11-12 09:22:00   23
4         251 2090-03-15 2110-07-27 06:46:00   20


In [39]:
patients_with_admissions_with_age.shape

(56401, 4)

In [40]:
# choose sepsis patients from admissions
filtered_age = patients_with_admissions_with_age[patients_with_admissions_with_age['SUBJECT_ID'].isin(sepsis_patients_corrected['SUBJECT_ID'])]
filtered_age.shape

(8113, 4)

In [41]:
filtered_age = filtered_age.sort_values(by='ADMITTIME')
filtered_admissions.head()


Unnamed: 0,SUBJECT_ID,ADMITTIME
7378,4521,2100-06-28 19:29:00
37595,31585,2100-07-02 19:28:00
38886,42357,2100-07-14 02:04:00
50353,73131,2100-07-14 18:14:00
54560,84585,2100-08-03 00:26:00


In [42]:
print(filtered_age["SUBJECT_ID"].unique().__len__())
print(filtered_age.shape)


4340
(8113, 4)


In [43]:
# Group by `subject_id` and keep the first occurrence for duplicates
unique_or_first_age = filtered_age.groupby('SUBJECT_ID', as_index=False).first()


In [44]:
unique_or_first_age.shape
unique_or_first_age.head()

Unnamed: 0,SUBJECT_ID,DOB,ADMITTIME,AGE
0,21,2047-04-04,2134-09-11 12:17:00,87
1,38,2090-08-31,2166-08-10 00:28:00,75
2,61,2063-10-21,2118-06-17 14:40:00,54
3,62,2044-05-08,2113-02-15 00:19:00,68
4,64,2116-06-27,2143-03-03 09:25:00,26


In [45]:
unique_or_first_age.shape

(4340, 4)

In [46]:
age_df = unique_or_first_age



In [47]:
age_df.shape

(4340, 4)

# set age to sepsis patients

In [49]:
# Merge the dataframes on 'subject_id'
sepsis_age_df = sepsis_patients_corrected.merge(age_df, on='SUBJECT_ID', how='left')

# Save the result to a new CSV file or replace the existing one


# after age

In [50]:
sepsis_age_df.head()

Unnamed: 0,SUBJECT_ID,ADMITTIME_x,DOB,ADMITTIME_y,AGE
0,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0
1,38,2166-08-10 00:28:00,2090-08-31,2166-08-10 00:28:00,75.0
2,61,2118-06-17 14:40:00,2063-10-21,2118-06-17 14:40:00,54.0
3,62,2113-02-15 00:19:00,2044-05-08,2113-02-15 00:19:00,68.0
4,64,2143-03-03 09:25:00,2116-06-27,2143-03-03 09:25:00,26.0


In [51]:
filtered_sepsis_patientes_under18_df = sepsis_age_df[sepsis_age_df['AGE'] > 18]

# Print the shape and first few rows of the filtered DataFrame
print("Shape after removing under 18 ages:", filtered_sepsis_patientes_under18_df.shape)
filtered_sepsis_patientes_under18_df.head()

Shape after removing under 18 ages: (4327, 5)


Unnamed: 0,SUBJECT_ID,ADMITTIME_x,DOB,ADMITTIME_y,AGE
0,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0
1,38,2166-08-10 00:28:00,2090-08-31,2166-08-10 00:28:00,75.0
2,61,2118-06-17 14:40:00,2063-10-21,2118-06-17 14:40:00,54.0
3,62,2113-02-15 00:19:00,2044-05-08,2113-02-15 00:19:00,68.0
4,64,2143-03-03 09:25:00,2116-06-27,2143-03-03 09:25:00,26.0


In [55]:
# Load the required MIMIC-III files for demographic data
patients_df = pd.read_csv(path + '\\patients.csv', usecols=['SUBJECT_ID','GENDER'])
admissions_df = pd.read_csv(path + '\\admissions.csv', usecols=['SUBJECT_ID', 'ETHNICITY'])
# Merge demographic data from patients and admissions on SUBJECT_ID
demographic_df = pd.merge(
    patients_df,
    admissions_df.drop_duplicates(subset=['SUBJECT_ID']),
    on='SUBJECT_ID',
    how='inner'
)
print(demographic_df.head())

   SUBJECT_ID GENDER               ETHNICITY
0         249      F                   WHITE
1         250      F  BLACK/AFRICAN AMERICAN
2         251      M   UNKNOWN/NOT SPECIFIED
3         252      M                   WHITE
4         253      F                   WHITE


In [56]:
# Merge the demographic data with the filtered DataFrame (adults only)
final_df_with_demo = pd.merge(
    filtered_sepsis_patientes_under18_df,           # Filtered data without under-18 patients
    demographic_df,        # Demographic data
    on='SUBJECT_ID',
    how='left'             # Left join to retain all rows in filtered_df
)
final_df_with_demo.head()

Unnamed: 0,SUBJECT_ID,ADMITTIME_x,DOB,ADMITTIME_y,AGE,GENDER,ETHNICITY
0,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0,M,WHITE
1,38,2166-08-10 00:28:00,2090-08-31,2166-08-10 00:28:00,75.0,M,WHITE
2,61,2118-06-17 14:40:00,2063-10-21,2118-06-17 14:40:00,54.0,M,WHITE
3,62,2113-02-15 00:19:00,2044-05-08,2113-02-15 00:19:00,68.0,M,PATIENT DECLINED TO ANSWER
4,64,2143-03-03 09:25:00,2116-06-27,2143-03-03 09:25:00,26.0,F,BLACK/AFRICAN AMERICAN


In [57]:
final_cleaned_df = final_df_with_demo.dropna(subset=['GENDER', 'DOB', 'ETHNICITY'])

# Print the resulting shape and a preview
print("Shape after removing patients without demographic data:", final_cleaned_df.shape)
final_cleaned_df.head()

Shape after removing patients without demographic data: (4327, 7)


Unnamed: 0,SUBJECT_ID,ADMITTIME_x,DOB,ADMITTIME_y,AGE,GENDER,ETHNICITY
0,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0,M,WHITE
1,38,2166-08-10 00:28:00,2090-08-31,2166-08-10 00:28:00,75.0,M,WHITE
2,61,2118-06-17 14:40:00,2063-10-21,2118-06-17 14:40:00,54.0,M,WHITE
3,62,2113-02-15 00:19:00,2044-05-08,2113-02-15 00:19:00,68.0,M,PATIENT DECLINED TO ANSWER
4,64,2143-03-03 09:25:00,2116-06-27,2143-03-03 09:25:00,26.0,F,BLACK/AFRICAN AMERICAN


In [60]:
# Load lab events and lab items tables
labevents = pd.read_csv(path + '\\labevents.csv', usecols=['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM', 'VALUEUOM', 'FLAG'])
labitems = pd.read_csv(path + '\\d_labitems.csv', usecols=['ITEMID', 'LABEL', 'CATEGORY', 'FLUID'])

# Merge labevents with labitems to get descriptive labels for lab tests
lab_results = pd.merge(
    labevents,
    labitems,
    on='ITEMID',
    how='left'
)

In [61]:
print("Shape of lab_results:", lab_results.shape)
lab_results.head()

Shape of lab_results: (27854055, 9)


Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUENUM,VALUEUOM,FLAG,LABEL,FLUID,CATEGORY
0,3,50820,2101-10-12 16:07:00,7.39,units,,pH,Blood,Blood Gas
1,3,50800,2101-10-12 18:17:00,,,,SPECIMEN TYPE,BLOOD,BLOOD GAS
2,3,50802,2101-10-12 18:17:00,-1.0,mEq/L,,Base Excess,Blood,Blood Gas
3,3,50804,2101-10-12 18:17:00,22.0,mEq/L,,Calculated Total CO2,Blood,Blood Gas
4,3,50808,2101-10-12 18:17:00,0.93,mmol/L,abnormal,Free Calcium,Blood,Blood Gas


In [62]:
sepsis_with_lab_pd = pd.merge(
    final_cleaned_df,
    lab_results,
    on = 'SUBJECT_ID',
    how = 'left'
) 
sepsis_with_lab_pd.head()

Unnamed: 0,SUBJECT_ID,ADMITTIME_x,DOB,ADMITTIME_y,AGE,GENDER,ETHNICITY,ITEMID,CHARTTIME,VALUENUM,VALUEUOM,FLAG,LABEL,FLUID,CATEGORY
0,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0,M,WHITE,51274.0,2134-09-15 01:47:00,15.5,sec,abnormal,PT,Blood,Hematology
1,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0,M,WHITE,51275.0,2134-09-15 01:47:00,49.8,sec,abnormal,PTT,Blood,Hematology
2,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0,M,WHITE,51277.0,2134-09-15 01:47:00,18.0,%,abnormal,RDW,Blood,Hematology
3,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0,M,WHITE,51279.0,2134-09-15 01:47:00,3.97,m/uL,abnormal,Red Blood Cells,Blood,Hematology
4,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0,M,WHITE,51301.0,2134-09-15 01:47:00,15.1,K/uL,abnormal,White Blood Cells,Blood,Hematology


In [63]:
sepsis_with_lab_pd.shape

(6752271, 15)

In [64]:
# Drop SUBJECT_IDs that do not appear in labevents
filtered_df_with_labs = final_cleaned_df[final_cleaned_df['SUBJECT_ID'].isin(lab_results['SUBJECT_ID'])]
labevents_unique = labevents.drop_duplicates(subset=['SUBJECT_ID'], keep='first')
final_with_labs = pd.merge(
    filtered_df_with_labs,
    labevents_unique[['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM', 'VALUEUOM', 'FLAG']],
    on='SUBJECT_ID',
    how='inner'  # Ensure we keep only matching SUBJECT_IDs
)
print("Shape after keeping unique lab results per SUBJECT_ID:", final_with_labs.shape)
final_with_labs.head()

Shape after keeping unique lab results per SUBJECT_ID: (4326, 12)


Unnamed: 0,SUBJECT_ID,ADMITTIME_x,DOB,ADMITTIME_y,AGE,GENDER,ETHNICITY,ITEMID,CHARTTIME,VALUENUM,VALUEUOM,FLAG
0,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0,M,WHITE,51274,2134-09-15 01:47:00,15.5,sec,abnormal
1,38,2166-08-10 00:28:00,2090-08-31,2166-08-10 00:28:00,75.0,M,WHITE,50893,2166-08-31 01:39:00,8.1,mg/dL,abnormal
2,61,2118-06-17 14:40:00,2063-10-21,2118-06-17 14:40:00,54.0,M,WHITE,51256,2118-07-10 03:00:00,49.0,%,abnormal
3,62,2113-02-15 00:19:00,2044-05-08,2113-02-15 00:19:00,68.0,M,PATIENT DECLINED TO ANSWER,50868,2113-02-14 20:20:00,13.0,mEq/L,
4,64,2143-03-03 09:25:00,2116-06-27,2143-03-03 09:25:00,26.0,F,BLACK/AFRICAN AMERICAN,51279,2143-06-23 19:11:00,4.32,m/uL,delta


In [65]:
# Calculate the percentage of missing values for each row
missing_percentage = final_with_labs.isnull().mean(axis=1) * 100

# Filter out rows where the percentage of missing values exceeds 20%
final_cleaned_df_no_missing = final_with_labs[missing_percentage <= 20]

# Print the shape and a few rows of the cleaned DataFrame
print("Shape after removing rows with >20% missing data:", final_cleaned_df_no_missing.shape)
final_cleaned_df_no_missing.head()

Shape after removing rows with >20% missing data: (4018, 12)


Unnamed: 0,SUBJECT_ID,ADMITTIME_x,DOB,ADMITTIME_y,AGE,GENDER,ETHNICITY,ITEMID,CHARTTIME,VALUENUM,VALUEUOM,FLAG
0,21,2134-09-11 12:17:00,2047-04-04,2134-09-11 12:17:00,87.0,M,WHITE,51274,2134-09-15 01:47:00,15.5,sec,abnormal
1,38,2166-08-10 00:28:00,2090-08-31,2166-08-10 00:28:00,75.0,M,WHITE,50893,2166-08-31 01:39:00,8.1,mg/dL,abnormal
2,61,2118-06-17 14:40:00,2063-10-21,2118-06-17 14:40:00,54.0,M,WHITE,51256,2118-07-10 03:00:00,49.0,%,abnormal
3,62,2113-02-15 00:19:00,2044-05-08,2113-02-15 00:19:00,68.0,M,PATIENT DECLINED TO ANSWER,50868,2113-02-14 20:20:00,13.0,mEq/L,
4,64,2143-03-03 09:25:00,2116-06-27,2143-03-03 09:25:00,26.0,F,BLACK/AFRICAN AMERICAN,51279,2143-06-23 19:11:00,4.32,m/uL,delta


In [82]:
chartevents = pd.read_csv(path + '\\chartevents.csv')

MemoryError: Unable to allocate 512. KiB for an array with shape (65536,) and data type int64