In [None]:
# Install pandas into the notebook environment. This bare form relies on
# IPython's automagic; `%pip install pandas` is the explicit equivalent.
# NOTE(review): no version is pinned, so re-runs may pick up a different pandas.
pip install pandas




In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np

# Mount your Google Drive to give Colab access to your files
# NOTE(review): the mount point below is an empty placeholder; the recorded
# output of this cell shows "Mounted at /content/drive", so the original run
# presumably passed '/content/drive' -- fill it in before executing.
from google.colab import drive
drive.mount('')

# --- IMPORTANT: Define the path to your MIMIC-IV CSV files in Google Drive ---
# Later cells build file names with plain string concatenation
# (e.g. DRIVE_PATH + 'd_labitems.csv'), so a non-empty value must end with a
# path separator ('/'), otherwise the directory and file name run together.

DRIVE_PATH = ''

# This message is printed unconditionally; it does not verify that the mount
# succeeded or that DRIVE_PATH points at an existing directory.
print("Environment setup complete. Google Drive is mounted.")

Mounted at /content/drive
Environment setup complete. Google Drive is mounted.


In [None]:
print("\n--- Starting Part 1: Discovery Phase ---")

# --- Load the dictionary files ---
# d_labitems.csv maps lab itemids to labels; d_icd_diagnoses.csv maps ICD codes
# to their long titles. Both are read relative to DRIVE_PATH (plain string
# concatenation, so DRIVE_PATH must end with a path separator).
try:
    d_labitems_df = pd.read_csv(DRIVE_PATH + 'd_labitems.csv')
    d_icd_diagnoses_df = pd.read_csv(DRIVE_PATH + 'd_icd_diagnoses.csv')
    print("Successfully loaded d_labitems.csv and d_icd_diagnoses.csv.")
except FileNotFoundError:
    print("ERROR: Could not find the dictionary files. Please check your DRIVE_PATH.")
    # Re-raise instead of calling exit(): inside an IPython/Colab kernel,
    # exit() asks the kernel to shut down (discarding all notebook state)
    # rather than raising, so it does not stop the cell at this line.
    # Re-raising aborts the cell immediately and keeps the kernel alive.
    raise

# --- 1a. Discover ICD-10 Codes for Sepsis and AKI ---
print("\n--- Discovering ICD-10 Codes ---")

# Shared building blocks: the title column and an "is ICD-10" row mask.
_icd_titles = d_icd_diagnoses_df['long_title']
_is_icd10 = d_icd_diagnoses_df['icd_version'] == 10

# Sepsis: case-insensitive substring match on the long title, ICD-10 rows only.
# Rows with a missing title are treated as non-matching (na=False).
_mentions_sepsis = _icd_titles.str.contains('Sepsis', case=False, na=False)
sepsis_codes_df = d_icd_diagnoses_df[_mentions_sepsis & _is_icd10]
print("\nPotential ICD-10 Codes for Sepsis:")
print(sepsis_codes_df)

# AKI: same approach, keyed on the phrase "Acute kidney".
_mentions_aki = _icd_titles.str.contains('Acute kidney', case=False, na=False)
aki_codes_df = d_icd_diagnoses_df[_mentions_aki & _is_icd10]
print("\nPotential ICD-10 Codes for Acute Kidney Injury:")
print(aki_codes_df)


# --- 1b. Discover Item IDs for Lab Panels ---
print("\n--- Discovering Lab Test Item IDs ---")

# Test names per panel, as listed in the base paper's appendix.
cbc_test_names = [
    'Hematocrit',
    'Platelet Count',
    'White Blood Cell Count',
    'Red Blood Cell Count',
    'Hemoglobin',
]
cmp_test_names = [
    'Creatinine',
    'Bicarbonate',
    'Glucose',
    'Potassium',
    'Urea Nitrogen',
    'Anion Gap',
    'Lactate',
]

# Keep only the d_labitems rows whose label is exactly one of the panel names
# (isin is an exact, case-sensitive match -- no partial matching here).
_lab_labels = d_labitems_df['label']
cbc_items_df = d_labitems_df.loc[_lab_labels.isin(cbc_test_names)]
cmp_items_df = d_labitems_df.loc[_lab_labels.isin(cmp_test_names)]

print("\nDiscovered CBC Panel Item IDs:")
print(cbc_items_df)
print("\nDiscovered CMP Panel Item IDs:")
print(cmp_items_df)

print("\n--- Part 1: Discovery Phase Complete ---")
print("Please review the codes and IDs above. You will copy them into the lists in Part 2.")


--- Starting Part 1: Discovery Phase ---
Successfully loaded d_labitems.csv and d_icd_diagnoses.csv.

--- Discovering ICD-10 Codes ---

Potential ICD-10 Codes for Sepsis:
      icd_code  icd_version                                         long_title
12257     A021           10                                  Salmonella sepsis
12413     A227           10                                     Anthrax sepsis
12435     A267           10                              Erysipelothrix sepsis
12470     A327           10                                   Listerial sepsis
12529      A40           10                               Streptococcal sepsis
12530     A400           10               Sepsis due to streptococcus, group A
12531     A401           10               Sepsis due to streptococcus, group B
12532     A403           10             Sepsis due to Streptococcus pneumoniae
12533     A408           10                         Other streptococcal sepsis
12534     A409           10           

In [None]:
import pandas as pd
import numpy as np



print("\n--- Starting Part 1: Discovery Phase (Comprehensive) ---")

# --- Load the dictionary files ---
# Both dictionaries are read relative to DRIVE_PATH via plain string
# concatenation, so DRIVE_PATH must end with a path separator.
try:
    d_labitems_df = pd.read_csv(DRIVE_PATH + 'd_labitems.csv')
    d_icd_diagnoses_df = pd.read_csv(DRIVE_PATH + 'd_icd_diagnoses.csv')
    print(" Successfully loaded d_labitems.csv and d_icd_diagnoses.csv.")
except FileNotFoundError as e:
    print(f" ERROR: Could not find dictionary files. Please check your DRIVE_PATH. Details: {e}")
    # Re-raise instead of calling exit(): inside an IPython/Colab kernel,
    # exit() requests a kernel shutdown (losing all notebook state) rather
    # than raising, so it does not halt the cell at this point.
    raise

# --- PRELIMINARY STEP: Confirm Column Names ---
# Print the headers of both dictionary files so the column names used below
# ('long_title', 'icd_version', 'label', 'fluid', 'itemid') can be checked.
print("\n--- Confirming column names from dictionary files ---")
print(f"Columns in d_icd_diagnoses.csv: {list(d_icd_diagnoses_df.columns)}")
print(f"Columns in d_labitems.csv: {list(d_labitems_df.columns)}")

# --- 1a. Discover ICD-10 Codes ---
# Case-insensitive substring search on the long title, restricted to ICD-10.
_title_col = d_icd_diagnoses_df['long_title']
_icd10_rows = d_icd_diagnoses_df['icd_version'] == 10

sepsis_codes_df = d_icd_diagnoses_df[
    _title_col.str.contains('Sepsis', case=False, na=False) & _icd10_rows
]
aki_codes_df = d_icd_diagnoses_df[
    _title_col.str.contains('Acute kidney', case=False, na=False) & _icd10_rows
]

# --- 1b. Discover Item IDs for ALL Lab Panels ---
# Human-readable test names for each panel; these are turned into a regex and
# matched against d_labitems labels further down.
cbc_test_names = [
    'Hematocrit',
    'Platelet Count',
    'White Blood Cell Count',
    'Red Blood Cell Count',
    'Hemoglobin',
]
cmp_test_names = [
    'Creatinine',
    'Bicarbonate',
    'Glucose',
    'Potassium',
    'Urea Nitrogen',
    'Anion Gap',
    'Lactate',
    'Aspartate Aminotransferase',
    'Bilirubin',
    'Chloride',
    'Albumin',
    'Sodium',
]
abg_test_names = [
    'Base Excess',
    'pH',
    'Oxygen Saturation',
    'Inspired O2 Fraction',
    'Arterial Pressure',
]
aptt_test_names = [
    'PTT',
    'INR',
    'Prothrombin Time',
]

def find_item_ids(df, test_names):
    """Return the unique itemids of blood tests whose label mentions any test name.

    Args:
        df: DataFrame with at least the columns 'label', 'fluid' and 'itemid'.
        test_names: iterable of test names; each is matched literally,
            case-insensitively and on word boundaries, anywhere in the label.

    Returns:
        A list of unique 'itemid' values (in order of first appearance) for
        rows where the label matches and 'fluid' equals 'Blood'. Returns an
        empty list when `test_names` is empty.
    """
    import re  # local import keeps this cell independent of import order elsewhere

    # Escape each name so characters such as '(' or '+' are matched literally
    # instead of being interpreted as regex syntax.
    escaped_names = [re.escape(str(name)) for name in test_names]
    if not escaped_names:
        # An empty alternation would build r'\b()\b', which matches almost
        # every label; nothing to search for means nothing found.
        return []

    # Non-capturing group: a capturing group makes pandas emit
    # "UserWarning: This pattern ... has match groups" from str.contains.
    regex_pattern = r'\b(?:' + '|'.join(escaped_names) + r')\b'

    label_matches = df['label'].str.contains(regex_pattern, case=False, na=False)
    is_blood = df['fluid'] == 'Blood'
    return df.loc[label_matches & is_blood, 'itemid'].unique().tolist()

# Resolve every panel's test names to the matching blood-test itemids.
DISCOVERED_CBC_ITEMIDS = find_item_ids(d_labitems_df, cbc_test_names)
DISCOVERED_CMP_ITEMIDS = find_item_ids(d_labitems_df, cmp_test_names)
DISCOVERED_ABG_ITEMIDS = find_item_ids(d_labitems_df, abg_test_names)
DISCOVERED_APTT_ITEMIDS = find_item_ids(d_labitems_df, aptt_test_names)

# Diagnosis codes are hard-coded here rather than derived from the discovery
# dataframes above; ideally they would be filled in from that output, though a
# pre-vetted list is also a defensible choice.
DISCOVERED_SEPSIS_CODES = ['A419', 'R6520', 'R6521']
DISCOVERED_AKI_CODES = ['N170', 'N171', 'N172', 'N179']

# Report how many itemids each panel resolved to, followed by the ids.
print("\n---  Discovered Item IDs for All Panels ---")
for _panel_label, _panel_ids in (
    ("CBC", DISCOVERED_CBC_ITEMIDS),
    ("CMP", DISCOVERED_CMP_ITEMIDS),
    ("ABG", DISCOVERED_ABG_ITEMIDS),
    ("APTT", DISCOVERED_APTT_ITEMIDS),
):
    print(f"{_panel_label} Item IDs ({len(_panel_ids)} found): {_panel_ids}")
print("\n---  Part 1: Discovery Phase Complete ---")


--- Starting Part 1: Discovery Phase (Comprehensive) ---
✅ Successfully loaded d_labitems.csv and d_icd_diagnoses.csv.

--- Confirming column names from dictionary files ---
Columns in d_icd_diagnoses.csv: ['icd_code', 'icd_version', 'long_title']
Columns in d_labitems.csv: ['itemid', 'label', 'fluid', 'category']

--- ✅ Discovered Item IDs for All Panels ---
CBC Item IDs (29 found): [50810, 50811, 50852, 50855, 51212, 51221, 51222, 51223, 51224, 51225, 51265, 51285, 51631, 51638, 51639, 51640, 51641, 51642, 51643, 51644, 51645, 51646, 51647, 52028, 52032, 52128, 52129, 52157, 53189]
CMP Item IDs (40 found): [50803, 50806, 50809, 50813, 50822, 50824, 50862, 50868, 50882, 50883, 50884, 50885, 50902, 50912, 50931, 50954, 50971, 50983, 51006, 51568, 51569, 51570, 52022, 52024, 52027, 52434, 52442, 52452, 52455, 52500, 52535, 52546, 52569, 52610, 52623, 52647, 53085, 53089, 53138, 53154]
ABG Item IDs (3 found): [50802, 50817, 50820]
APTT Item IDs (7 found): [51237, 51275, 51675, 52165, 52

  return df[df['label'].str.contains(regex_pattern, case=False, na=False) & (df['fluid'] == 'Blood')]['itemid'].unique().tolist()


In [None]:

# ==============================================================================
# PART 2: MAIN ANALYSIS USING ALL DISCOVERED CODES
# ==============================================================================

print("\n--- Starting Part 2: Analysis Phase (Complete) ---")

# Diagnosis codes and lab itemids used by the analysis below. These literal
# lists replace whatever values the same names were given in earlier cells.
DISCOVERED_SEPSIS_CODES = ['A021', 'A227', 'A267', 'A327', 'A400', 'A401', 'A403', 'A408', 'A409', 'A4101', 'A4102', 'A411', 'A412', 'A413', 'A414', 'A4150', 'A4151', 'A4152', 'A4153', 'A4159', 'A4181', 'A4189', 'A419', 'A427', 'A5486', 'B377', 'O0337', 'O0387', 'O0487', 'O0737', 'O0882', 'O85', 'O8604', 'P360', 'P3610', 'P3619', 'P362', 'P3630', 'P3639', 'P364', 'P365', 'P368', 'P369', 'R6520', 'R6521', 'T8144', 'T8144XA', 'T8144XD', 'T8144XS']
DISCOVERED_AKI_CODES = ['N170', 'N171', 'N172', 'N178', 'N179', 'O904']

DISCOVERED_CBC_ITEMIDS = [50811, 51221, 51222, 51265, 51638, 51639, 51640, 52028, 53189]
DISCOVERED_CMP_ITEMIDS = [50809, 50813, 50868, 50882, 50912, 50931, 50971, 51006, 52442, 52500, 52546, 52569, 52610, 52647, 53154]
DISCOVERED_ABG_ITEMIDS = [50802, 50817, 50820]
DISCOVERED_APTT_ITEMIDS = [51237, 51275, 51675, 52165, 52166, 52167, 52923]

# Echo every list with its size so the run log records exactly what was used.
print("Using the following codes and IDs for analysis:")
for _list_label, _list_values in (
    ("Sepsis Codes", DISCOVERED_SEPSIS_CODES),
    ("AKI Codes", DISCOVERED_AKI_CODES),
    ("CBC Item IDs", DISCOVERED_CBC_ITEMIDS),
    ("CMP Item IDs", DISCOVERED_CMP_ITEMIDS),
    ("ABG Item IDs", DISCOVERED_ABG_ITEMIDS),
    ("APTT Item IDs", DISCOVERED_APTT_ITEMIDS),
):
    print(f"{_list_label} ({len(_list_values)}): {_list_values}")

# --- Build Patient Cohorts ---
print("\n--- Building Patient Cohorts ---")
try:
    # Read icd_code as text so every code is compared as a string against the
    # code lists, regardless of how pandas would otherwise infer the column.
    diagnoses_df = pd.read_csv(
        DRIVE_PATH + 'diagnoses_icd.csv',
        usecols=['subject_id', 'icd_code', 'icd_version'],
        dtype={'icd_code': str},
    )
except FileNotFoundError as e:
    print(f"❌ ERROR: Could not find diagnoses_icd.csv. Details: {e}")
    # Re-raise instead of calling exit(): inside an IPython/Colab kernel,
    # exit() requests a kernel shutdown (losing all notebook state) rather
    # than raising, so it does not halt the cell at this point.
    raise

# Keep ICD-10 rows only; the code lists below are ICD-10.
# Stripping guards against whitespace-padded codes, which would otherwise
# silently fail the exact-match isin() lookups (assumption: some exports pad
# this column -- harmless when they do not).
diagnoses_df = (
    diagnoses_df[diagnoses_df['icd_version'] == 10]
    .assign(icd_code=lambda d: d['icd_code'].str.strip())
)

# A patient belongs to a cohort if any of their diagnoses carries one of the
# cohort's codes; cohorts are represented as arrays of unique subject_ids.
sepsis_patients_df = diagnoses_df[diagnoses_df['icd_code'].isin(DISCOVERED_SEPSIS_CODES)]
sepsis_subject_ids = sepsis_patients_df['subject_id'].unique()
print(f"Successfully built Sepsis cohort with {len(sepsis_subject_ids)} unique patients.")

aki_patients_df = diagnoses_df[diagnoses_df['icd_code'].isin(DISCOVERED_AKI_CODES)]
aki_subject_ids = aki_patients_df['subject_id'].unique()
print(f"Successfully built AKI cohort with {len(aki_subject_ids)} unique patients.")

# --- The robust analysis function ---
def analyze_cohort_missingness(cohort_name, subject_ids, panel_name, itemids, labevents_filepath, chunk_size=1000000):
    """Measure how often a lab panel is missing for a patient cohort.

    The lab-events CSV is streamed in chunks and reduced to the rows that
    belong to the cohort and to the panel. Two figures are then reported:

    * Panel-level missingness: share of cohort patients with no row at all
      for any of the panel's itemids.
    * Sporadic missingness: share of the matching rows whose 'valuenum' is null.

    Args:
        cohort_name: label used in the printed report.
        subject_ids: collection of subject_id values forming the cohort;
            duplicates are ignored.
        panel_name: label used in the printed report.
        itemids: collection of lab itemids that make up the panel.
        labevents_filepath: path to a CSV with at least the columns
            'subject_id', 'itemid' and 'valuenum'.
        chunk_size: number of CSV rows read per chunk.

    Returns:
        None if the cohort is empty or the file cannot be found. Otherwise a
        dict with the keys 'cohort', 'panel', 'n_patients',
        'n_patients_with_labs', 'panel_level_missingness', 'n_measurements',
        'n_null_values' and 'sporadic_missingness'. When no matching row
        exists, panel-level missingness is 1.0 and sporadic missingness is NaN
        (there are no measurements to take a share of).
    """
    print(f"\n--- Starting Missingness Analysis for {cohort_name} Cohort and {panel_name} Panel ---")
    if len(subject_ids) == 0:
        print("Cohort contains no patients. Skipping analysis.")
        return None

    # De-duplicate so the denominator counts patients, not repeated ids.
    cohort_ids = pd.Series(list(subject_ids)).unique()
    panel_itemids = list(itemids)
    n_patients = len(cohort_ids)

    relevant_labs_chunks = []
    try:
        # The context manager closes the underlying file handle even if
        # processing a chunk raises.
        with pd.read_csv(
            labevents_filepath,
            chunksize=chunk_size,
            low_memory=False,
            usecols=['subject_id', 'itemid', 'valuenum'],
        ) as labevents_iterator:
            print(f"Processing {labevents_filepath}... (This may take several minutes)")
            for i, chunk in enumerate(labevents_iterator):
                in_cohort = chunk['subject_id'].isin(cohort_ids)
                in_panel = chunk['itemid'].isin(panel_itemids)
                filtered_chunk = chunk[in_cohort & in_panel]
                if not filtered_chunk.empty:
                    relevant_labs_chunks.append(filtered_chunk)
                print(f"  Processed chunk {i+1}...")
    except FileNotFoundError as e:
        print(f"❌ ERROR: Could not find {labevents_filepath}. Details: {e}")
        return None

    if not relevant_labs_chunks:
        print(f"Result: No lab events found for the {panel_name} panel in the {cohort_name} cohort.")
        # No patient has any panel result, so the panel is missing for everyone.
        return {
            'cohort': cohort_name,
            'panel': panel_name,
            'n_patients': n_patients,
            'n_patients_with_labs': 0,
            'panel_level_missingness': 1.0,
            'n_measurements': 0,
            'n_null_values': 0,
            'sporadic_missingness': float('nan'),
        }

    relevant_labs_df = pd.concat(relevant_labs_chunks)

    # Panel-level missingness: cohort patients without a single panel row.
    patients_with_labs = relevant_labs_df['subject_id'].unique()
    panel_level_missingness = 1 - (len(patients_with_labs) / n_patients)

    # Sporadic missingness: matching rows whose numeric value is null.
    # relevant_labs_df is guaranteed non-empty here (only non-empty chunks
    # are collected), so the division is safe.
    n_measurements = len(relevant_labs_df)
    n_null_values = int(relevant_labs_df['valuenum'].isnull().sum())
    sporadic_missingness = n_null_values / n_measurements

    print(f"\n--- Results for {cohort_name} Cohort, {panel_name} Panel ---")
    print(f"Total patients in cohort: {n_patients}")
    print(f"Patients with at least one panel test result: {len(patients_with_labs)}")
    print(f"==> Panel-Level Missingness: {panel_level_missingness:.2%}")
    print(f"\nTotal measurements found for this panel: {n_measurements}")
    print(f"Number of null/missing measurement values: {n_null_values}")
    print(f"==> Sporadic Missingness: {sporadic_missingness:.2%}")

    return {
        'cohort': cohort_name,
        'panel': panel_name,
        'n_patients': n_patients,
        'n_patients_with_labs': len(patients_with_labs),
        'panel_level_missingness': panel_level_missingness,
        'n_measurements': n_measurements,
        'n_null_values': n_null_values,
        'sporadic_missingness': sporadic_missingness,
    }


# --- Run the FULL analysis for both cohorts and ALL FOUR panels ---
labevents_filepath = DRIVE_PATH + 'labevents.csv'

# Panels are analysed in this fixed order for every cohort.
_panels = (
    ("CBC", DISCOVERED_CBC_ITEMIDS),
    ("CMP", DISCOVERED_CMP_ITEMIDS),
    ("ABG", DISCOVERED_ABG_ITEMIDS),
    ("APTT", DISCOVERED_APTT_ITEMIDS),
)
_cohorts = (
    ("Sepsis", sepsis_subject_ids),
    ("AKI", aki_subject_ids),
)
_banner_rule = "=" * 50

# Each cohort gets a banner followed by one missingness report per panel.
# Every call streams the lab-events file again from the start.
for _cohort_name, _cohort_ids in _cohorts:
    print("\n" + _banner_rule + f"\nPERFORMING {_cohort_name.upper()} COHORT ANALYSIS\n" + _banner_rule)
    for _panel_name, _panel_itemids in _panels:
        analyze_cohort_missingness(_cohort_name, _cohort_ids, _panel_name, _panel_itemids, labevents_filepath)

print("\n---  Analysis Complete ---")


--- Starting Part 2: Analysis Phase (Complete) ---
Using the following codes and IDs for analysis:
Sepsis Codes (49): ['A021', 'A227', 'A267', 'A327', 'A400', 'A401', 'A403', 'A408', 'A409', 'A4101', 'A4102', 'A411', 'A412', 'A413', 'A414', 'A4150', 'A4151', 'A4152', 'A4153', 'A4159', 'A4181', 'A4189', 'A419', 'A427', 'A5486', 'B377', 'O0337', 'O0387', 'O0487', 'O0737', 'O0882', 'O85', 'O8604', 'P360', 'P3610', 'P3619', 'P362', 'P3630', 'P3639', 'P364', 'P365', 'P368', 'P369', 'R6520', 'R6521', 'T8144', 'T8144XA', 'T8144XD', 'T8144XS']
AKI Codes (6): ['N170', 'N171', 'N172', 'N178', 'N179', 'O904']
CBC Item IDs (9): [50811, 51221, 51222, 51265, 51638, 51639, 51640, 52028, 53189]
CMP Item IDs (15): [50809, 50813, 50868, 50882, 50912, 50931, 50971, 51006, 52442, 52500, 52546, 52569, 52610, 52647, 53154]
ABG Item IDs (3): [50802, 50817, 50820]
APTT Item IDs (7): [51237, 51275, 51675, 52165, 52166, 52167, 52923]

--- Building Patient Cohorts ---
Successfully built Sepsis cohort with 11321