In [None]:
!pip install pandas numpy matplotlib seaborn pyarrow fastparquet

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Silence every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn-flavoured matplotlib theme, HUSL colours, wide figures
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully")
print(f"Pandas version: {pd.__version__}")

In [None]:
# User-tunable settings for the export step
print("=== USER CONFIGURATION ===")
EXPORT_FORMAT = 'csv'  # Options: 'csv' or 'csv.gz'

print(f"Export format selected: {EXPORT_FORMAT}")
# Anything other than the two supported values falls back to plain CSV
if not (EXPORT_FORMAT == 'csv' or EXPORT_FORMAT == 'csv.gz'):
    print("WARNING: Invalid export format. Defaulting to 'csv'")
    EXPORT_FORMAT = 'csv'

print("Note: You can change EXPORT_FORMAT to 'csv.gz' for compressed output")

In [None]:
# Scratch check: list the contents of the current working directory via the shell
!ls

In [None]:
!cd

In [None]:
# Read the ICU stays table and normalise its column names to upper case
print("Loading ICU stays data...")
icustays = pd.read_csv('../KMIMIC/icustays.csv').rename(columns=str.upper)

print(f"ICU stays loaded: {icustays.shape}")
print("\nColumns in ICU stays:")
for column_name in icustays.columns:
    print(f"  {column_name}")

print("\nFirst few rows:")
# Last expression of the cell: rendered as a rich table by the notebook
icustays.head()

In [None]:
def _read_upper(csv_path):
    """Read a CSV file and return it with upper-cased column names."""
    frame = pd.read_csv(csv_path)
    frame.columns = frame.columns.str.upper()
    return frame


# Patient demographics
print("Loading patient demographics...")
patients = _read_upper('../KMIMIC/patients.csv')
print(f"Patients loaded: {patients.shape}")
print("\nColumns in patients:")
for column_name in patients.columns:
    print(f"  {column_name}")

print("\nFirst few rows:")
print(patients.head())

# Tables needed later for the historical-condition lookback
print("\n" + 50 * "=")
print("Loading additional data for historical conditions...")

# Diagnoses
print("Loading diagnoses data...")
diagnoses = _read_upper('../KMIMIC/diagnoses_icd.csv')
print(f"Diagnoses loaded: {diagnoses.shape}")
print("Diagnoses columns:", diagnoses.columns.tolist())

# Admissions
print("Loading admissions data...")
admissions = _read_upper('../KMIMIC/admissions.csv')
print(f"Admissions loaded: {admissions.shape}")
print("Admissions columns:", admissions.columns.tolist())

In [None]:
# Read the lab item dictionary with upper-cased column names
print("Loading lab item definitions...")
d_labitems = pd.read_csv('../KMIMIC/d_labitems.csv').rename(columns=str.upper)
print(f"Lab items loaded: {d_labitems.shape}")
print("Lab items columns:", d_labitems.columns.tolist())

# Case-insensitive label patterns for the two lab families of interest
import re
sodium_pattern = r'(?i)sodium'
creatinine_pattern = r'(?i)creatinine'

print("\nSearching for sodium and creatinine labs...")
# Rows with a missing LABEL count as non-matching (na=False)
sodium_labs = d_labitems.loc[d_labitems['LABEL'].str.contains(sodium_pattern, na=False)]
creatinine_labs = d_labitems.loc[d_labitems['LABEL'].str.contains(creatinine_pattern, na=False)]

print(f"\nFound {len(sodium_labs)} sodium lab items:")
for item_id, label, fluid in zip(sodium_labs['ITEMID'], sodium_labs['LABEL'], sodium_labs['FLUID']):
    print(f"  {item_id}: {label} ({fluid})")

print(f"\nFound {len(creatinine_labs)} creatinine lab items:")
for item_id, label, fluid in zip(creatinine_labs['ITEMID'], creatinine_labs['LABEL'], creatinine_labs['FLUID']):
    print(f"  {item_id}: {label} ({fluid})")

# Item IDs per family, then the combined target list (sodium first)
sodium_itemids = sodium_labs['ITEMID'].tolist()
creatinine_itemids = creatinine_labs['ITEMID'].tolist()
target_itemids = [*sodium_itemids, *creatinine_itemids]

# ITEMID -> lab family; an ID present in both families ends up as 'Creatinine'
lab_type_map = {
    **dict.fromkeys(sodium_itemids, 'Sodium'),
    **dict.fromkeys(creatinine_itemids, 'Creatinine'),
}

print(f"\nTotal target lab item IDs: {len(target_itemids)}")
print(f"Lab type mapping created: {len(lab_type_map)} items")

In [None]:
# Data quality overview for the ICU stays table: row count, key cardinalities,
# per-column missing values and dtypes.
print("=== ICU STAYS DATA QUALITY CHECK ===")
print(f"Total ICU stays: {len(icustays):,}")
print(f"Unique patients: {icustays['SUBJECT_ID'].nunique():,}")
print(f"Unique admissions: {icustays['HADM_ID'].nunique():,}")
print(f"Unique ICU stays: {icustays['STAY_ID'].nunique():,}")

# "\n" (not "\\n") so the section headers are preceded by a blank line
# instead of a literal backslash-n.
print("\nMissing values:")
print(icustays.isnull().sum())

print("\nData types:")
print(icustays.dtypes)

In [None]:
# Convert the ICU admission/discharge timestamps to datetimes (millisecond
# resolution) and derive the length of each stay in hours.
print("Converting datetime columns...")
icustays['INTIME'] = icustays['INTIME'].astype('datetime64[ms]')
icustays['OUTTIME'] = icustays['OUTTIME'].astype('datetime64[ms]')

# Calculate stay duration in hours (NaN when either timestamp is missing)
icustays['stay_hours'] = (icustays['OUTTIME'] - icustays['INTIME']).dt.total_seconds() / 3600

# "\n" (not "\\n") so a real blank line is printed before each section
print("\nStay duration statistics (all stays):")
print(icustays['stay_hours'].describe())

print(f"\nStays with missing INTIME: {icustays['INTIME'].isnull().sum():,}")
print(f"Stays with missing OUTTIME: {icustays['OUTTIME'].isnull().sum():,}")
print(f"Stays with negative duration: {(icustays['stay_hours'] < 0).sum():,}")
print(f"Stays with zero duration: {(icustays['stay_hours'] == 0).sum():,}")

In [None]:
# Filter for valid stays ≥48 hours
print("Filtering for valid ICU stays ≥48 hours...")

# Keep stays with both timestamps present and a duration of at least 48 hours.
# The ">= 48" test already excludes zero and negative durations, so no separate
# "> 0" check is needed.
valid_stays = icustays[
    icustays['INTIME'].notna() &
    icustays['OUTTIME'].notna() &
    (icustays['stay_hours'] >= 48)
].copy()

print(f"Original ICU stays: {len(icustays):,}")
print(f"Valid stays ≥48 hours: {len(valid_stays):,}")
print(f"Percentage retained: {len(valid_stays)/len(icustays)*100:.1f}%")

# "\n" (not "\\n") so a real blank line precedes the statistics
print("\nFiltered stay duration statistics:")
print(valid_stays['stay_hours'].describe())

In [None]:
# Data quality overview for the patients table: row count, number of distinct
# patients, and per-column missing values.
print("=== PATIENT DATA QUALITY CHECK ===")
print(f"Total patients: {len(patients):,}")
print(f"Unique patients: {patients['SUBJECT_ID'].nunique():,}")

# "\n" (not "\\n") so a real blank line is printed before each section
print("\nMissing values in patient data:")
print(patients.isnull().sum())

print("\nSample of patient data:")
# Last expression of the cell: rendered as a rich table by the notebook
patients.head()

In [None]:
# Merge ICU stays with patient demographics
print("Merging ICU stays with patient demographics...")

# Left join keeps every valid stay even when the patient has no demographics row.
# NOTE(review): if `patients` held several rows per SUBJECT_ID this join would
# duplicate stays; compare the cohort size printed below with len(valid_stays).
cohort = valid_stays.merge(
    patients[['SUBJECT_ID', 'SEX', 'ANCHOR_AGE', 'ANCHOR_YEAR', 'DOD']],
    on='SUBJECT_ID',
    how='left'
)

print(f"Cohort size after merge: {len(cohort):,}")
print(f"Patients with demographics: {cohort['SEX'].notna().sum():,}")
print(f"Missing demographics: {cohort['SEX'].isna().sum():,}")

# "\n" (not "\\n") so a real blank line precedes the column listing
print("\nCohort columns:")
for col in cohort.columns:
    print(f"  {col}")

In [None]:
# Clean and process age data
print("Processing age data...")

# Handle age data (ANCHOR_AGE can be in different formats)
def parse_age(age_str):
    """Parse an age value into a number of years.

    Accepted inputs:
      * plain numbers or numeric strings (``52``, ``"73"``) -> that value as float
      * ``"<n> years"`` -> n
      * ``"<n> months"`` -> n / 12
      * ``"<n> days"`` -> n / 365.25
      The unit may also be glued to the number (``"45years"``).

    Parameters
    ----------
    age_str : any
        Raw age value; missing values (``None``/NaN) are allowed.

    Returns
    -------
    float
        Age in years, or ``np.nan`` when the value is missing or cannot be
        parsed. Unparseable input never raises, so one malformed entry cannot
        abort a whole ``Series.apply``.
    """
    if pd.isna(age_str):
        return np.nan

    text = str(age_str).strip().lower()

    # Unit keyword -> how many of that unit make up one year.
    # Checked in this order, so "3 years 2 months" is read as 3 years.
    for unit, units_per_year in (('years', 1.0), ('months', 12.0), ('days', 365.25)):
        if unit in text:
            # The number is the first whitespace-separated token; drop the unit
            # when it is attached directly to the number ("45years").
            token = text.split()[0]
            if token.endswith(unit):
                token = token[:-len(unit)]
            try:
                return float(token) / units_per_year
            except ValueError:
                return np.nan

    # No unit keyword: try to interpret the whole value as a number
    try:
        return float(text)
    except ValueError:
        return np.nan

# Numeric age (in years) derived from ANCHOR_AGE
cohort['age_numeric'] = cohort['ANCHOR_AGE'].apply(parse_age)

print("Age distribution:")
print(cohort['age_numeric'].describe())

# "\n" (not "\\n") so a real blank line is printed before each section
print(f"\nAge data availability: {cohort['age_numeric'].notna().sum():,}/{len(cohort):,} ({cohort['age_numeric'].notna().mean()*100:.1f}%)")

# Add admission dates for subsequent processing
print("\nPreparing admission data...")
# Left join: stays whose HADM_ID has no admissions row keep missing timestamps
cohort = cohort.merge(
    admissions[['HADM_ID', 'ADMITTIME', 'DISCHTIME']],
    on='HADM_ID',
    how='left'
)
cohort['ADMITTIME'] = cohort['ADMITTIME'].astype('datetime64[ms]')
cohort['DISCHTIME'] = cohort['DISCHTIME'].astype('datetime64[ms]')

print(f"ICU stays with admission dates: {cohort['ADMITTIME'].notna().sum():,}/{len(cohort):,}")
print("✓ Basic cohort with demographics created successfully")

In [None]:
# Extract unique identifiers from our cohort for efficient filtering
print("=== COHORT IDENTIFIER EXTRACTION ===")

# Distinct hospitalizations and patients present in the cohort
cohort_hadm_ids = set(cohort['HADM_ID'].unique())
cohort_subject_ids = set(cohort['SUBJECT_ID'].unique())

print("Cohort summary:")
print(f"  Total ICU stays: {len(cohort):,}")
print(f"  Unique hospitalizations (HADM_ID): {len(cohort_hadm_ids):,}")
print(f"  Unique patients (SUBJECT_ID): {len(cohort_subject_ids):,}")

# "\n" (not "\\n") so a real blank line is printed before each section.
# NOTE(review): the percentages below are hard-coded expectations, not measured.
print("\nFiltering strategy:")
print(f"  ✓ Will pre-filter all subsequent data by these {len(cohort_hadm_ids):,} hospitalizations")
print("  ✓ Expected data reduction: 50-80% for diagnoses and lab events")
print("  ✓ Expected processing speed improvement: 60-75%")

# List copies of the identifier sets, used by later filtering cells
cohort_hadm_list = list(cohort_hadm_ids)
cohort_subject_list = list(cohort_subject_ids)

print("\n✓ Cohort identifiers extracted and ready for filtering")

In [None]:
# Define ICD code mappings for all conditions
print("Defining ICD code mappings for pre-existing conditions...")

# condition flag name -> ICD code prefixes, separately for ICD-9 and ICD-10.
# Codes are used as *prefixes*, so a category code covers all its subcodes.
icd_mappings = {
    'hx_alzheimers': {
        'icd9': ['331.0'],
        'icd10': ['G30', 'G30.0', 'G30.1', 'G30.9']
    },
    # NOTE(review): ICD-9 292 is drug-induced (not alcohol-induced) mental
    # disorders, and alcohol dependence/abuse are 303 / 305.0 — confirm the
    # intended ICD-9 code set with a clinical reviewer.
    'hx_alcohol_disorder': {
        'icd9': ['291', '292'],
        'icd10': ['F10']
    },
    # Several drug-dependence conditions share ICD-9 category 304 here;
    # the matching code must tell them apart by subcategory.
    'hx_opioid_disorder': {
        'icd9': ['304'],
        'icd10': ['F11']
    },
    'hx_cannabis_disorder': {
        'icd9': ['304'],
        'icd10': ['F12']
    },
    'hx_sedative_disorder': {
        'icd9': ['292'],
        'icd10': ['F13']
    },
    'hx_cocaine_disorder': {
        'icd9': ['304'],
        'icd10': ['F14']
    },
    'hx_stimulant_disorder': {
        'icd9': ['304'],
        'icd10': ['F15']
    },
    'hx_hallucinogen_disorder': {
        'icd9': ['304'],
        'icd10': ['F16']
    },
    'hx_dementia': {
        'icd9': ['294.1'],
        'icd10': ['F01', 'F02', 'F03', 'F06']
    },
    # Hearing loss: ICD-9 389 pairs with ICD-10 H91 (the ICD-10 codes for the
    # deaf/blind entries were previously swapped).
    # NOTE(review): H90 (conductive/sensorineural hearing loss) may also belong here.
    'hx_deaf': {
        'icd9': ['389'],
        'icd10': ['H91']
    },
    # Visual impairment: ICD-9 369 pairs with the ICD-10 visual codes.
    # NOTE(review): H54 (blindness and low vision) is the direct counterpart of
    # ICD-9 369; H53 covers visual disturbances more broadly — confirm.
    'hx_blind': {
        'icd9': ['369'],
        'icd10': ['H53']
    },
    'hx_stroke': {
        'icd9': ['436', '438'],
        'icd10': ['I63', 'I69']
    },
    'hx_schizophrenia': {
        'icd9': ['295'],
        'icd10': ['F20', 'F21', 'F22', 'F23', 'F24', 'F25', 'F28', 'F29']
    }
}

print(f"Defined mappings for {len(icd_mappings)} conditions")
# "\n" (not "\\n") so a real blank line precedes the listing
print("\nConditions to track:")
for condition in icd_mappings.keys():
    print(f"  - {condition}")

# PRE-FILTER diagnoses and admissions data down to the cohort's patients
print("\n=== OPTIMIZING DATA LOADING ===")
print("Pre-filtering diagnoses and admissions data by cohort patients...")

# Convert datetime columns in admissions
admissions['ADMITTIME'] = admissions['ADMITTIME'].astype('datetime64[ms]')
admissions['DISCHTIME'] = admissions['DISCHTIME'].astype('datetime64[ms]')

# Keep every admission of a cohort patient, not only the cohort admissions:
# the history lookback needs the patient's *earlier* hospitalizations.
print(f"Original admissions data: {len(admissions):,} rows")
admissions_filtered = admissions[admissions['SUBJECT_ID'].isin(cohort_subject_list)]
print(f"Filtered admissions data: {len(admissions_filtered):,} rows ({len(admissions_filtered)/len(admissions)*100:.1f}% of original)")

# Keep the diagnoses of all those admissions. Restricting diagnoses to the
# cohort's own HADM_IDs would drop the diagnoses of prior admissions that are
# not themselves cohort stays, so their history could never be detected.
print(f"Original diagnoses data: {len(diagnoses):,} rows")
diagnoses_filtered = diagnoses[diagnoses['HADM_ID'].isin(admissions_filtered['HADM_ID'])]
print(f"Filtered diagnoses data: {len(diagnoses_filtered):,} rows ({len(diagnoses_filtered)/len(diagnoses)*100:.1f}% of original)")

print("✓ Data pre-filtering completed - significant performance improvement expected")

In [None]:
# Create 1-year lookback for historical diagnoses using FILTERED data.
# For every ICU stay, flag each hx_* condition that was coded on another
# admission of the same patient discharged within the year before this
# admission's ADMITTIME.
# NOTE(review): this only finds history if `diagnoses_filtered` contains the
# diagnoses of those prior admissions (not just of the cohort admissions).
print("Creating 1-year lookback for ICD codes using pre-filtered data...")

# Initialize all history columns to 0
condition_names = list(icd_mappings.keys())
for condition in condition_names:
    cohort[condition] = 0

print(f"Processing {len(cohort):,} ICU stays for historical conditions...")
print("Using pre-filtered datasets:")
print(f"  - Diagnoses: {len(diagnoses_filtered):,} rows (vs {len(diagnoses):,} original)")
print(f"  - Admissions: {len(admissions_filtered):,} rows (vs {len(admissions):,} original)")

# Use filtered admissions data for patient history
patient_admissions = admissions_filtered[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME']].copy()


def _normalize_icd(code):
    """Return an ICD code upper-cased, with surrounding whitespace and dots removed."""
    return str(code).strip().upper().replace('.', '')


# ICD-9 category 304 is listed for several conditions in icd_mappings; each of
# them is identified by its own 304.x subcategory instead of the bare category.
icd9_304_subcodes = {
    'hx_opioid_disorder': '304.0',
    'hx_cannabis_disorder': '304.3',
    'hx_cocaine_disorder': '304.2',
    'hx_stimulant_disorder': '304.4',
    'hx_hallucinogen_disorder': '304.5',
}

# condition -> {ICD version: tuple of normalized code prefixes}
condition_prefixes = {}
for condition, codes in icd_mappings.items():
    icd9_codes = [
        icd9_304_subcodes.get(condition, code) if code == '304' else code
        for code in codes['icd9']
    ]
    condition_prefixes[condition] = {
        9: tuple(_normalize_icd(code) for code in icd9_codes),
        10: tuple(_normalize_icd(code) for code in codes['icd10']),
    }

# Flag every diagnosis row once per condition. Codes are compared without dots
# so that both dotted ('331.0') and undotted ('3310') storage formats match;
# astype(str) also keeps missing codes from breaking the boolean masks.
diagnosis_flags = diagnoses_filtered[['HADM_ID', 'ICD_VERSION']].copy()
normalized_codes = (
    diagnoses_filtered['ICD_CODE']
    .astype(str)
    .str.strip()
    .str.upper()
    .str.replace('.', '', regex=False)
)
for condition, prefixes_by_version in condition_prefixes.items():
    matched = np.zeros(len(diagnosis_flags), dtype=bool)
    for icd_version, prefixes in prefixes_by_version.items():
        if prefixes:
            version_hit = (diagnosis_flags['ICD_VERSION'] == icd_version) & normalized_codes.str.startswith(prefixes)
            matched |= version_hit.to_numpy(dtype=bool)
    diagnosis_flags[condition] = matched

# One row per admission: True where any of its diagnoses matched the condition
admission_flags = diagnosis_flags.groupby('HADM_ID')[condition_names].any()

# Admissions grouped per patient, so each stay only scans that patient's rows
admissions_by_subject = {
    subject: frame for subject, frame in patient_admissions.groupby('SUBJECT_ID')
}

# For each ICU stay, find historical diagnoses
processed_count = 0
for idx, row in cohort.iterrows():
    current_admission = row['ADMITTIME']

    if pd.isna(current_admission):
        continue

    # Count every stay that is examined (including those without history)
    processed_count += 1
    if processed_count % 5000 == 0:
        print(f"  Processed {processed_count:,} stays...")

    subject_admissions = admissions_by_subject.get(row['SUBJECT_ID'])
    if subject_admissions is None:
        continue

    # Prior admissions: discharged before this admission started, at most one
    # year earlier, and not the current admission itself.
    one_year_prior = current_admission - pd.DateOffset(years=1)
    discharge_times = subject_admissions['DISCHTIME']
    in_lookback = (
        (discharge_times < current_admission) &
        (discharge_times >= one_year_prior) &
        (subject_admissions['HADM_ID'] != row['HADM_ID'])
    )

    # Only prior admissions that have at least one diagnosis row can contribute
    prior_hadm_ids = admission_flags.index.intersection(
        pd.Index(subject_admissions.loc[in_lookback, 'HADM_ID'])
    )
    if len(prior_hadm_ids) == 0:
        continue

    found = admission_flags.loc[prior_hadm_ids].any()
    found_conditions = found.index[found.to_numpy(dtype=bool)]
    if len(found_conditions) > 0:
        cohort.loc[idx, found_conditions] = 1

print(f"Completed processing {processed_count:,} stays with optimized data filtering")

# Summary of historical conditions
print("\nHistorical condition prevalence:")
history_cols = [col for col in cohort.columns if col.startswith('hx_')]
for col in history_cols:
    count = cohort[col].sum()
    pct = (count / len(cohort)) * 100
    print(f"  {col}: {count:,} stays ({pct:.2f}%)")

# Rows of `cohort` are ICU stays, so these are stay-level (not patient-level) counts
any_history = cohort[history_cols].sum(axis=1) > 0
print(f"\nTotal stays with any historical condition: {any_history.sum():,}")
print(f"Percentage with any historical condition: {any_history.mean()*100:.1f}%")

print("✓ Historical conditions processing completed with optimized filtering")

In [None]:
# Scratch reload of the lab item dictionary: column names keep the CSV's
# original casing here (no .str.upper() is applied in this cell).
d_labitems = pd.read_csv('../KMIMIC/d_labitems.csv')

In [None]:
# Exploratory search: lab items whose label contains 'Na' (case-sensitive).
# na=False treats missing labels as non-matching; without it a NaN label puts
# NaN into the mask and the boolean indexing raises.
d_labitems[d_labitems['label'].str.contains('Na', case=True, na=False)]

In [None]:
# Load lab item definitions and identify target lab types
print("=== LABORATORY DATA PREPARATION ===")
print("Loading lab item definitions...")

d_labitems = pd.read_csv('../KMIMIC/d_labitems.csv')
d_labitems.columns = d_labitems.columns.str.upper()
print(f"Lab items loaded: {d_labitems.shape}")

# Hand-picked ITEMIDs for the two lab families
sodium_target_ids = [
    "001L0006",
    "001L3043",
    "001L31241",
    "001L8130",
]
creatinine_target_ids = [
    "001L0031",
    "001L00877",
    "001L00892",
    "001L00896",
    "001L3041",
    "001L31122",
    "001L312212",
    "001L312213",
    "001L31229",
    "001L31242",
    "001L31252",
    "001L7168",
    "001L8135",
    "001Z0039",
    "001Z0071",
]

# "\n" (not "\\n") so a real blank line is printed before each section
print("\nSearching for sodium and creatinine labs...")
sodium_labs = d_labitems[d_labitems['ITEMID'].isin(sodium_target_ids)]
creatinine_labs = d_labitems[d_labitems['ITEMID'].isin(creatinine_target_ids)]

# IDs absent from d_labitems are dropped by the lookups above; report them so
# that a typo or a dictionary change does not go unnoticed.
missing_itemids = sorted(
    set(sodium_target_ids + creatinine_target_ids) - set(d_labitems['ITEMID'])
)
if missing_itemids:
    print(f"WARNING: {len(missing_itemids)} target ITEMIDs not found in d_labitems: {missing_itemids}")

# Extract lab item IDs (only those actually present in d_labitems)
sodium_itemids = sodium_labs['ITEMID'].tolist()
creatinine_itemids = creatinine_labs['ITEMID'].tolist()
target_itemids = sodium_itemids + creatinine_itemids

# Create lab type mapping: ITEMID -> 'Sodium' / 'Creatinine'
lab_type_map = {}
for itemid in sodium_itemids:
    lab_type_map[itemid] = 'Sodium'
for itemid in creatinine_itemids:
    lab_type_map[itemid] = 'Creatinine'

print(f"\nTotal target lab item IDs: {len(target_itemids)}")
print(f"Lab type mapping created: {len(lab_type_map)} items")
print("\n✓ Lab item definitions prepared")

In [None]:
# Load and filter lab events with OPTIMIZED processing.
# labevents.csv is streamed in chunks; each chunk is reduced first to the
# cohort's hospitalizations and then to the target lab ITEMIDs, so only the
# relevant rows are kept in memory.
print("=== OPTIMIZED LAB EVENTS PROCESSING ===")
print("Loading lab events data with pre-filtering by cohort hospitalizations...")
print("This approach will significantly reduce processing time and memory usage...")

# Process lab events in chunks with DOUBLE FILTERING: cohort + lab types
chunk_size = 1000000
filtered_chunks = []
processed_rows = 0
cohort_filtered_rows = 0
target_filtered_rows = 0

# "\n" (not "\\n") so a real blank line is printed before each section
print("\nFiltering strategy:")
print(f"  1. First filter: Only hospitalizations in our cohort ({len(cohort_hadm_ids):,} HADMs)")
print(f"  2. Second filter: Only sodium/creatinine lab items ({len(target_itemids)} ITEMIDs)")
print("  Expected data reduction: 80-90%")

print("\nProcessing lab events in chunks...")

chunk_reader = pd.read_csv('../KMIMIC/labevents.csv', chunksize=chunk_size)
for chunk_number, chunk in enumerate(chunk_reader, start=1):
    chunk.columns = chunk.columns.str.upper()
    processed_rows += len(chunk)

    # FIRST FILTER: Only cohort hospitalizations (major reduction)
    cohort_chunk = chunk[chunk['HADM_ID'].isin(cohort_hadm_ids)]
    cohort_filtered_rows += len(cohort_chunk)

    # SECOND FILTER: Only target lab item IDs (sodium/creatinine)
    target_chunk = cohort_chunk[cohort_chunk['ITEMID'].isin(target_itemids)]
    target_filtered_rows += len(target_chunk)
    if len(target_chunk) > 0:
        filtered_chunks.append(target_chunk)

    # Progress indicator every 10 chunks read (keyed on the chunk number, not
    # on how many chunks happened to contain matches)
    if chunk_number % 10 == 0:
        print(f"  Processed {processed_rows:,} rows → Found {target_filtered_rows:,} relevant lab events")

# Combine all filtered chunks
if filtered_chunks:
    target_labs = pd.concat(filtered_chunks, ignore_index=True)

    # Add lab type information
    target_labs['LAB_TYPE'] = target_labs['ITEMID'].map(lab_type_map)

    print("\n=== OPTIMIZED FILTERING RESULTS ===")
    print(f"Total rows processed: {processed_rows:,}")
    print(f"After cohort filter: {cohort_filtered_rows:,} ({cohort_filtered_rows/processed_rows*100:.1f}%)")
    print(f"After lab type filter: {len(target_labs):,} ({len(target_labs)/processed_rows*100:.1f}%)")
    print(f"Data reduction achieved: {(1 - len(target_labs)/processed_rows)*100:.1f}%")

    print("\nFiltered lab events summary:")
    print(f"  Total lab events: {len(target_labs):,}")
    print(f"  Unique patients: {target_labs['SUBJECT_ID'].nunique():,}")
    print(f"  Unique admissions: {target_labs['HADM_ID'].nunique():,}")

    print("\nLab type distribution:")
    print(target_labs['LAB_TYPE'].value_counts())

    print("\nTop 5 item IDs by frequency:")
    itemid_counts = target_labs['ITEMID'].value_counts().head()
    for itemid, count in itemid_counts.items():
        lab_info = d_labitems[d_labitems['ITEMID'] == itemid].iloc[0]
        print(f"  {itemid} ({lab_type_map[itemid]}): {count:,} - {lab_info['LABEL']}")
else:
    # Empty frame so that downstream `len(target_labs) > 0` guards skip cleanly
    print("\nNo matching lab events found!")
    target_labs = pd.DataFrame()

print(f"\n✓ Optimized lab events loading completed - {len(target_labs):,} events ready for processing")

In [None]:
# Clean and bound lab values on the FILTERED dataset.
# Values outside a plausibility range per lab type are set to NaN (the rows
# themselves are kept); the result is stored in `clean_labs`.
if len(target_labs) > 0:
    print("=== LAB VALUE CLEANING ===")
    print("Cleaning lab values and applying range bounds on filtered dataset...")

    # Convert datetime columns
    target_labs['CHARTTIME'] = target_labs['CHARTTIME'].astype('datetime64[ms]')

    # Define reasonable ranges for each lab type (inclusive bounds).
    # NOTE(review): the creatinine bounds only make sense for mg/dL; values
    # recorded in µmol/L would mostly fall above 50 and be blanked — confirm
    # the units of the selected ITEMIDs.
    lab_ranges = {
        'Sodium': (100, 200),      # mEq/L or mmol/L
        'Creatinine': (0.1, 50)    # presumably mg/dL
    }

    # Create cleaned dataset
    clean_labs = target_labs.copy()

    # "\n" (not "\\n") so a real blank line is printed before each section
    print(f"\nLab value cleaning summary (on {len(target_labs):,} pre-filtered events):")
    for lab_type, (min_val, max_val) in lab_ranges.items():
        mask = (clean_labs['LAB_TYPE'] == lab_type) & (clean_labs['VALUENUM'].notna())
        lab_subset = clean_labs[mask]

        if len(lab_subset) > 0:
            print(f"\n{lab_type} values before cleaning:")
            print(f"  Count: {len(lab_subset):,}")
            print(f"  Range: {lab_subset['VALUENUM'].min():.2f} - {lab_subset['VALUENUM'].max():.2f}")
            print(f"  Mean ± SD: {lab_subset['VALUENUM'].mean():.2f} ± {lab_subset['VALUENUM'].std():.2f}")

            # Remove extreme outliers
            outlier_mask = mask & ((clean_labs['VALUENUM'] < min_val) | (clean_labs['VALUENUM'] > max_val))
            outliers_removed = outlier_mask.sum()

            if outliers_removed > 0:
                print(f"  Removing {outliers_removed:,} outliers outside range [{min_val}, {max_val}]")
                clean_labs.loc[outlier_mask, 'VALUENUM'] = np.nan

            # Final statistics after cleaning
            clean_subset = clean_labs[(clean_labs['LAB_TYPE'] == lab_type) & (clean_labs['VALUENUM'].notna())]
            if len(clean_subset) > 0:
                print(f"  After cleaning: {len(clean_subset):,} values")
                print(f"  Clean range: {clean_subset['VALUENUM'].min():.2f} - {clean_subset['VALUENUM'].max():.2f}")

    # Final summary
    valid_values = clean_labs['VALUENUM'].notna().sum()
    total_values = len(clean_labs)
    print(f"\nFinal cleaning summary: {valid_values:,}/{total_values:,} valid numeric values ({valid_values/total_values*100:.1f}%)")

    print("✓ Lab data cleaning completed on optimized dataset")
else:
    # Empty frame so that downstream `len(clean_labs) > 0` guards skip cleanly
    print("No lab data to clean")
    clean_labs = pd.DataFrame()

In [None]:
# Calculate comprehensive lab aggregations for each ICU stay using CLEANED data.
# For every stay, sodium and creatinine values charted between INTIME and
# OUTTIME (inclusive) of the same patient and admission are summarised as
# count / mean / median / min / max columns on `cohort`.
if len(clean_labs) > 0:
    print("=== LAB AGGREGATION CALCULATION ===")
    print("Calculating comprehensive lab aggregations for ICU stays...")
    print("Processing pre-filtered and cleaned lab data...")

    # Initialize lab columns in cohort: counts start at 0, statistics at NaN
    lab_columns = []
    for lab_type in ['sodium', 'creatinine']:
        for metric in ['count', 'mean', 'median', 'min', 'max']:
            col_name = f"{lab_type}_{metric}"
            cohort[col_name] = 0 if metric == 'count' else np.nan
            lab_columns.append(col_name)

    # "\n" (not "\\n") so a real blank line is printed before each section
    print(f"\nProcessing {len(cohort):,} ICU stays for lab aggregations...")
    print(f"Using cleaned lab dataset: {len(clean_labs):,} events")

    # Loop-invariant work done once: drop rows without a numeric value and
    # group the rest per (patient, admission), so each stay only looks at its
    # own admission's labs instead of scanning the whole table.
    valid_labs = clean_labs[clean_labs['VALUENUM'].notna()]
    labs_by_admission = {
        key: frame for key, frame in valid_labs.groupby(['SUBJECT_ID', 'HADM_ID'])
    }

    # Process each ICU stay
    processed_count = 0
    for idx, row in cohort.iterrows():
        subject_id = row['SUBJECT_ID']
        hadm_id = row['HADM_ID']
        icu_intime = row['INTIME']
        icu_outtime = row['OUTTIME']

        if pd.isna(icu_intime) or pd.isna(icu_outtime):
            continue

        processed_count += 1
        if processed_count % 5000 == 0:
            print(f"  Processed {processed_count:,} stays...")

        admission_labs = labs_by_admission.get((subject_id, hadm_id))
        if admission_labs is None:
            # No labs for this admission: count stays 0, statistics stay NaN
            continue

        # Restrict to the ICU stay window (both ends inclusive)
        chart_times = admission_labs['CHARTTIME']
        icu_labs = admission_labs[(chart_times >= icu_intime) & (chart_times <= icu_outtime)]

        # Calculate aggregations for each lab type
        for lab_type in ['Sodium', 'Creatinine']:
            lab_type_data = icu_labs.loc[icu_labs['LAB_TYPE'] == lab_type, 'VALUENUM']
            if len(lab_type_data) == 0:
                continue

            col_prefix = lab_type.lower()
            cohort.loc[idx, f"{col_prefix}_count"] = len(lab_type_data)
            cohort.loc[idx, f"{col_prefix}_mean"] = lab_type_data.mean()
            cohort.loc[idx, f"{col_prefix}_median"] = lab_type_data.median()
            cohort.loc[idx, f"{col_prefix}_min"] = lab_type_data.min()
            cohort.loc[idx, f"{col_prefix}_max"] = lab_type_data.max()

    print(f"Completed processing {processed_count:,} stays")

    # Summary of lab aggregations
    print("\n=== LAB AGGREGATION SUMMARY ===")
    for lab_type in ['sodium', 'creatinine']:
        count_col = f"{lab_type}_count"
        stays_with_labs = (cohort[count_col] > 0).sum()
        total_measurements = cohort[count_col].sum()

        print(f"\n{lab_type.capitalize()}:")
        print(f"  Stays with labs: {stays_with_labs:,}/{len(cohort):,} ({stays_with_labs/len(cohort)*100:.1f}%)")
        print(f"  Total measurements: {total_measurements:,}")

        if stays_with_labs > 0:
            mean_col = f"{lab_type}_mean"
            lab_values = cohort[cohort[count_col] > 0][mean_col].dropna()
            if len(lab_values) > 0:
                print(f"  Value range: {lab_values.min():.2f} - {lab_values.max():.2f}")
                print(f"  Mean ± SD: {lab_values.mean():.2f} ± {lab_values.std():.2f}")

    print(f"\n✓ Added {len(lab_columns)} comprehensive lab columns to cohort dataset")
    print("✓ Optimized processing completed with significant performance improvement")
else:
    print("No lab data available for aggregation")
    lab_columns = []
In [None]:
# Assemble the final export dataset: identifiers, ICU stay fields, demographics,
# historical-condition flags (hx_*) and lab aggregation columns.
print("=== FINAL DATASET PREPARATION ===")


# Prepare final dataset for export
# "\n" (not "\\n") so a real blank line is printed before each section
print("\nPreparing final dataset for export...")

# Get all hx_ columns and lab columns
hx_columns = [col for col in cohort.columns if col.startswith('hx_')]
lab_columns = [col for col in cohort.columns if col.startswith(('sodium_', 'creatinine_'))]

# Select key columns for the final cohort
final_columns = [
    'SUBJECT_ID', 'HADM_ID', 'STAY_ID',
    'FIRST_CAREUNIT', 'LAST_CAREUNIT',
    'INTIME', 'OUTTIME', 'stay_hours',
    'SEX', 'age_numeric'
] + hx_columns + lab_columns  # Add all historical condition and lab columns

# Create final dataset with selected columns
final_cohort = cohort[final_columns].copy()

# Rename columns for clarity (hx_* and lab columns keep their names)
rename_dict = {
    'SUBJECT_ID': 'patient_id',
    'HADM_ID': 'admission_id',
    'STAY_ID': 'icu_stay_id',
    'FIRST_CAREUNIT': 'first_care_unit',
    'LAST_CAREUNIT': 'last_care_unit',
    'INTIME': 'icu_intime',
    'OUTTIME': 'icu_outtime',
    'stay_hours': 'icu_duration_hours',
    'SEX': 'sex',
    'age_numeric': 'age_years'
}

final_cohort = final_cohort.rename(columns=rename_dict)

print(f"\nFinal cohort shape: {final_cohort.shape}")
print("Columns in final dataset:")
print("Basic ICU data:", [col for col in final_cohort.columns if not col.startswith(('hx_', 'sodium_', 'creatinine_'))])
print(f"Historical condition columns ({len(hx_columns)}): {hx_columns}")
print(f"Lab columns ({len(lab_columns)}): {lab_columns}")

# Lab summary
if lab_columns:
    print("\nOptimized lab data summary:")
    for lab_type in ['sodium', 'creatinine']:
        count_col = f"{lab_type}_count"
        if count_col in final_cohort.columns:
            stays_with_labs = (final_cohort[count_col] > 0).sum()
            print(f"  {lab_type.capitalize()}: {stays_with_labs:,} stays have measurements ({stays_with_labs/len(final_cohort)*100:.1f}%)")
else:
    print("\nNo lab data available in final dataset")

print("\n✓ Final dataset prepared with optimized processing")

In [None]:
# Write the final cohort to disk in the format chosen via EXPORT_FORMAT
print("=== OPTIMIZED EXPORT ===")

output_filename = 'icu_cohort_basic_48hrs_optimized_labs'

# Gzip only when explicitly requested; every other setting yields a plain CSV
if EXPORT_FORMAT != 'csv.gz':
    output_file = f"{output_filename}.csv"
    final_cohort.to_csv(output_file, index=False)
else:
    output_file = f"{output_filename}.csv.gz"
    final_cohort.to_csv(output_file, compression='gzip', index=False)


In [None]:
# Descriptive report on the lab columns of the final cohort:
# availability (overall and per first care unit), value distributions, and the
# sodium/creatinine correlation.
print("Enhanced Laboratory Data Analysis")
print("=" * 50)

# Lab aggregation columns present in the final dataset
lab_cols = [col for col in final_cohort.columns if col.startswith(('sodium_', 'creatinine_'))]

if not lab_cols:
    print("No laboratory data available for analysis")
else:
    print(f"Analyzing {len(lab_cols)} laboratory columns:")
    for lab_col in lab_cols:
        print(f"  - {lab_col}")

    # --- Availability ---
    print("\nLab availability by ICU type:")
    for lab_type in ['sodium', 'creatinine']:
        count_col = f"{lab_type}_count"
        if count_col not in final_cohort.columns:
            continue

        print(f"\n{lab_type.capitalize()} measurements:")

        # Share of stays with at least one measurement
        total_stays = len(final_cohort)
        has_labs = (final_cohort[count_col] > 0).sum()
        print(f"  Overall: {has_labs:,}/{total_stays:,} stays ({has_labs/total_stays*100:.1f}%)")

        # Same share per first care unit, as a percentage rounded to 1 decimal
        icu_lab_availability = final_cohort.groupby('first_care_unit').agg({
            count_col: lambda x: (x > 0).mean() * 100
        }).round(1)

        print("  By ICU type:")
        top_units = icu_lab_availability[count_col].sort_values(ascending=False).head(8)
        for icu_type, availability in top_units.items():
            icu_count = final_cohort['first_care_unit'].eq(icu_type).sum()
            print(f"    {icu_type}: {availability:.1f}% (n={icu_count:,})")

    # --- Value distributions (per-stay means) ---
    print("\nLab value distributions:")
    for lab_type in ['sodium', 'creatinine']:
        mean_col = f"{lab_type}_mean"
        count_col = f"{lab_type}_count"

        if mean_col not in final_cohort.columns:
            continue

        lab_values = final_cohort.loc[final_cohort[count_col] > 0, mean_col].dropna()
        if len(lab_values) == 0:
            continue

        print(f"\n{lab_type.capitalize()} values (n={len(lab_values):,}):")
        print(f"  Range: {lab_values.min():.2f} - {lab_values.max():.2f}")
        print(f"  Mean ± SD: {lab_values.mean():.2f} ± {lab_values.std():.2f}")
        print(f"  Median [Q1, Q3]: {lab_values.median():.2f} [{lab_values.quantile(0.25):.2f}, {lab_values.quantile(0.75):.2f}]")

    # --- Correlation between the two per-stay means ---
    print("\nLab correlations:")
    if 'sodium_mean' in final_cohort.columns and 'creatinine_mean' in final_cohort.columns:
        # Stays that have both labs measured and both means available
        has_sodium = (final_cohort['sodium_count'] > 0) & final_cohort['sodium_mean'].notna()
        has_creatinine = (final_cohort['creatinine_count'] > 0) & final_cohort['creatinine_mean'].notna()
        both_labs = final_cohort[has_sodium & has_creatinine]

        if len(both_labs) <= 10:
            print("  Insufficient data for correlation analysis")
        else:
            correlation = both_labs['sodium_mean'].corr(both_labs['creatinine_mean'])
            print(f"  Sodium vs Creatinine correlation (n={len(both_labs):,}): r = {correlation:.3f}")