In [None]:
# Import required libraries
import pandas as pd
import duckdb
import numpy as np
from pathlib import Path
import time
import warnings
warnings.filterwarnings('ignore')
# Open a DuckDB connection with default settings and report the environment.
conn = duckdb.connect()
for status_line in (
    "DuckDB connection established",
    "Libraries imported successfully",
    f"DuckDB version: {duckdb.__version__}",
):
    print(status_line)

In [None]:
# Connection + input locations used by the rest of the notebook.
# Note: this rebinds `conn`, replacing the connection opened in the import cell.
conn = duckdb.connect()
print("DuckDB connection established")

# Input files. The variable names follow MIMIC table names, but the two
# KMIMIC paths point at the eMAR exports (emar.csv / emar_detail.csv).
cohort_file = 'icu_cohort_basic_48hrs_optimized_labs.csv'
d_items_file = '../KMIMIC/emar.csv'
inputevents_file = '../KMIMIC/emar_detail.csv'

print("\nFile paths configured:")
for label, path in (
    ("Cohort", cohort_file),
    ("D_ITEMS", d_items_file),
    ("InputEvents", inputevents_file),
):
    print(f"  {label}: {path}")

In [None]:
# Read the ICU cohort CSV into pandas, report load time and size, then make
# the frame queryable from DuckDB under the name `cohort`.
print("Loading ICU cohort dataset...")
start_time = time.time()

cohort_df = pd.read_csv(cohort_file)
print(
    f"Cohort loaded in {time.time() - start_time:.2f}s",
    f"Cohort shape: {cohort_df.shape}",
    f"Columns: {len(cohort_df.columns)}",
    sep="\n",
)

# Expose the DataFrame to SQL
conn.register('cohort', cohort_df)
print("\nCohort registered with DuckDB")

# Headline counts: rows, distinct patients, distinct admissions
print("\nCohort Summary:")
print(f"  Total ICU stays: {len(cohort_df):,}")
for label, id_column in (
    ("Unique patients", 'patient_id'),
    ("Unique admissions", 'admission_id'),
):
    print(f"  {label}: {cohort_df[id_column].nunique():,}")

# Preview the identifying / timing columns (last expression -> rich display)
print("\nFirst few rows:")
preview_columns = [
    'patient_id',
    'admission_id',
    'icu_stay_id',
    'icu_intime',
    'icu_outtime',
    'icu_duration_hours',
]
cohort_df[preview_columns].head()

In [None]:
# Target medications (20 in total), grouped by pharmacological category.
# Names are lower-case because they are later used as case-insensitive
# substring search terms and as suffixes of the `med_<name>` feature columns.
medications = dict(
    vasoactives=[
        'angiotensin', 'dobutamine', 'dopamine', 'epinephrine',
        'milrinone', 'norepinephrine', 'phenylephrine', 'vasopressin',
    ],
    sedatives=[
        'dexmedetomidine', 'fentanyl', 'hydromorphone', 'ketamine',
        'lorazepam', 'midazolam', 'morphine', 'pentobarbital', 'propofol',
    ],
    paralytics=[
        'cisatracurium', 'rocuronium', 'vecuronium',
    ],
)

# Flatten the categories into one list, keeping category order
all_medications = [name for group in medications.values() for name in group]

print(f"Target medications to find ITEMIDs for: {len(all_medications)}")
print("\nMedications by category:")
for category, meds in medications.items():
    print(f"  {category.title()}: {meds}")

print(f"\nComplete list: {all_medications}")

In [None]:
# Load the CSV at `d_items_file` into pandas and register it with DuckDB under
# the table name `d_items`, which the search and feature queries below read.
# NOTE(review): `d_items_file` is configured as '../KMIMIC/emar.csv', so despite
# its name this table appears to hold eMAR rows rather than an item
# dictionary — confirm against the data.
d_items = pd.read_csv(d_items_file)
conn.register('d_items', d_items)

In [None]:

# Accumulators filled by the medication search:
#   medication_itemids - medication name -> list of matching ITEMIDs
#   all_itemids        - every matching ITEMID across all medications
#   not_found          - medication names with no match
medication_itemids, all_itemids, not_found = {}, [], []

In [None]:


# Search the registered `d_items` table for every target medication.
#
# Results:
#   medication_itemids - medication name -> list of matching ITEMIDs
#   all_itemids        - every matching ITEMID (may contain repeats)
#   not_found          - medication names that matched nothing
#   unique_itemids     - de-duplicated ITEMIDs in first-seen order
#
# The accumulators are reset here so that re-running this cell does not
# append duplicate entries on top of a previous run.
medication_itemids = {}
all_itemids = []
not_found = []

# Case-insensitive substring search. The pattern is bound as a parameter
# instead of being formatted into the SQL text, so the statement is built
# once and medication names never need SQL escaping.
# `label` / `abbreviation` are SELECT aliases of `medication` that the WHERE
# clause refers to.
search_query = """
    SELECT distinct itemid, medication as label,medication as abbreviation
    FROM d_items
    WHERE LOWER(label) LIKE ?
       OR LOWER(abbreviation) LIKE ?
"""

for med_name in all_medications:
    like_pattern = f"%{med_name.lower()}%"
    matches = conn.execute(search_query, [like_pattern, like_pattern]).df()

    if len(matches) > 0:
        itemids = matches['itemid'].tolist()
        medication_itemids[med_name] = itemids
        all_itemids.extend(itemids)

        print(f"{med_name}: {len(matches)} match(es) - ITEMIDs: {itemids}")
        for _, row in matches.iterrows():
            print(f"  - {row['itemid']}: {row['label']}")
    else:
        print(f"{med_name}: NOT FOUND")
        not_found.append(med_name)

# Unique ITEMIDs for filtering; dict.fromkeys keeps first-seen order so the
# list is stable between runs (a plain set() has no guaranteed order).
unique_itemids = list(dict.fromkeys(all_itemids))

# The separator is a newline followed by ONE rule of 60 '=' characters
# (previously "\n=" was repeated 60 times, printing 60 one-character lines).
print("\n" + "=" * 60)
print("SEARCH SUMMARY:")
print(f"  Medications searched: {len(all_medications)}")
print(f"  Medications found: {len(medication_itemids)}")
print(f"  Medications not found: {len(not_found)}")
print(f"  Total unique ITEMIDs: {len(unique_itemids)}")

if not_found:
    print(f"\nNot found: {not_found}")

print(f"\nUnique ITEMIDs to filter: {unique_itemids}")

In [None]:
# Create enhanced cohort with medication features using DuckDB for ultra-fast processing
#
# For every medication in `medication_itemids` this cell adds one 0/1 column
# named `med_<medication>` to the cohort. The flag is 1 when the `d_items`
# table has at least one row for the same admission (hadm_id = admission_id),
# with one of the medication's ITEMIDs, charted between icu_intime and
# icu_outtime (both bounds inclusive). The result is `enhanced_cohort`.
print("Creating medication features using ultra-fast DuckDB processing...")
print("This may take a few minutes due to the large inputevents dataset (~11M records)")
print("=" * 80)

start_time = time.time()

# One SELECT-list expression per medication
sql_parts = []

for med_name, candidate_itemids in medication_itemids.items():
    # Render each kept ITEMID as a quoted SQL string literal; embedded single
    # quotes are doubled so an unusual ID cannot break the statement.
    # NOTE(review): the original comment said this drops "YDPMIIT99999", but
    # the test excludes EVERY ITEMID whose text contains an upper-case "Y".
    # Behaviour is kept as-is — confirm which of the two is intended.
    filtered_itemids = [
        "'" + str(item).replace("'", "''") + "'"
        for item in candidate_itemids
        if "Y" not in str(item)
    ]

    if not filtered_itemids:
        # Every ITEMID was filtered out. `IN ()` is not valid SQL, and no row
        # could match anyway, so the feature is a constant 0.
        sql_parts.append(f"\n    0 AS med_{med_name}")
        continue

    itemids_str = ','.join(filtered_itemids)

    sql_parts.append(f"""
    CASE WHEN EXISTS (
        SELECT 1 FROM d_items ie
        WHERE ie.hadm_id = c.admission_id
          AND cast(ie.itemid as varchar) IN ({itemids_str})
          AND CAST(ie.CHARTTIME AS TIMESTAMP) >= CAST(c.icu_intime AS TIMESTAMP)
          AND CAST(ie.CHARTTIME AS TIMESTAMP) <= CAST(c.icu_outtime AS TIMESTAMP)
    ) THEN 1 ELSE 0 END AS med_{med_name}""")

# Assemble the final statement. Joining "c.*" together with the feature
# expressions keeps the SELECT list valid even when no medication was found
# (no dangling comma).
select_list = ','.join(["c.*"] + sql_parts)
medication_features_query = f"""
SELECT 
    {select_list}
FROM cohort_df c
"""

print("Executing comprehensive medication feature query...")
print(f"Adding {len(medication_itemids)} medication features...")

# Execute the query
enhanced_cohort = conn.execute(medication_features_query).df()

# EXISTS sub-queries only add columns; they must never add or drop cohort rows.
assert len(enhanced_cohort) == len(cohort_df), "medication features changed the cohort row count"

processing_time = time.time() - start_time
print(f"\n✓ Processing completed in {processing_time:.2f} seconds!")
print(f"Enhanced cohort shape: {enhanced_cohort.shape}")
print(f"New columns added: {len(medication_itemids)}")
print(f"Total columns now: {len(enhanced_cohort.columns)}")

In [None]:
# Debug aid: show the raw ITEMID lists of the first three medications.
# These are the IDs as found by the search, i.e. before the feature cell
# applies its own filtering and quoting.
print("Sample medication queries:")
first_three = list(medication_itemids.items())[:3]
for med_name, itemids in first_three:
    itemids_str = ','.join(str(item) for item in itemids)
    print(f"{med_name}: {itemids_str}")

In [None]:
# Summarise how often each medication flag is set across the enhanced cohort.
print("MEDICATION FEATURE ANALYSIS")
print("=" * 50)

# Every column whose name starts with 'med_' is treated as a medication flag
med_columns = [name for name in enhanced_cohort.columns if name.startswith('med_')]
print(f"Medication features created: {len(med_columns)}")

# Per-medication count and share of ICU stays, listed alphabetically
print("\nMedication usage statistics:")
total_stays = len(enhanced_cohort)
usage_stats = []

for col in sorted(med_columns):
    med_name = col.replace('med_', '')
    usage_count = enhanced_cohort[col].sum()
    usage_pct = (usage_count / total_stays) * 100

    usage_stats.append(
        dict(medication=med_name, usage_count=usage_count, usage_percentage=usage_pct)
    )
    print(f"  {med_name}: {usage_count:,} stays ({usage_pct:.1f}%)")

# A stay counts as "any medication" when the sum of its flags is above zero
flags_per_stay = enhanced_cohort[med_columns].sum(axis=1)
any_med_usage = flags_per_stay.gt(0).sum()
any_med_pct = (any_med_usage / total_stays) * 100

print("\nOverall medication analysis:")
print(f"  Stays with ANY target medication: {any_med_usage:,} ({any_med_pct:.1f}%)")
print(f"  Stays with NO target medications: {total_stays - any_med_usage:,} ({100 - any_med_pct:.1f}%)")

# Rank medications by share of stays, highest first
usage_df = pd.DataFrame(usage_stats).sort_values('usage_percentage', ascending=False)

print("\nTop 10 most used medications:")
for rank, entry in enumerate(usage_df.head(10).itertuples(index=False), start=1):
    print(f"  {rank:2d}. {entry.medication}: {entry.usage_count:,} stays ({entry.usage_percentage:.1f}%)")

In [None]:
# Preview the enhanced cohort: identifying columns plus the first five
# medication flags.
print("Enhanced cohort sample:")
print("=" * 30)

sample_columns = [
    'patient_id',
    'admission_id',
    'icu_stay_id',
    'icu_duration_hours',
    'age_years',
    *med_columns[:5],
]
print(f"Sample columns: {sample_columns}")
print("\nFirst 10 rows:")
# Last expression of the cell -> rendered as a table
enhanced_cohort.loc[:, sample_columns].head(10)

In [None]:
# Sanity check: every medication flag column may only contain 0 and/or 1.
print("Verifying boolean nature of medication features:")
all_valid = True
for col in med_columns:
    unique_vals = set(enhanced_cohort[col].unique())
    if not unique_vals.issubset({0, 1}):
        # Anything outside {0, 1} marks the whole check as failed
        print(f"  ✗ {col}: Invalid values: {unique_vals}")
        all_valid = False
        continue
    print(f"  ✓ {col}: Valid boolean (0/1)")

print(
    "\n✓ All medication features are valid boolean columns!"
    if all_valid
    else "\n⚠️ Some medication features have invalid values!"
)

In [None]:
# Write the enhanced cohort to CSV and report what was written.
print("Exporting enhanced cohort dataset...")
print("=" * 40)

# 1. Main enhanced cohort file (row index is not written)
output_file = 'icu_cohort_with_medication_features.csv'
enhanced_cohort.to_csv(output_file, index=False)

row_count, column_count = enhanced_cohort.shape
size_mb = Path(output_file).stat().st_size / 1024 / 1024  # bytes -> MB
print(
    f"✓ Enhanced cohort saved to: {output_file}",
    f"  Rows: {row_count:,}",
    f"  Columns: {column_count}",
    f"  File size: {size_mb:.1f} MB",
    sep="\n",
)
