In [1]:
"""
Feature: o2sat
Extract min oxygen saturation from chartevents (ICU) and vitalsign (ED)
"""
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [2]:
# Load the working dataset and record how many rows it has, so the row count
# can be compared again after the feature has been merged in.
df = pd.read_csv(dataset_path)
initial_row_count = df.shape[0]

# Minimum ICU O2 saturation per admission (hadm_id), read from chartevents in
# 1,000,000-row chunks. Result: `o2_min_icu` with columns
# ["hadm_id", "o2sat_icu"], one row per admission that has a valid reading.
o2_itemids = [220277, 228232]  # chartevents itemids treated as O2 saturation
o2_min_list = []

# Typed empty fallback: used when no valid reading is found or loading fails.
# The float dtype keeps the merged column numeric instead of object.
o2_min_icu = pd.DataFrame(
    {
        "hadm_id": pd.Series(dtype="int64"),
        "o2sat_icu": pd.Series(dtype="float64"),
    }
)

try:
    chunk_iter = pd.read_csv(
        os.path.join(icu_path, "chartevents.csv"),
        usecols=["subject_id", "hadm_id", "itemid", "valuenum"],
        chunksize=1000000,
        low_memory=False,
    )

    for chunk in chunk_iter:
        # between() is inclusive on both ends and False for NaN, so this keeps
        # only O2 rows with a non-missing value in the range [50, 100].
        is_valid_o2 = (
            chunk["itemid"].isin(o2_itemids)
            & chunk["valuenum"].between(50, 100)
        )
        o2 = chunk.loc[is_valid_o2]

        if not o2.empty:
            # Per-chunk minimum; an admission may span several chunks, so
            # these partial minima are reduced again after the loop.
            o2_min_chunk = o2.groupby("hadm_id")["valuenum"].min()
            o2_min_list.append(o2_min_chunk)

    if o2_min_list:
        o2_min_icu = (
            pd.concat(o2_min_list)
            .groupby("hadm_id")
            .min()
            .reset_index()
            .rename(columns={"valuenum": "o2sat_icu"})
        )
except Exception as exc:
    # Best-effort: the feature can still be built from ED data alone, but the
    # failure is reported instead of being swallowed silently. Catching
    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    print(
        f"⚠ Could not load ICU O2 saturation from chartevents "
        f"({type(exc).__name__}: {exc}); continuing without ICU values."
    )

# Minimum ED O2 saturation per (subject_id, hadm_id), built from the ED
# vitalsign table joined to edstays (which supplies hadm_id for each stay_id).
# Result: `o2_min_ed` with columns ["subject_id", "hadm_id", "o2sat_ed"].

# Typed empty fallback, used when loading fails.
o2_min_ed = pd.DataFrame(
    {
        "subject_id": pd.Series(dtype="int64"),
        "hadm_id": pd.Series(dtype="float64"),
        "o2sat_ed": pd.Series(dtype="float64"),
    }
)

try:
    vitalsign = pd.read_csv(os.path.join(ed_path, "vitalsign.csv"))
    ed_stays = pd.read_csv(os.path.join(ed_path, "edstays.csv"))

    ed_vitals_o2 = ed_stays.merge(
        vitalsign[["subject_id", "stay_id", "o2sat"]],
        on=["subject_id", "stay_id"],
        how="left",
    )

    # Filter valid SpO2 values (matching notebook). between() is inclusive on
    # both ends and False for NaN, so missing readings are dropped here too.
    ed_vitals_o2 = ed_vitals_o2[ed_vitals_o2["o2sat"].between(50, 100)]

    # Worst (minimum) ED SpO2 per admission (matching notebook). groupby drops
    # rows whose hadm_id is missing, i.e. ED stays not linked to an admission.
    o2_min_ed = (
        ed_vitals_o2
        .groupby(["subject_id", "hadm_id"], as_index=False)["o2sat"]
        .min()
        .rename(columns={"o2sat": "o2sat_ed"})
    )
except Exception as exc:
    # Best-effort: continue without ED values, but report the failure instead
    # of swallowing it. Catching Exception (not a bare except) lets
    # KeyboardInterrupt/SystemExit propagate.
    print(
        f"⚠ Could not load ED O2 saturation from vitalsign/edstays "
        f"({type(exc).__name__}: {exc}); continuing without ED values."
    )

# Merge the ICU and ED minima onto the dataset and combine them into the final
# `o2sat` feature (matching notebook approach).

# Left merge keeps every dataset row; admissions without ICU readings get NaN.
df = df.merge(o2_min_icu, on="hadm_id", how="left")

if o2_min_ed.empty:
    # No ED data available: still create the column so the row-wise minimum
    # below does not fail with a KeyError on a missing "o2sat_ed".
    df["o2sat_ed"] = float("nan")
else:
    # o2_min_ed is keyed on (subject_id, hadm_id) but merged on hadm_id only
    # (matching notebook). Collapse to one row per hadm_id first so the merge
    # can never duplicate dataset rows.
    ed_by_admission = (
        o2_min_ed
        .groupby("hadm_id", as_index=False)["o2sat_ed"]
        .min()
    )
    df = df.merge(ed_by_admission, on="hadm_id", how="left")

# Combine: for oxygen, lower is worse -> take the MIN across ICU + ED.
# to_numeric guards against object-dtype columns left behind by an empty
# fallback frame; min(axis=1) skips NaN, so a row is NaN only if both are.
df["o2sat"] = (
    df[["o2sat_icu", "o2sat_ed"]]
    .apply(pd.to_numeric, errors="coerce")
    .min(axis=1)
)

# Drop intermediate columns
df = df.drop(columns=["o2sat_icu", "o2sat_ed"])

# Validate, then save. The row count is checked BEFORE writing so that a merge
# which duplicated or lost rows can never overwrite the dataset file.
final_row_count = len(df)
if final_row_count != initial_row_count:
    raise ValueError(
        f"⚠ Row count mismatch! Expected {initial_row_count}, got {final_row_count} "
        f"(difference: {final_row_count - initial_row_count}). Dataset was NOT saved."
    )

# Save (overwrites the dataset file in place)
df.to_csv(dataset_path, index=False)

# Summary of the new feature
print("Feature 'o2sat' added.")
print(f"Non-null count: {df['o2sat'].notna().sum()}")
print(f"Null count: {df['o2sat'].isna().sum()}")
print(f"Total rows: {final_row_count}")
print(f"Expected total: {initial_row_count}")
print(f"✓ Row count matches expected total ({initial_row_count})")
print(f"Dataset shape: {df.shape}")

Feature 'o2sat' added.
Non-null count: 110361
Null count: 47659
Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 96)


In [3]:
# Display the column names of the updated DataFrame.
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
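# Optional cleanup (disabled): uncomment to pass "o2sat" to drop_columns and
# re-save the dataset.
# cols_to_drop = [
#  "o2sat"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)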