In [15]:
"""
Feature : ventilator_used
Flag if ventilator was used from chartevents and discharge notes
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [17]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# Load ICU chartevents in chunks
vent_item_ids = {223848, 223849, 227565, 227566, 229314}
vent_icu_pairs = set()

try:
    chunk_iter = pd.read_csv(
        os.path.join(icu_path, "chartevents.csv"),
        usecols=["subject_id", "hadm_id", "itemid"],
        dtype={"subject_id": "int32", "hadm_id": "int32", "itemid": "int32"},
        chunksize=1000000,
        low_memory=False
    )
    
    for chunk in chunk_iter:
        vent_chunk = chunk[chunk["itemid"].isin(vent_item_ids)]
        if not vent_chunk.empty:
            pairs = set(zip(vent_chunk["subject_id"], vent_chunk["hadm_id"]))
            vent_icu_pairs.update(pairs)
    
    vent_icu_df = pd.DataFrame(
        list(vent_icu_pairs),
        columns=["subject_id", "hadm_id"]
    )
    vent_icu_df["ventilator_icu"] = True
except:
    vent_icu_df = pd.DataFrame(columns=["subject_id", "hadm_id", "ventilator_icu"])

# Load discharge notes in chunks (matching notebook approach)
vent_terms = [
    "mechanical ventilation", "mechanically ventilated", "on ventilator",
    "on vent", "ventilated", "intubated", "intubation", "et tube", "ett",
    "endotracheal tube"
]

vent_pattern = "|".join(re.escape(t) for t in vent_terms)

vent_note_pairs = set()

try:
    chunk_iter = pd.read_csv(
        os.path.join(note_path, "discharge.csv"),
        usecols=["subject_id", "hadm_id", "text"],
        chunksize=100000
    )
    
    for chunk in chunk_iter:
        hits = chunk[
            chunk["text"].str.contains(vent_pattern, case=False, na=False, regex=True)
        ]
        if not hits.empty:
            pairs = set(zip(hits["subject_id"], hits["hadm_id"]))
            vent_note_pairs.update(pairs)
    
    vent_notes_df = pd.DataFrame(
        list(vent_note_pairs),
        columns=["subject_id", "hadm_id"]
    )
    vent_notes_df["ventilator_notes"] = True
except:
    vent_notes_df = pd.DataFrame(columns=["subject_id", "hadm_id", "ventilator_notes"])

# Merge
df = df.merge(vent_icu_df, on=["subject_id", "hadm_id"], how="left")
df = df.merge(vent_notes_df, on=["subject_id", "hadm_id"], how="left")

# Clean up booleans (matching notebook approach)
for col in ["ventilator_icu", "ventilator_notes"]:
    df[col] = (
        df[col]
        .astype("boolean")
        .fillna(False)
        .astype(bool)
    )

# Final combined feature
df["ventilator_used"] = df["ventilator_icu"] | df["ventilator_notes"]

# Drop intermediate columns
df.drop(columns=["ventilator_icu", "ventilator_notes"], errors="ignore", inplace=True)

# Save
df.to_csv(dataset_path, index=False)
print(f"Feature 'ventilator_used' added.")
print(f"Value counts:\n{df['ventilator_used'].value_counts()}")
print(f"Total rows: {len(df)}")
print(f"True count: {df['ventilator_used'].sum()}")
print(f"False count: {(~df['ventilator_used']).sum()}")
print(f"Sum check (True + False): {df['ventilator_used'].sum() + (~df['ventilator_used']).sum()}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")



Feature 'ventilator_used' added.
Value counts:
ventilator_used
True     103288
False     54732
Name: count, dtype: int64
Total rows: 158020
True count: 103288
False count: 54732
Sum check (True + False): 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 103)


In [18]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime',
       ...
       'heart_rate', 'resp_rate', 'o2sat', 'BP_systolic', 'BP_diastolic',
       'other_uti', 'other_uti_present', 'has_cauti_history',
       'pain_documented', 'ventilator_used'],
      dtype='object', length=103)

In [19]:
# cols_to_drop = [
#  "ventilator_used"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)