In [1]:
"""
Feature: multiple_invasive_devices & no_of_invasive_devices
Count invasive devices (central line, mechanical ventilation, tracheostomy, dialysis line)
"""
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [2]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# Load procedures
procedures = pd.read_csv(
    os.path.join(hosp_path, "procedures_icd.csv"),
    usecols=["subject_id", "hadm_id", "icd_code", "icd_version"]
)
procedures["icd_code"] = procedures["icd_code"].astype(str)
procedures["icd_version"] = procedures["icd_version"].astype(int)
procedures["icd_code_clean"] = procedures["icd_code"].str.replace(".", "", regex=False)

# Device patterns
secondary_cauti_risk_devices = {
    "central_line": ["3893", "05H0", "05H1", "05H2", "05H3"],
    "mechanical_ventilation": ["967", "5A19"],
    "tracheostomy": ["311", "312", "0B11"],
    "dialysis_line": ["3995", "5A1D"]
}

device_cols = list(secondary_cauti_risk_devices.keys())

# Create device flags
for device_name, prefixes in secondary_cauti_risk_devices.items():
    procedures[device_name] = procedures["icd_code_clean"].str.startswith(tuple(prefixes))

# Aggregate to admission level
device_admission_level = (
    procedures.groupby(["subject_id", "hadm_id"])[device_cols]
    .max()
    .reset_index()
)

# Fill NaN with False
device_admission_level[device_cols] = device_admission_level[device_cols].fillna(False)

# Count devices
device_admission_level["no_of_invasive_devices"] = (
    device_admission_level[device_cols].astype(int).sum(axis=1)
)

device_admission_level["multiple_invasive_devices"] = (
    device_admission_level["no_of_invasive_devices"] >= 1
)

# Merge
df = df.merge(
    device_admission_level[["subject_id", "hadm_id", "no_of_invasive_devices", "multiple_invasive_devices"]],
    on=["subject_id", "hadm_id"],
    how="left"
)

# Fill missing
df["no_of_invasive_devices"] = df["no_of_invasive_devices"].fillna(0).astype(int)
df["multiple_invasive_devices"] = (
    df["multiple_invasive_devices"].astype("boolean").fillna(False).astype(bool)
)

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'no_of_invasive_devices' added. Range: {df['no_of_invasive_devices'].min()} - {df['no_of_invasive_devices'].max()}")
print(f"Feature 'multiple_invasive_devices' added. True count: {df['multiple_invasive_devices'].sum()}")
print(f"Dataset shape: {df.shape}")



Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 67)
Feature 'no_of_invasive_devices' added. Range: 0 - 4
Feature 'multiple_invasive_devices' added. True count: 29549
Dataset shape: (158020, 67)


In [3]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
# cols_to_drop = [
#  "multiple_invasive_devices",
#  "no_of_invasive_devices"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)