In [8]:
"""
Feature: has_cauti_history
Check if patient had CAUTI in previous admissions
"""
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)
df["admittime"] = pd.to_datetime(df["admittime"], errors="coerce")

# Sort by patient and time
df = df.sort_values(["subject_id", "admittime", "hadm_id"]).reset_index(drop=True)

# Boolean mask: which admissions count as "definite CAUTI" for history
# Check if patient has catheter present AND other UTI present (from feature 23)
# Or if they have strict CAUTI diagnosis codes
definite_mask = (
    (
        df["catheter_present"].fillna(False) &
        df["other_uti_present"].fillna(False)
    ) |
    df["other_uti_present"].fillna(False)
)

# For each subject: shift to exclude current admission, then cummax to track history
df["has_cauti_history"] = (
    definite_mask
    .groupby(df["subject_id"])
    .transform(lambda s: s.shift(fill_value=False).cummax())
    .astype(bool)
)

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'has_cauti_history' added. True count: {df['has_cauti_history'].sum()}")
print(f"Dataset shape: {df.shape}")




Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 101)
Feature 'has_cauti_history' added. True count: 26738
Dataset shape: (158020, 101)


In [9]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime',
       ...
       'blood_culture_performed', 'temperature', 'heart_rate', 'resp_rate',
       'o2sat', 'BP_systolic', 'BP_diastolic', 'other_uti',
       'other_uti_present', 'has_cauti_history'],
      dtype='object', length=101)

In [10]:
# cols_to_drop = [
#  "has_cauti_history"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)