In [1]:
"""
Feature: catheter_care
Extract from discharge notes using regex patterns and strengthen with procedure data
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [2]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# ============================================
# 1. NLP features from discharge notes
# ============================================
# Load discharge notes
discharge = pd.read_csv(
    os.path.join(note_path, "discharge.csv"),
    usecols=["subject_id", "hadm_id", "text"]
)

# Catheter care pattern (match notebook: no na=False)
discharge["catheter_care"] = discharge["text"].str.contains(
    r"(?:catheter care|foley care|perineal care|meatal care|pericare|hygiene).*?"
    r"(?:done|performed|completed|given)?",
    flags=re.IGNORECASE,
    regex=True
)

# Aggregate per admission
if not discharge.empty:
    catheter_care_nlp = (
        discharge.groupby(["subject_id", "hadm_id"])["catheter_care"]
        .any()
        .reset_index()
    )
else:
    # Create empty DataFrame with correct structure
    catheter_care_nlp = pd.DataFrame(columns=["subject_id", "hadm_id", "catheter_care"])

# ============================================
# 2. Procedure-based features from procedures_icd
# ============================================
# ICD-9 codes for catheter procedures (matches notebook)
icd9_catheter_codes = [
    "598",    # Ureteral catheterization
    "5994",   # Replacement of cystostomy tube
    "9646",   # Irrigation of ureterostomy and ureteral catheter
    "9647",   # Irrigation of cystostomy
    "9648",   # Irrigation of urinary catheter (Foley/urethral)
    "9762",   # Removal of ureterostomy tube/catheter
    "9763",   # Removal of cystostomy tube
    "5794"
]

# ICD-10-PCS prefixes for catheter procedures (matches notebook)
icd10_prefixes = [
    "0T9B",   # Bladder catheterization (Foley/urethral)
    "0T9C",   # Upper urinary tract catheter (ureter/nephrostomy)
    "0T2B",   # Suprapubic catheter (cystostomy)
    "0TPB",   # Removal of bladder catheter
    "0TRB",   # Replacement of bladder catheter
    "0TWB"    # Revision of bladder catheter
]

# Load procedures
procedures = pd.read_csv(
    os.path.join(hosp_path, "procedures_icd.csv"),
    usecols=["subject_id", "hadm_id", "icd_code", "icd_version"]
)

# Normalize code + version
procedures["icd_code"] = procedures["icd_code"].astype(str)
procedures["icd_version"] = procedures["icd_version"].astype(int)

# Helper: is this a catheter-related procedure? (matches notebook)
def is_catheter_procedure(code: str, version: int) -> bool:
    code = str(code)
    if version == 9:
        return code in icd9_catheter_codes
    if version == 10:
        return any(code.startswith(pref) for pref in icd10_prefixes)
    return False

procedures["is_catheter_proc"] = procedures.apply(
    lambda r: is_catheter_procedure(r["icd_code"], r["icd_version"]),
    axis=1
)

# Filter to only catheter procedures (matches notebook)
catheter_procs = procedures[procedures["is_catheter_proc"]].copy()

# Helper: classify catheter procedure (matches notebook logic)
def classify_catheter_proc_row(row):
    code = str(row["icd_code"])
    version = int(row["icd_version"])
    
    care = False
    
    if version == 9:
        # ICD-9 codes: 9647, 9648 → irrigation → treat as catheter care / manipulation
        if code in ["9647", "9648"]:
            care = True
    
    return pd.Series({
        "catheter_care_proc": care
    })

# Apply classification to filtered catheter procedures (matches notebook)
proc_flags = catheter_procs.apply(classify_catheter_proc_row, axis=1)
catheter_procs = pd.concat([catheter_procs, proc_flags], axis=1)

# Aggregate procedure-based flags at admission level (matches notebook)
catheter_care_proc = (
    catheter_procs
    .groupby(["subject_id", "hadm_id"], as_index=False)["catheter_care_proc"]
    .any()
)

# ============================================
# 3. Merge NLP and procedure evidence
# ============================================
# Merge NLP features
df = df.merge(catheter_care_nlp, on=["subject_id", "hadm_id"], how="left")

# Merge procedure features
df = df.merge(catheter_care_proc, on=["subject_id", "hadm_id"], how="left")

# Ensure columns exist (matches notebook)
if "catheter_care" not in df.columns:
    df["catheter_care"] = False
if "catheter_care_proc" not in df.columns:
    df["catheter_care_proc"] = False

# ============================================
# 4. Combine NLP + procedure evidence (OR logic) BEFORE filling NaN
#    (matches notebook: combine first, then fill NaN)
# ============================================
df["catheter_care"] = (
    df["catheter_care"] | df["catheter_care_proc"]
)

# Drop intermediate column
df = df.drop(columns=["catheter_care_proc"])

# ============================================
# 5. Fill missing values with False (matches notebook: fill after combining)
# ============================================
# Convert to nullable boolean, fill NaN, then convert to bool
df["catheter_care"] = (
    df["catheter_care"]
    .astype("boolean")
    .fillna(False)
    .astype(bool)
)

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'catheter_care' added.")
print(f"True count: {df['catheter_care'].sum()}")
print(f"False count: {(~df['catheter_care']).sum()}")
print(f"Dataset shape: {df.shape}")


Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 69)
Feature 'catheter_care' added.
True count: 2582
False count: 155438
Dataset shape: (158020, 69)


In [3]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
# cols_to_drop = [
#  "catheter_care"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)