In [1]:
"""
Feature: n_catheter_manip_unique_types
Count unique catheter manipulation types (insertion, removal, change, irrigation)
Extracts from THREE sources: procedures, output events, and discharge notes
Takes union of all sources (matching notebook logic)
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [2]:
# =====================================================
# Read dataset
# =====================================================
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# =====================================================
# 1. PROCEDURES → manipulation types
# =====================================================
procedures = pd.read_csv(
    os.path.join(hosp_path, "procedures_icd.csv"),
    usecols=["subject_id", "hadm_id", "icd_code", "icd_version"]
)

procedures["icd_code"] = procedures["icd_code"].astype(str)
procedures["icd_version"] = procedures["icd_version"].astype(int)

icd9_catheter_codes = ["598", "5994", "9646", "9647", "9648", "9762", "9763", "5794"]
icd10_catheter_prefixes = ["0T9B", "0T9C", "0T2B", "0T2C", "0TPB", "0TRB", "0TWB"]

def is_catheter_procedure(code, version):
    if version == 9:
        return code in icd9_catheter_codes
    if version == 10:
        return any(code.startswith(p) for p in icd10_catheter_prefixes)
    return False

def map_proc_to_manip_type(code, version):
    if version == 9:
        if code in ["598", "5794"]:
            return "insertion"
        if code == "5994":
            return "change_replace"
        if code in ["9646", "9647", "9648"]:
            return "irrigation"
        if code in ["9762", "9763"]:
            return "removal"
    if version == 10:
        if code.startswith(("0T9B", "0T9C")):
            return "insertion"
        if code.startswith(("0T2B", "0T2C", "0TRB", "0TWB")):
            return "change_replace"
        if code.startswith("0TPB"):
            return "removal"
    return "other"

procedures["is_catheter_proc"] = procedures.apply(
    lambda r: is_catheter_procedure(r["icd_code"], r["icd_version"]), axis=1
)

catheter_procs = procedures[procedures["is_catheter_proc"]].copy()

catheter_procs["manip_type_proc"] = catheter_procs.apply(
    lambda r: map_proc_to_manip_type(r["icd_code"], r["icd_version"]), axis=1
)

proc_types_per_adm = (
    catheter_procs
    .groupby(["subject_id", "hadm_id"])["manip_type_proc"]
    .agg(lambda x: sorted(set(x) - {"other"}))
    .reset_index(name="manip_types_proc")
)

# =====================================================
# 2. OUTPUTEVENTS → irrigation
# =====================================================
try:
    output = pd.read_csv(
        os.path.join(icu_path, "outputevents.csv"),
        usecols=["subject_id", "hadm_id", "itemid"]
    )

    IRRIGATION_ITEMIDS = [226566, 227487, 227488, 227489]

    output_types_per_adm = (
        output[output["itemid"].isin(IRRIGATION_ITEMIDS)]
        .drop_duplicates(subset=["subject_id", "hadm_id"])
        .assign(manip_types_output=[["irrigation"]])
        [["subject_id", "hadm_id", "manip_types_output"]]
    )
except:
    output_types_per_adm = pd.DataFrame(
        columns=["subject_id", "hadm_id", "manip_types_output"]
    )

# =====================================================
# 3. DISCHARGE NOTES → manipulation types
# =====================================================
try:
    discharge = pd.read_csv(
        os.path.join(note_path, "discharge.csv"),
        usecols=["subject_id", "hadm_id", "text"]
    )

    NOTE_MANIP_PATTERNS = [

        # INSERTION
        (
            r"(?:catheter|foley|urinary catheter|straight cath|suprapubic).*?"
            r"(?:inserted|placed|started|established|performed)",
            "insertion"
        ),
        (r"\burethral catheterization\b", "insertion"),

        # REMOVAL
        (
            r"(?:catheter|foley|urinary catheter).*?"
            r"(?:removed|discontinued|dc|withdrawn|taken out)",
            "removal"
        ),

        # REMOVAL / REPLACEMENT (NOTEBOOK MATCH)
        (
            r"(?:foley|catheter)\s*"
            r"(?:removed|discontinued|changed|replaced|taken out|withdrawn|reinserted|exchanged)",
            "change_replace"
        ),

        # IRRIGATION
        (
            r"(?:catheter|bladder).*?"
            r"(?:irrigation|irrigated)",
            "irrigation"
        ),
    ]

    NOTE_PATTERNS = [(re.compile(p, re.I), t) for p, t in NOTE_MANIP_PATTERNS]

    def extract_note_types(text):
        types = set()
        if not isinstance(text, str):
            return types
        for rx, t in NOTE_PATTERNS:
            if rx.search(text):
                types.add(t)
        return types

    note_types_per_adm = (
        discharge
        .groupby(["subject_id", "hadm_id"])["text"]
        .agg(lambda x: sorted(set().union(*(extract_note_types(t) for t in x))))
        .reset_index(name="manip_types_notes")
    )
except:
    note_types_per_adm = pd.DataFrame(
        columns=["subject_id", "hadm_id", "manip_types_notes"]
    )

# =====================================================
# 3b. DATETIMEEVENTS (already computed columns)
# =====================================================
dt_manip = df[
    [
        "subject_id",
        "hadm_id",
        "insertion_date_from_datetimeevents",
        "removal_date_from_datetimeevents",
    ]
].copy()

def derive_datetime_types(row):
    types = set()
    if pd.notna(row["insertion_date_from_datetimeevents"]):
        types.add("insertion")
    if pd.notna(row["removal_date_from_datetimeevents"]):
        types.add("removal")
    return list(types)

dt_manip["manip_types_datetime"] = dt_manip.apply(
    derive_datetime_types, axis=1
)

dt_manip = dt_manip[["subject_id", "hadm_id", "manip_types_datetime"]]

# =====================================================
# 4. MERGE ALL SOURCES
# =====================================================
manip = df[["subject_id", "hadm_id"]].copy()

manip = manip.merge(proc_types_per_adm, on=["subject_id", "hadm_id"], how="left")
manip = manip.merge(output_types_per_adm, on=["subject_id", "hadm_id"], how="left")
manip = manip.merge(note_types_per_adm, on=["subject_id", "hadm_id"], how="left")
manip = manip.merge(dt_manip, on=["subject_id", "hadm_id"], how="left")

# Normalize lists
for col in [
    "manip_types_proc",
    "manip_types_output",
    "manip_types_notes",
    "manip_types_datetime",
]:
    if col not in manip.columns:
        manip[col] = []
    manip[col] = manip[col].apply(lambda x: x if isinstance(x, list) else [])

# =====================================================
# 5. UNION + COUNT + BOOLEAN FLAGS
# =====================================================
def union_all(row):
    return set(
        row["manip_types_proc"]
        + row["manip_types_output"]
        + row["manip_types_notes"]
        + row["manip_types_datetime"]
    )

manip["_manip_union"] = manip.apply(union_all, axis=1)

manip["n_catheter_manip_unique_types"] = manip["_manip_union"].apply(len)

manip["catheter_insertion"] = manip["_manip_union"].apply(
    lambda s: "insertion" in s
)

manip["catheter_removal"] = manip["_manip_union"].apply(
    lambda s: "removal" in s
)

manip["catheter_removal_replacement"] = manip["_manip_union"].apply(
    lambda s: bool({"removal", "change_replace"} & s)
)

manip.drop(columns=["_manip_union"], inplace=True)

# =====================================================
# 6. MERGE BACK + SAVE
# =====================================================
df = df.merge(
    manip[
        [
            "subject_id",
            "hadm_id",
            "n_catheter_manip_unique_types",
            "catheter_insertion",
            "catheter_removal",
            "catheter_removal_replacement",
        ]
    ],
    on=["subject_id", "hadm_id"],
    how="left"
)

df["n_catheter_manip_unique_types"] = (
    df["n_catheter_manip_unique_types"].fillna(0).astype(int)
)

for col in [
    "catheter_insertion",
    "catheter_removal",
    "catheter_removal_replacement",
]:
    df[col] = df[col].fillna(False).astype(bool)

df.to_csv(dataset_path, index=False)

# =====================================================
# Diagnostics
# =====================================================
print("\nFeature engineering completed successfully.")
print(df["n_catheter_manip_unique_types"].value_counts().sort_index())
print(
    f"Rows: {len(df)} | Expected: {initial_row_count} | "
    f"{'✓ MATCH' if len(df) == initial_row_count else '⚠ MISMATCH'}"
)
print(f"Dataset shape: {df.shape}")



Feature engineering completed successfully.
n_catheter_manip_unique_types
0    99888
1    48133
2     8677
3     1258
4       64
Name: count, dtype: int64
Rows: 158020 | Expected: 158020 | ✓ MATCH
Dataset shape: (158020, 65)


In [3]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
# cols_to_drop = [
#  "n_catheter_manip_unique_types",
#  "catheter_insertion",
#  "catheter_removal",
#  "catheter_removal_replacement"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)