In [1]:
"""
Feature: catheter_type
Merges the explicit catheter_used_in_* columns (IR procedures, output events,
datetime events, procedure events, and clinical notes) into one unified,
de-duplicated catheter type list per row.
"""
import pandas as pd
import os
import re
import ast
from config_paths import *
from utils import drop_columns

In [2]:
# Load the working dataset. `dataset_path` is not defined in this notebook;
# presumably it is provided by the star import of config_paths (TODO confirm).
df = pd.read_csv(dataset_path)
# Remember the starting row count so the diagnostics at the end of this cell
# can confirm that adding the feature neither added nor dropped rows.
initial_row_count = len(df)
# -----------------------------------------
# Normalize a cell value into list[str]
# -----------------------------------------
def to_list(x):
    """Normalize a cell value into a list of tokens.

    Parameters
    ----------
    x : Any
        A cell value. Handled shapes:
        * ``list`` - returned unchanged (same object).
        * ``str`` that is a valid Python list literal (``"['a', 'b']"``) -
          parsed with ``ast.literal_eval`` and returned.
        * ``str`` wrapped in brackets that is NOT a valid literal
          (``"[a, b]"``) - split on commas, with brackets and quotes peeled
          off each token so no ``"[a"`` / ``"b]"`` residue is produced.
        * any other ``str`` - split on commas, tokens whitespace-stripped,
          empty tokens discarded.
        * anything else (``None``, ``NaN``, numbers, ...) - empty list.

    Returns
    -------
    list
        The normalized list; empty when nothing usable was found.
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        # NaN / None / numeric cells carry no catheter information.
        return []

    s = x.strip()
    if s.startswith("[") and s.endswith("]"):
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # literal_eval is documented to raise any of these on malformed
            # input; treat them all as "not a literal" and fall back below.
            parsed = None
        if isinstance(parsed, list):
            return parsed

        # Bracketed but unparseable (e.g. unquoted items): split on commas
        # and peel the leftover brackets/quotes from every token.
        tokens = (part.strip().strip("[]'\"").strip() for part in s.split(","))
        return [tok for tok in tokens if tok]

    return [i.strip() for i in s.split(",") if i.strip()]


# -----------------------------------------
# Explicit catheter_used_* columns
# -----------------------------------------
# Source columns whose values are merged into `catheter_type`.
# NOTE(review): merge_catheter_types reads these with row.get(col, []), so a
# name that is missing from the dataframe (or misspelled here) is silently
# treated as empty rather than raising.
CATHETER_USED_COLUMNS = [
    "catheter_used_in_procedures_ir",
    "catheter_used_in_output_events",
    "catheter_used_in_datetime_events",
    "catheter_used_in_procedure_events",
    "catheter_used_in_clinical_notes",
]


# -----------------------------------------
# Merge catheter types from explicit sources
# -----------------------------------------
# Rank of each known catheter type in the preferred output order.
_CANONICAL_RANK = {
    name: rank
    for rank, name in enumerate(
        ["foley", "straight", "condom", "suprapubic", "ureteral"]
    )
}


def merge_catheter_types(row):
    """Merge the catheter types found in every explicit source column of a row.

    Parameters
    ----------
    row : Mapping-like
        One dataframe row (a ``pandas.Series`` when used with
        ``df.apply(..., axis=1)``); only ``row.get(col, default)`` is used.
        Columns listed in ``CATHETER_USED_COLUMNS`` that are absent are
        treated as empty.

    Returns
    -------
    list[str]
        Unique, lower-cased, whitespace-stripped catheter types. Known types
        come first in canonical order (foley, straight, condom, suprapubic,
        ureteral); any other value follows, sorted alphabetically.
    """
    merged = set()

    for col in CATHETER_USED_COLUMNS:
        for v in to_list(row.get(col, [])):
            # Skip non-string items and blank strings.
            if isinstance(v, str) and v.strip():
                merged.add(v.strip().lower())

    # Non-canonical values all share the same rank, so the name itself is the
    # tie-breaker. Without it their order would depend on set iteration
    # order, which for strings changes between interpreter runs (hash
    # randomization) and would make the saved column non-reproducible.
    unranked = len(_CANONICAL_RANK)
    return sorted(
        merged,
        key=lambda name: (_CANONICAL_RANK.get(name, unranked), name),
    )


# -----------------------------------------
# Apply feature
# -----------------------------------------
# Build the unified per-row catheter list from the explicit source columns.
df["catheter_type"] = df.apply(merge_catheter_types, axis=1)

# Persist the enriched dataset back to the CSV it was loaded from.
df.to_csv(dataset_path, index=False)

print("✓ Feature 'catheter_type' created using explicit catheter_used_* columns")


# ---- Diagnostics ----
print("\nFeature 'catheter_type' added.")

# Turn each row's list into an alphabetically sorted tuple so it can be
# counted; any value that is not a list counts as the empty combination.
catheter_combos = df["catheter_type"].apply(
    lambda types: tuple(sorted(types)) if isinstance(types, list) else tuple()
)
combination_counts = catheter_combos.value_counts()

# Leave the empty combination (no catheter type found) out of the ranking.
combination_counts = combination_counts.drop((), errors="ignore")

print("\nTop combinations (matching notebook format):")
for combo, count in combination_counts.head(20).items():
    print(f"{combo} = {count}")

# A row "has" a catheter type only when its value is a non-empty list; every
# other row falls in the "without" bucket, so the two counts are complements.
total_rows = len(df)
has_catheter_type = df["catheter_type"].apply(
    lambda types: isinstance(types, list) and len(types) > 0
)
rows_with_type = has_catheter_type.sum()
rows_without_type = total_rows - rows_with_type

print(f"\nTotal rows: {total_rows}")
print(f"Rows with catheter type: {rows_with_type}")
print(f"Rows without catheter type: {rows_without_type}")
print(f"Expected total: {initial_row_count}")

# Sanity check: adding a column must not change the number of rows.
row_delta = total_rows - initial_row_count
if row_delta == 0:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {total_rows} (difference: {row_delta})")
print(f"Dataset shape: {df.shape}")


✓ Feature 'catheter_type' created using explicit catheter_used_* columns

Feature 'catheter_type' added.

Top combinations (matching notebook format):
('foley',) = 104270
('suprapubic',) = 8813
('foley', 'suprapubic') = 4449
('foley', 'straight') = 4431
('condom', 'foley') = 2307
('straight',) = 1905
('ureteral',) = 1700
('foley', 'ureteral') = 1623
('condom',) = 1508
('condom', 'foley', 'straight') = 1269
('foley', 'unknown') = 976
('condom', 'straight') = 298
('foley', 'straight', 'suprapubic') = 258
('foley', 'nephrostomy') = 217
('suprapubic', 'ureteral') = 166
('foley', 'suprapubic', 'ureteral') = 165
('straight', 'suprapubic') = 127
('foley', 'nephrostomy', 'ureteral') = 61
('condom', 'foley', 'suprapubic') = 46
('nephrostomy',) = 44

Total rows: 158020
Rows with catheter type: 134985
Rows without catheter type: 23035
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 57)


In [4]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [5]:
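# Optional rollback (intentionally left disabled): uncommenting the lines below
# passes "catheter_type" to utils.drop_columns (presumably removing that column)
# and re-saves the dataset.
# cols_to_drop = [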
#  "catheter_type"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)