In [6]:
"""
Feature : urinary_obstruction_present
Extract from discharge notes using regex patterns
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [8]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# Load discharge notes
discharge = pd.read_csv(
    os.path.join(note_path, "discharge.csv"),
    usecols=["subject_id", "hadm_id", "text"]
)

# Phrases suggesting obstruction/blockage of urinary catheter / flow (complete from notebook)
OBSTRUCTION_PATTERNS = [
    # Explicit obstruction / blockage
    r"urinary obstruction",
    r"obstruction of urine",
    r"obstructed urine flow",
    r"obstructed foley",
    r"obstructed catheter",
    r"catheter obstruction",
    r"urethral obstruction",
    r"bladder outlet obstruction",
    # Blocked / blockage
    r"urinary blockage",
    r"catheter blockage",
    r"foley blockage",
    r"blocked catheter",
    r"blocked foley",
    r"tube blocked",
    r"line blocked",
    # Not draining / poor drainage
    r"foley not draining",
    r"catheter not draining",
    r"no urine draining",
    r"urine not draining",
    r"poor urinary drainage",
    r"poor drainage from foley",
    r"inadequate urine drainage",
    # Kinks / dependent loops (common documentation)
    r"kinked foley",
    r"kinked catheter",
    r"kinked tubing",
    r"dependent loop in tubing",
    r"dependent loop in foley",
    r"dependent loops in tubing",
    # Interventions that mention blockage
    r"irrigation.*for (blockage|obstruction)",
    r"flushed.*for (blockage|obstruction)",
    r"flushed due to (blockage|obstruction)",
    r"foley irrigated due to (blockage|obstruction)",
    r"catheter irrigated due to (blockage|obstruction)",
]

# Phrases that explicitly say there is *no* obstruction / no blockage (complete from notebook)
NO_OBSTRUCTION_PATTERNS = [
    r"no urinary obstruction",
    r"no evidence of urinary obstruction",
    r"no bladder outlet obstruction",
    r"no obstruction to urine flow",
    r"no catheter blockage",
    r"no evidence of catheter blockage",
    r"foley draining well",
    r"foley draining appropriately",
    r"drainage adequate",
    r"adequate urine drainage",
    r"urine draining without difficulty",
]

obstruction_regex = re.compile("|".join(OBSTRUCTION_PATTERNS), re.IGNORECASE)
no_obstruction_regex = re.compile("|".join(NO_OBSTRUCTION_PATTERNS), re.IGNORECASE)

def extract_urinary_obstruction(text):
    """Extract urinary obstruction flag from text"""
    if not isinstance(text, str):
        return False
    
    has_obstruction = bool(obstruction_regex.search(text))
    has_no_obstruction = bool(no_obstruction_regex.search(text))
    
    return has_obstruction and not has_no_obstruction

discharge_flag = discharge.copy()
discharge_flag["urinary_obstruction_present_note"] = discharge_flag["text"].apply(extract_urinary_obstruction)

# Aggregate per admission (using .max() like notebook)
urinary_obstruction_adm = (
    discharge_flag
    .groupby(["subject_id", "hadm_id"], as_index=False)["urinary_obstruction_present_note"]
    .max()
    .rename(columns={"urinary_obstruction_present_note": "urinary_obstruction_present"})
)

urinary_obstruction_adm["urinary_obstruction_present"] = (
    urinary_obstruction_adm["urinary_obstruction_present"]
    .astype("boolean")
)

# Merge
df = df.merge(urinary_obstruction_adm, on=["subject_id", "hadm_id"], how="left")

# Fill missing with False (using .astype("boolean") like notebook)
df["urinary_obstruction_present"] = (
    df["urinary_obstruction_present"]
    .fillna(False)
    .astype("boolean")
)

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'urinary_obstruction_present' added. True count: {df['urinary_obstruction_present'].sum()}")
print(f"Dataset shape: {df.shape}")


Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 60)
Feature 'urinary_obstruction_present' added. True count: 1208
Dataset shape: (158020, 60)


In [9]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [10]:
# cols_to_drop = [
#  "urinary_obstruction_present"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)