In [1]:
"""
Feature: improper_drainage_position
Extract from discharge notes using regex patterns
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [2]:
# Base dataset that will receive the new feature column. Its row count is
# remembered so the merge further down can be checked for lost/duplicated rows.
df = pd.read_csv(dataset_path)
initial_row_count = df.shape[0]

# Discharge notes: only the admission keys and the free text are needed.
discharge_csv = os.path.join(note_path, "discharge.csv")
discharge = pd.read_csv(discharge_csv, usecols=["subject_id", "hadm_id", "text"])

# Phrases taken as evidence of IMPROPER drainage positioning.
# Note: re.DOTALL is not set, so every ".*" below only spans text on one line.
IMPROPER_PATTERNS = [
    # -- bag explicitly described as above / higher than the bladder --
    r"drainage bag.*above.*bladder",
    r"urine bag.*above.*bladder",
    r"foley bag.*above.*bladder",
    r"bag.*above bladder",
    r"bag.*higher than.*bladder",
    r"bag.*elevated",
    # -- backflow / reflux wording --
    r"urine backflow",
    r"backflow of urine",
    r"urine backing up",
    r"reflux of urine",
    # -- dependent loops, kinks and blocked or poorly draining catheters --
    r"dependent loop",
    r"kinked tubing",
    r"foley kinked",
    r"obstructed foley",
    r"poor drainage from foley",
    r"foley not draining",
    r"urine not draining",
    # -- wording that implies the bag position had to be corrected --
    r"bag repositioned",
    r"drainage improved after lowering"
]

# Phrases taken as evidence of PROPER drainage positioning; they are used to
# veto an improper match found in the same note.
PROPER_PATTERNS = [
    r"bag.*below.*bladder",
    r"kept.*below.*bladder level",
    r"ensure.*bag.*below.*bladder",
    r"dependent drainage",
    r"free drainage",
    r"no obstruction",
    r"no backflow"
]

# One case-insensitive alternation per pattern list.
improper_regex, proper_regex = (
    re.compile("|".join(patterns), re.IGNORECASE)
    for patterns in (IMPROPER_PATTERNS, PROPER_PATTERNS)
)

def extract_improper_drainage(text):
    """Flag a note whose text points to improper drainage-bag positioning.

    Args:
        text: Note text. Any non-string value (e.g. a missing value read from
            the CSV) is treated as "no evidence".

    Returns:
        bool: True only when ``improper_regex`` matches somewhere in ``text``
        and ``proper_regex`` matches nowhere in it; False otherwise.
    """
    # Guard: nothing to search in a non-string value.
    if not isinstance(text, str):
        return False

    # Without an improper phrase the note can never be flagged.
    if improper_regex.search(text) is None:
        return False

    # A proper-positioning phrase anywhere in the same note vetoes the flag.
    return proper_regex.search(text) is None

# Flag each discharge note individually (work on a copy so `discharge` itself
# is left unmodified).
discharge_flag = discharge.copy()
discharge_flag["improper_drainage_position_note"] = discharge_flag["text"].apply(extract_improper_drainage)

# Aggregate per admission: .max() over booleans is True as soon as any note of
# the admission was flagged. The result has one row per (subject_id, hadm_id).
improper_drainage_adm = (
    discharge_flag
    .groupby(["subject_id", "hadm_id"], as_index=False)["improper_drainage_position_note"]
    .max()
    .rename(columns={"improper_drainage_position_note": "improper_drainage_position"})
)

# Enforce pandas' nullable Boolean dtype on the aggregated flag.
improper_drainage_adm["improper_drainage_position"] = (
    improper_drainage_adm["improper_drainage_position"]
    .astype("boolean")
)

# Re-run safety: dataset_path is both the input and the output of this cell, so
# on a second run `df` already carries the feature column. Merging on top of it
# would create `improper_drainage_position_x` / `_y` and the fill step below
# would fail with a KeyError. Drop any stale copy first (no-op on a first run).
df = df.drop(columns=["improper_drainage_position"], errors="ignore")

# Left merge keeps every dataset row. The right side is unique per key after
# the groupby, and validate= makes pandas raise if that ever stops being true
# instead of silently duplicating rows.
df = df.merge(
    improper_drainage_adm,
    on=["subject_id", "hadm_id"],
    how="left",
    validate="many_to_one",
)

# Admissions without any discharge note get False rather than a missing value.
df["improper_drainage_position"] = (
    df["improper_drainage_position"]
    .fillna(False)
    .astype("boolean")
)

# Save (overwrites the input dataset in place)
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'improper_drainage_position' added. True count: {df['improper_drainage_position'].sum()}")
print(f"Dataset shape: {df.shape}")


Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 61)
Feature 'improper_drainage_position' added. True count: 46
Dataset shape: (158020, 61)


In [3]:
# Show the column labels of df (e.g. to check for the newly added feature column).
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [5]:
# Optional cleanup, intentionally left disabled. If uncommented, it passes the
# feature column name to drop_columns (helper from utils; presumably removes the
# listed columns from df - confirm against utils) and then overwrites the CSV at
# dataset_path with the result.
# cols_to_drop = [
#  "improper_drainage_position"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)