In [1]:
"""
Feature: recent_urologic_abdominal_surgery
Extract from procedures_icd and discharge notes, check if within recent timeframe
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [2]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)
df["admittime"] = pd.to_datetime(df["admittime"])

# Load procedures
d_icd_procedures = pd.read_csv(os.path.join(hosp_path, "d_icd_procedures.csv"))
procedures = pd.read_csv(
    os.path.join(hosp_path, "procedures_icd.csv"),
    usecols=["subject_id", "hadm_id", "icd_code", "icd_version"]
)
procedures["icd_code"] = procedures["icd_code"].astype(str)
procedures["icd_version"] = procedures["icd_version"].astype(int)

# Identify urologic/abdominal surgery codes
d_icd_procedures["icd_code"] = d_icd_procedures["icd_code"].astype(str)

icd10_uro_abd_pattern = r'^0(?:M|N|D|F|TT|TB|T1|T5|T7|T9|TF|TN|TP|VB|VT|DT|FT|DQ|DB|DV|DW|WJ)[A-Z0-9]{4}$'
icd9_uro_abd_pattern = r'^(?:55|56|57|58|60|62|63|47|49|51|52|53|54)\.'

icd10_uro_abd_mask = d_icd_procedures["icd_code"].str.match(icd10_uro_abd_pattern)
icd9_uro_abd_mask = d_icd_procedures["icd_code"].str.match(icd9_uro_abd_pattern)

urologic_abdominal_codes = d_icd_procedures.loc[
    icd10_uro_abd_mask | icd9_uro_abd_mask,
    "icd_code"
].drop_duplicates()
urologic_abdominal_codes_set = set(urologic_abdominal_codes)

# Flag in procedures
procedures["is_urologic_abdominal_surg"] = procedures["icd_code"].isin(urologic_abdominal_codes_set)

# Get procedure dates (need to merge with procedures_icd datetime if available)
# For now, use hadm_id level flag
uro_abd_from_proc = (
    procedures.groupby("hadm_id")["is_urologic_abdominal_surg"]
    .any()
    .reset_index()
    .rename(columns={"is_urologic_abdominal_surg": "uro_abd_from_procedures"})
)

# Load discharge notes
discharge = pd.read_csv(
    os.path.join(note_path, "discharge.csv"),
    usecols=["subject_id", "hadm_id", "text"]
)

uro_abd_regex = (
    r"\b(?:TURP|transurethral|prostatectom\w*|nephrectom\w*|nephrotom\w*|"
    r"cystectom\w*|cystotom\w*|pyeloplast\w*|ureterolithotom\w*|lithotripsy|"
    r"ureteroscop\w*|orchiectom\w*|orchiopex\w*|testicular\s+surgery|"
    r"kidney\s+surgery|ureter\s+surgery|bladder\s+surgery|prostate\s+surgery|"
    r"urologic\s+surgery|laparotomy|laparoscopy|abdominal\s+surgery|"
    r"appendectom\w*|cholecystectom\w*|colectom\w*|gastrectom\w*|"
    r"hepatectom\w*|splenectom\w*|bowel\s+resection|intestinal\s+resection)"
    r"\b"
)

discharge["uro_abd_from_notes"] = discharge["text"].str.contains(
    uro_abd_regex, case=False, na=False, regex=True
)

uro_abd_from_notes = (
    discharge.groupby("hadm_id")["uro_abd_from_notes"]
    .any()
    .reset_index()
)

# Merge
df = df.merge(uro_abd_from_proc, on="hadm_id", how="left")
df = df.merge(uro_abd_from_notes, on="hadm_id", how="left")

# Safe boolean handling
df["uro_abd_from_procedures"] = (
    df["uro_abd_from_procedures"].astype("boolean").fillna(False).astype(bool)
)
df["uro_abd_from_notes"] = (
    df["uro_abd_from_notes"].astype("boolean").fillna(False).astype(bool)
)

# Final flag (recent = within this admission or previous 90 days)
# For simplicity, if present in this admission, consider it recent
df["recent_urologic_abdominal_surgery"] = (
    df["uro_abd_from_procedures"] | df["uro_abd_from_notes"]
)

# Drop intermediate columns
df.drop(columns=["uro_abd_from_procedures", "uro_abd_from_notes"], inplace=True)

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'recent_urologic_abdominal_surgery' added. True count: {df['recent_urologic_abdominal_surgery'].sum()}")
print(f"Dataset shape: {df.shape}")


Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 54)
Feature 'recent_urologic_abdominal_surgery' added. True count: 43480
Dataset shape: (158020, 54)


In [3]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [5]:
# cols_to_drop = [
#  "recent_urologic_abdominal_surgery"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)