In [1]:
"""
Feature: benign_prostatic_hyperplasia
Extract from diagnoses_icd using ICD codes (only for male patients)
"""
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [None]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# Load diagnoses
diagnoses = pd.read_csv(
    os.path.join(hosp_path, "diagnoses_icd.csv"),
    usecols=["subject_id", "hadm_id", "icd_code", "icd_version"]
)
diagnoses["icd_code"] = diagnoses["icd_code"].astype(str)

# BPH ICD codes (exact match)
bph_icd_codes = [
    "60000", "60001", "60010", "60011",
    "60020", "60021", "60090", "60091",
    "N40", "N400", "N401", "N402", "N403"
]

def has_icd_exact(icd_list, codes):
    return icd_list.isin(codes).any()

# Filter to male patients only
df_men = df[df["gender"].str.upper() == "M"].copy()

# Extract BPH for male patients
bph_df = (
    diagnoses[diagnoses["subject_id"].isin(df_men["subject_id"])]
    .groupby(["subject_id", "hadm_id"])
    .apply(
        lambda g: pd.Series({
            "benign_prostatic_hyperplasia": has_icd_exact(g["icd_code"], bph_icd_codes)
        }),
        include_groups=False
    )
    .reset_index()
)

# Merge with dataset
df = df.merge(
    bph_df,
    on=["subject_id", "hadm_id"],
    how="left"
)

# Fill missing values with False
df["benign_prostatic_hyperplasia"] = df["benign_prostatic_hyperplasia"].fillna(False).astype(bool)

# IMPORTANT: Explicitly set BPH to False for all female patients
# This ensures that even if there are data quality issues or merge problems,
# females will never have BPH = True
df.loc[df["gender"].str.upper().isin(["F", "FEMALE"]), "benign_prostatic_hyperplasia"] = False

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'benign_prostatic_hyperplasia' added. True count: {df['benign_prostatic_hyperplasia'].sum()}")

# Verify that no females have BPH = True
female_bph_count = df[df["gender"].str.upper().isin(["F", "FEMALE"]) & (df["benign_prostatic_hyperplasia"] == True)].shape[0]
if female_bph_count == 0:
    print(f"✓ Verified: No female patients have BPH = True (correct)")
else:
    print(f"⚠ WARNING: {female_bph_count} female patient(s) have BPH = True (this should not happen!)")

# Show gender distribution of BPH cases
print(f"\nBPH by gender:")
print(df.groupby(df["gender"].str.upper())["benign_prostatic_hyperplasia"].agg(["sum", "count"]))
print(f"Dataset shape: {df.shape}")


  df["benign_prostatic_hyperplasia"] = df["benign_prostatic_hyperplasia"].fillna(False).astype(bool)



Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 48)
Feature 'benign_prostatic_hyperplasia' added. True count: 13336
Dataset shape: (158020, 48)


In [4]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [5]:
# cols_to_drop = [
#  "benign_prostatic_hyperplasia"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)