In [1]:
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [2]:
# =====================================================
# 1. Read main dataset
# =====================================================
df = pd.read_csv(dataset_path)

# =====================================================
# 2. Load diagnosis tables
# =====================================================
diagnoses_icd = pd.read_csv(
    os.path.join(hosp_path, "diagnoses_icd.csv"),
    usecols=["subject_id", "hadm_id", "icd_code", "icd_version"]
)

# =====================================================
# 3. Normalize ICD codes (remove dot)
# =====================================================
diagnoses_icd["icd_code_nodot"] = (
    diagnoses_icd["icd_code"].astype(str).str.replace(".", "", regex=False)
)
diagnoses_icd["icd_version"] = diagnoses_icd["icd_version"].astype(int)

# =====================================================
# 4. Expanded UTI ICD code → category mapping
# =====================================================
UTI_CATEGORY_MAP = {

    # ---------------- ICD-9 ----------------
    "5950": "cystitis",
    "5951": "cystitis",
    "5952": "cystitis",
    "5953": "cystitis",
    "5954": "cystitis",
    "5959": "cystitis",

    "5900": "pyelonephritis",
    "5901": "pyelonephritis",
    "5902": "pyelonephritis",
    "5903": "pyelonephritis",
    "5908": "pyelonephritis",
    "5909": "pyelonephritis",

    "5990": "UTI unspecified",

    "59780": "urethritis",
    "59789": "urethritis",

    # ---------------- ICD-10 ----------------
    "N300": "cystitis",
    "N301": "cystitis",
    "N302": "cystitis",
    "N303": "cystitis",
    "N304": "cystitis",
    "N308": "cystitis",
    "N309": "cystitis",

    "N10":  "pyelonephritis",
    "N11":  "pyelonephritis",
    "N110": "pyelonephritis",
    "N119": "pyelonephritis",
    "N12":  "pyelonephritis",

    "N341": "urethritis",
    "N342": "urethritis",
    "N349": "urethritis",

    "N390": "UTI unspecified",
}

# =====================================================
# 5. Filter expanded UTI diagnoses
# =====================================================
uti_dx = diagnoses_icd[
    diagnoses_icd["icd_code_nodot"].isin(UTI_CATEGORY_MAP.keys())
].copy()

# Assign clinical category
uti_dx["uti_category"] = uti_dx["icd_code_nodot"].map(UTI_CATEGORY_MAP)

# =====================================================
# 6. Aggregate to admission level
# =====================================================
uti_admission = (
    uti_dx
    .groupby(["subject_id", "hadm_id"])["uti_category"]
    .agg(lambda x: sorted(set(x)))
    .reset_index(name="other_uti")
)

# =====================================================
# 7. Boolean flag
# =====================================================
uti_admission["other_uti_present"] = uti_admission["other_uti"].apply(
    lambda x: len(x) > 0
)

# =====================================================
# 8. Merge into main dataset
# =====================================================
df = df.merge(
    uti_admission,
    on=["subject_id", "hadm_id"],
    how="left"
)

# Fill defaults
df["other_uti"] = df["other_uti"].apply(
    lambda x: x if isinstance(x, list) else []
)

df["other_uti_present"] = (
    df["other_uti_present"]
    .fillna(False)
    .astype(bool)
)

# =====================================================
# 9. Save dataset
# =====================================================
df.to_csv(dataset_path, index=False)

# =====================================================
# Diagnostics
# =====================================================
print("✔ other_uti_present & other_uti columns created")
print(df["other_uti_present"].value_counts())
print("\nUTI category distribution:")
print(df["other_uti"].explode().value_counts())
print(f"Dataset shape: {df.shape}")

  .fillna(False)


✔ other_uti_present & other_uti columns created
other_uti_present
False    118791
True      39229
Name: count, dtype: int64

UTI category distribution:
other_uti
UTI unspecified    37080
pyelonephritis      1626
cystitis             617
urethritis            54
Name: count, dtype: int64
Dataset shape: (158020, 100)


In [4]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [5]:
# cols_to_drop = [
#  "other_uti_present",
#  "other_uti"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)