In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Resolve the project root.
# - Run as a script: __file__ exists, and the root is three directories above it
#   (the file is expected to live in scripts/silver_to_gold/).
# - Run in a notebook kernel: __file__ is undefined, so fall back to two levels
#   above the current working directory.
#   NOTE(review): this assumes the kernel was started in scripts/silver_to_gold.
PROJECT_ROOT = Path(__file__).parent.parent.parent if '__file__' in globals() else Path.cwd().parent.parent
file_path = PROJECT_ROOT / "data" / "silver" / "silver_dataset.csv"

# The fallback above depends on the working directory, so fail early with a
# message that names the resolved root instead of a bare read_csv traceback.
if not file_path.is_file():
    raise FileNotFoundError(
        f"Silver dataset not found at {file_path} "
        f"(PROJECT_ROOT resolved to {PROJECT_ROOT}); "
        "run this notebook from scripts/silver_to_gold or adjust PROJECT_ROOT."
    )

df = pd.read_csv(file_path)

# pandas labels header-less CSV columns "Unnamed: N". Such a column typically
# appears when a frame was saved with its index and read back without
# index_col; it is only a row counter, not a feature. Drop it here so it cannot
# flow into the gold dataset. The guard keeps this a no-op if the silver file
# is later regenerated without the index column.
index_artifact_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]
if index_artifact_cols:
    print(f"Dropping index artifact columns: {index_artifact_cols}")
    df = df.drop(columns=index_artifact_cols)

print(f"Loaded dataset with shape: {df.shape}")
df.head()


Loaded dataset with shape: (158020, 118)


Unnamed: 0,subject_id,hadm_id,gender,anchor_age,catheter_present,bmi,diabetes,cancer,chronic_kidney_disease,neurogenic_bladder,...,catheter_type_foley,catheter_type_nephrostomy,catheter_type_straight,catheter_type_suprapubic,catheter_type_unknown,catheter_type_ureteral,other_uti_uti_unspecified,other_uti_cystitis,other_uti_pyelonephritis,other_uti_urethritis
0,10000032,22595853,0,-0.643593,1,-1.537167,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,10000032,29079034,0,-0.643593,1,-1.506081,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,10000032,25742920,0,-0.643593,1,-1.506081,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10000560,28979390,0,-0.585292,1,0.063762,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,10000690,25860671,0,1.338645,1,-0.760017,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [2]:
# Columns excluded from the gold dataset, grouped by the reason for exclusion.
# Group order and in-group order define the order of FEATURES_TO_DROP.
_DROP_GROUPS = {
    "Outcome / administrative": [
        "length_of_stay",
    ],
    "Diagnosis / infection flags (leakage)": [
        "other_uti_present",
        "has_cauti_history",
    ],
    "Catheter actions driven by infection": [
        "catheter_removal",
        "catheter_removal_replacement",
    ],
    "Antibiotic / treatment reactions": [
        "antibiotics_per_admission",
        "recent_antibiotic_use",
    ],
    "Labs & infection evidence (soft leakage)": [
        "urinalysis_wbc",
        "urinalysis_rbc",
        "blood_wbc",
        "creatinine",
        "procalcitonin_measured",
        "urine_culture_performed",
        "blood_culture_performed",
        "gram_negative_organisms_present",
        "gram_positive_organisms_present",
        "fungi_present",
        "blood_crp_measured",
        "cfu_count_measured",
    ],
    "Physiologic response / monitoring": [
        "oliguria",
        "urine_output_measured",
    ],
    "Vitals (post-infection effects)": [
        "temperature",
        "heart_rate",
        "resp_rate",
        "o2sat",
        "bp_systolic",
        "bp_diastolic",
    ],
    "Symptoms / downstream effects": [
        "pain_documented",
    ],
    "Tests indicating UTI knowledge": [
        "nitrite_tested",
        "nitrite_positive",
    ],
    "Discharge location (pure outcome)": [
        "discharge_location_acute_hospital",
        "discharge_location_against_advice",
        "discharge_location_assisted_living",
        "discharge_location_chroniclong_term_acute_care",
        "discharge_location_died",
        "discharge_location_healthcare_facility",
        "discharge_location_home",
        "discharge_location_home_health_care",
        "discharge_location_hospice",
        "discharge_location_other_facility",
        "discharge_location_psych_facility",
        "discharge_location_rehab",
        "discharge_location_skilled_nursing_facility",
    ],
    "Explicit UTI subtype indicators": [
        "other_uti_uti_unspecified",
        "other_uti_cystitis",
        "other_uti_pyelonephritis",
        "other_uti_urethritis",
    ],
}

# Flatten the groups into the single ordered list used below.
FEATURES_TO_DROP = [name for group in _DROP_GROUPS.values() for name in group]

# Split the requested names into those present in the frame and those absent,
# in one pass, keeping the order of FEATURES_TO_DROP in both lists.
existing_cols_to_drop, missing_cols = [], []
for name in FEATURES_TO_DROP:
    bucket = existing_cols_to_drop if name in df.columns else missing_cols
    bucket.append(name)

print(f"Columns to drop: {len(existing_cols_to_drop)}")
# Report absent names rather than failing, so a changed silver schema is visible.
if missing_cols:
    print(f"Warning: {len(missing_cols)} columns not found in dataset: {missing_cols}")

# Build a new frame; `df` itself is left untouched so this cell can be re-run.
df_filtered = df.drop(columns=existing_cols_to_drop, errors='ignore')

# Before/after summary of the column filtering.
print(f"\nOriginal shape: {df.shape}")
print(f"Filtered shape: {df_filtered.shape}")
print(f"Columns dropped: {df.shape[1] - df_filtered.shape[1]}")


Columns to drop: 47

Original shape: (158020, 118)
Filtered shape: (158020, 71)
Columns dropped: 47


In [3]:
# Target location for the filtered dataset: <PROJECT_ROOT>/data/gold/gold_dataset.csv
gold_dir = PROJECT_ROOT / "data" / "gold"
save_path = gold_dir / "gold_dataset.csv"

# Create the folder (and any missing parents) on first run; no-op if it exists.
gold_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the written CSV.
df_filtered.to_csv(save_path, index=False)

# Confirm where the file went and what was written.
print(
    f"Gold dataset saved at: {save_path}",
    f"Final dataset shape: {df_filtered.shape}",
    sep="\n",
)


Gold dataset saved at: C:\Users\Coditas\Desktop\Projects\Cauti\data\gold\gold_dataset.csv
Final dataset shape: (158020, 71)
