In [1]:
"""
Feature: fungi_present
Flag whether a fungal organism (yeast / Candida species) was found in any urine culture of the admission
"""
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [2]:
# Build the admission-level boolean feature 'fungi_present': True when any
# urine-culture row for the admission reports one of the listed fungal
# organisms, False otherwise (including admissions with no urine culture).
# The result is written back to dataset_path, overwriting the input file.

# Read dataset; remember the row count so the merge can be sanity-checked below.
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# This cell overwrites dataset_path, so on a re-run the file already carries a
# 'fungi_present' column. Without dropping it, the merge below would produce
# 'fungi_present_x' / 'fungi_present_y' and the fill step would raise KeyError.
# errors="ignore" keeps the first run (column absent) working unchanged.
df = df.drop(columns=["fungi_present"], errors="ignore")

# Load microbiology events (only the columns needed for this feature)
microbiology_events = pd.read_csv(
    os.path.join(hosp_path, "microbiologyevents.csv"),
    usecols=["subject_id", "hadm_id", "spec_type_desc", "test_name", "org_name"]
)

# Filter urine culture rows; the comparison is case-insensitive, and rows with a
# missing specimen type or test name compare as False and are excluded.
urine_culture_df = microbiology_events[
    (microbiology_events["spec_type_desc"].str.upper() == "URINE") &
    (microbiology_events["test_name"].str.upper() == "URINE CULTURE")
].copy()

# Fungi organisms list (complete from notebook)
fungi_organisms = [
    "Yeast",
    "Yeast, presumptively not C. albicans",
    "Candida albicans",
    "Candida albicans, presumptive identification",
    "Candida glabrata",
    "Candida parapsilosis",
    "Candida tropicalis",
    "Candida dubliniensis",
    "Candida krusei",
    "Candida lusitaniae",
    "Candida kefyr"
]

# Normalised (trimmed, lower-cased) names for case-insensitive exact matching
fungi_organisms_lower = [org.strip().lower() for org in fungi_organisms]

# Flag fungi (using .isin() for exact match like notebook).
# astype(str) turns a missing org_name into the string "nan", which is not in
# the list, so rows without an organism are flagged False.
urine_culture_df["is_fungi"] = (
    urine_culture_df["org_name"]
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(fungi_organisms_lower)
)

# Admission-level flag (groupby hadm_id only, like notebook): True if any urine
# culture row of the admission is a fungus. groupby drops rows whose hadm_id is
# missing, so such cultures do not contribute to any admission.
admission_fungi = (
    urine_culture_df
    .groupby("hadm_id")["is_fungi"]
    .any()
    .reset_index()
    .rename(columns={"is_fungi": "fungi_present"})
)

# Merge (on hadm_id only, like notebook). admission_fungi has one row per
# hadm_id, so a left merge cannot add or remove dataset rows; validate makes
# pandas enforce that uniqueness instead of silently duplicating rows.
df = df.merge(admission_fungi, on="hadm_id", how="left", validate="many_to_one")

# Fill missing with False (using .where() like notebook): admissions without a
# urine culture have no match in admission_fungi and get NaN from the merge.
df["fungi_present"] = df["fungi_present"].where(
    df["fungi_present"].notna(), False
).astype(bool)

# Save (overwrites the input dataset in place)
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'fungi_present' added. True count: {df['fungi_present'].sum()}")
print(f"Dataset shape: {df.shape}")




Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 91)
Feature 'fungi_present' added. True count: 4612
Dataset shape: (158020, 91)


In [3]:
# Inspect the dataset's column labels after the feature cell has run
# (as the cell's last expression, the Index is shown as the cell output).
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
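# Disabled rollback helper: uncomment to remove the 'fungi_present' column
# from df and re-save the dataset to dataset_path.
# cols_to_drop = [
#  "fungi_present"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)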