In [1]:
"""
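Feature: gram_negative_organisms_present
Boolean flag per admission (hadm_id): True when any urine-culture result for
that admission names a gram-negative organism, False otherwise (including
admissions with no urine culture).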
"""
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [2]:
# Load the working dataset and remember its size so the row count can be
# checked again after the feature has been merged in.
df = pd.read_csv(dataset_path)
initial_row_count = len(df)

# Read only the microbiology columns this feature needs.
microbiology_columns = ["subject_id", "hadm_id", "spec_type_desc", "test_name", "org_name"]
microbiology_path = os.path.join(hosp_path, "microbiologyevents.csv")
microbiology_events = pd.read_csv(microbiology_path, usecols=microbiology_columns)

# Keep rows where both the specimen type and the test name identify a urine
# culture. The comparison is case-insensitive; missing values compare False.
is_urine_specimen = microbiology_events["spec_type_desc"].str.upper().eq("URINE")
is_urine_culture_test = microbiology_events["test_name"].str.upper().eq("URINE CULTURE")

# .copy() so that adding columns later does not write into a view of the
# full microbiology table.
urine_culture_df = microbiology_events.loc[is_urine_specimen & is_urine_culture_test].copy()

# Organism names treated as gram-negative. Entries sharing a genus are kept on
# one line; the overall order of the list is unchanged.
gram_negative_organisms = [
    "Escherichia coli",
    "Klebsiella pneumoniae", "Klebsiella oxytoca",
    "Proteus mirabilis", "Proteus vulgaris", "Proteus hauseri",
    "Pseudomonas aeruginosa", "Pseudomonas fluorescens", "Pseudomonas putida",
    "Enterobacter cloacae complex", "Enterobacter cloacae", "Enterobacter aerogenes",
    "Enterobacter asburiae", "Enterobacter species", "Enterobacter amnigenus 2",
    "Citrobacter freundii complex", "Citrobacter koseri", "Citrobacter amalonaticus",
    "Citrobacter sedlakii", "Citrobacter species",
    "Serratia marcescens", "Serratia liquefaciens",
    "Morganella morganii",
    "Hafnia alvei",
    "Providencia stuartii", "Providencia rettgeri",
    "Pantoea species",
    "Acinetobacter baumannii", "Acinetobacter baumannii complex",
    "Acinetobacter lwoffii", "Acinetobacter sp.",
    "Burkholderia cepacia group",
    "Stenotrophomonas maltophilia",
    "Achromobacter sp.",
    "Salmonella Montevideo",
    # Non-specific Gram-stain labels (no genus/species in the name).
    "Gram negative rod(s)",
    "Gram negative rod #1", "Gram negative rod #2", "Gram negative rod #3",
    "Gram negative diplococci",
    "Gram negative bacteria",
]

# Normalised (trimmed, lower-cased) copy used for case-insensitive exact matching.
gram_negative_organisms_lower = [
    organism_name.strip().lower() for organism_name in gram_negative_organisms
]

# Mark each urine-culture row whose organism name appears in the gram-negative
# list. Matching is exact after trimming whitespace and lower-casing; missing
# organism names do not match any entry.
urine_culture_df["is_gram_negative"] = (
    urine_culture_df["org_name"]
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(gram_negative_organisms_lower)
)

# Collapse to one row per admission: True if any of its urine-culture rows was
# flagged. groupby skips rows whose hadm_id is missing (pandas default).
admission_gram_negative = (
    urine_culture_df
    .groupby("hadm_id")["is_gram_negative"]
    .any()
    .reset_index()
    .rename(columns={"is_gram_negative": "gram_negative_organisms_present"})
)

# The dataset file is read from and later written back to the same path, so on
# a re-run it already contains this feature. Drop any existing copy first;
# otherwise the merge would create *_x / *_y suffixed columns and the column
# lookup below would raise KeyError. On a first run this is a no-op.
df = df.drop(columns=["gram_negative_organisms_present"], errors="ignore")

# Left merge keeps every dataset row. The right side holds one row per hadm_id
# (it comes from a groupby), and validate enforces that so the merge can never
# silently duplicate dataset rows.
df = df.merge(admission_gram_negative, on="hadm_id", how="left", validate="many_to_one")

# Admissions without any urine culture have no match and come back as NaN;
# treat those as False and store the feature as a plain boolean column.
df["gram_negative_organisms_present"] = df["gram_negative_organisms_present"].where(
    df["gram_negative_organisms_present"].notna(), False
).astype(bool)

# Persist the dataset, now carrying the new feature column, back to the file
# it was read from.
df.to_csv(dataset_path, index=False)

# Diagnostics: the left merge against one row per admission should leave the
# row count unchanged, so compare against the count taken right after loading.
final_row_count = len(df)
print(f"\nTotal rows: {final_row_count}")
print(f"Expected total: {initial_row_count}")
if final_row_count == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {final_row_count} (difference: {final_row_count - initial_row_count})")

# Shape is reported once; the identical line used to be printed a second time
# after the feature summary.
print(f"Dataset shape: {df.shape}")
print(f"Feature 'gram_negative_organisms_present' added. True count: {df['gram_negative_organisms_present'].sum()}")



Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 89)
Feature 'gram_negative_organisms_present' added. True count: 8951
Dataset shape: (158020, 89)


In [3]:
# Display the column index of the updated dataset for a quick visual check.
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
# Optional rollback (kept disabled): uncomment to remove the feature column
# from df and overwrite the dataset file without it.
# NOTE(review): drop_columns comes from utils; presumably it returns the frame
# without the listed columns — confirm before relying on it.
# cols_to_drop = [
#  "gram_negative_organisms_present"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)