In [1]:
"""
Feature: cfu_count
Extract CFU count from microbiologyevents comments AND discharge notes
Takes the maximum CFU count from both sources per admission (matching notebook)
"""
import pandas as pd
import os
import re
import numpy as np
from config_paths import *
from utils import drop_columns

In [2]:
# Load the working dataset and remember its size for the final row-count check
df = pd.read_csv(dataset_path)
initial_row_count = df.shape[0]

# ============================================================
# 1. Extract CFU from microbiologyevents comments
# ============================================================
mbe_csv = os.path.join(hosp_path, "microbiologyevents.csv")
mbe_columns = ["subject_id", "hadm_id", "spec_type_desc", "test_name", "comments"]
microbiology_events = pd.read_csv(mbe_csv, usecols=mbe_columns)

# Keep only urine cultures: specimen type URINE and test name URINE CULTURE
# (both compared case-insensitively; missing values never match)
is_urine_specimen = microbiology_events["spec_type_desc"].str.upper().eq("URINE")
is_urine_culture_test = microbiology_events["test_name"].str.upper().eq("URINE CULTURE")
urine_culture_df = microbiology_events.loc[is_urine_specimen & is_urine_culture_test].copy()

# Extract CFU from comments
def extract_cfu_from_mbe_comment(comment):
    """
    Pull the first CFU count out of a microbiology-events comment.

    The count is the first run of digits (commas allowed) that is directly
    followed by 'organisms/ml' or 'CFU/mL', matched case-insensitively.

    Returns the count as a float, or NaN when the comment is missing or
    contains no such expression.
    """
    if pd.isna(comment):
        return np.nan

    hit = re.search(r"(\d[\d,]*)\s*(?:organisms/ml|CFU/mL)", str(comment), flags=re.I)
    if hit is None:
        return np.nan

    # Strip thousands separators before converting, e.g. '100,000' -> '100000'
    digits = hit.group(1).replace(",", "")
    try:
        return float(digits)
    except ValueError:
        return np.nan

# Parse every urine-culture comment into a numeric CFU value (NaN when none is found)
urine_culture_df["cfu_count_from_mbe"] = urine_culture_df["comments"].apply(
    extract_cfu_from_mbe_comment
)

# Collapse to one row per admission, keeping the largest CFU value seen
mbe_cfu_by_admission = urine_culture_df.groupby("hadm_id")["cfu_count_from_mbe"]
cfu_from_mbe_hadm = mbe_cfu_by_admission.max().reset_index()

# ============================================================
# 2. Extract CFU from discharge notes
# ============================================================
def extract_cfu_from_text(text):
    """
    Extract the first CFU count mentioned in a free-text discharge note.

    Recognised forms (case-insensitive, each directly followed by 'CFU'):
        '100,000 CFU'     plain number, thousands separators allowed
        '1 × 10^5 CFU'    scientific notation ('x', '×' or '*'; '^' optional)
        '1x10^5 CFU'
        '10^5 CFU'        bare power of ten

    Args:
        text: Note text. NaN/None is treated as "no note".

    Returns:
        float: The parsed count, or NaN when the text is missing, contains no
        recognisable CFU expression, or the value is not a finite float.
    """
    if pd.isna(text):
        return np.nan

    # The look-behind stops the match from starting in the middle of a number
    # or exponent; without it '10^5 CFU' can degrade to the trailing '5'.
    # Alternatives are tried in order: scientific, bare power of ten, plain.
    m = re.search(
        r"(?<![\d.,^])"
        r"(?:"
        r"(?P<base>\d[\d,]*(?:\.\d+)?)\s*[x×*]\s*10\^?(?P<sci_exp>\d+)"
        r"|10\^(?P<pow_exp>\d+)"
        r"|(?P<plain>\d[\d,]*(?:\.\d+)?)"
        r")\s*CFU",
        str(text),
        flags=re.I,
    )
    if m is None:
        return np.nan

    try:
        if m.group("plain") is not None:
            value = float(m.group("plain").replace(",", ""))
        elif m.group("pow_exp") is not None:
            value = 10.0 ** int(m.group("pow_exp"))
        else:
            base = float(m.group("base").replace(",", ""))
            value = base * 10.0 ** int(m.group("sci_exp"))
    except (OverflowError, ValueError):
        # Absurdly large exponents/digit strings are treated as unparseable
        # rather than aborting the whole column.
        return np.nan

    # float() and float multiplication can yield inf without raising.
    return value if np.isfinite(value) else np.nan


# Only a missing notes file is treated as "notes not available". Every other
# failure (missing column, parsing bug, interrupt, ...) must surface instead of
# silently producing an all-NaN notes feature.
try:
    discharge = pd.read_csv(
        os.path.join(note_path, "discharge.csv"),
        usecols=["subject_id", "hadm_id", "text"],
    )
except FileNotFoundError:
    print("discharge.csv not found - CFU counts will come from microbiologyevents only.")
    cfu_from_notes_hadm = pd.DataFrame(columns=["hadm_id", "cfu_count_from_notes"])
else:
    # Apply on discharge notes
    discharge["cfu_count_from_notes"] = discharge["text"].apply(extract_cfu_from_text)

    # Max CFU per admission from discharge notes
    cfu_from_notes_hadm = (
        discharge.groupby("hadm_id")["cfu_count_from_notes"]
        .max()
        .reset_index()
    )

# ============================================================
# 3. Merge both sources and take maximum
# ============================================================
# Attach the per-admission CFU from microbiology events
df = pd.merge(df, cfu_from_mbe_hadm, on="hadm_id", how="left")

# Attach the per-admission CFU from discharge notes; when no note-derived
# values exist, add an all-NaN placeholder so the row-wise max still works
if cfu_from_notes_hadm.empty:
    df["cfu_count_from_notes"] = np.nan
else:
    df = pd.merge(df, cfu_from_notes_hadm, on="hadm_id", how="left")

# Final CFU: greatest of the two sources (NaN only when both are missing)
source_columns = ["cfu_count_from_mbe", "cfu_count_from_notes"]
df["cfu_count"] = df[source_columns].max(axis=1)

# The per-source columns were only intermediates
df = df.drop(columns=source_columns, errors="ignore")

# Validate BEFORE saving: the left merges above must neither add nor drop rows.
# Checking first guarantees a bad merge can never overwrite the dataset on disk.
total = len(df)
if total != initial_row_count:
    raise ValueError(
        f"Row count mismatch! Expected {initial_row_count}, got {total} "
        f"(difference: {total - initial_row_count}); dataset was NOT saved."
    )

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
missing = df["cfu_count"].isna().sum()
non_missing = df["cfu_count"].notna().sum()

print("\nFeature 'cfu_count' added.")
print(f"Missing: {missing}, Non-Missing: {non_missing}, Total: {total}")
print(f"Expected total: {initial_row_count}")
print(f"✓ Row count matches expected total ({initial_row_count})")
print(f"Dataset shape: {df.shape}")


Feature 'cfu_count' added.
Missing: 145635, Non-Missing: 12385, Total: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 88)


In [3]:
# Display the column labels of the updated dataframe.
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
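# Disabled rollback snippet: if uncommented, the lines below pass "cfu_count"
# to the project helper `drop_columns` and re-save the dataset to `dataset_path`.
# cols_to_drop = [
#  "cfu_count"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)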