In [1]:
"""
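Feature: recent_antibiotic_use
Flag admissions that have an antibiotic prescription started within the 30 days
up to (and including) discharge of that same admission. The window is anchored
on dischtime, not admittime.
NOTE(review): this feature was originally described as "antibiotics used within
30 days before admission"; confirm which anchor is intended.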
"""
import pandas as pd
import os
import re
from config_paths import *
from utils import drop_columns

In [2]:
# Load the working dataset. Its row count is captured immediately so the
# diagnostics at the end of this cell can confirm that no rows were gained or
# lost while the feature was being added.
df = pd.read_csv(dataset_path)
initial_row_count = df.shape[0]
df["dischtime"] = pd.to_datetime(df["dischtime"])

# Load only the prescription columns listed here ("stoptime" is read but not
# referenced again in this cell).
prescription_columns = ["subject_id", "starttime", "stoptime", "hadm_id", "drug"]
prescriptions_file = os.path.join(hosp_path, "prescriptions.csv")
prescriptions = pd.read_csv(prescriptions_file, usecols=prescription_columns)

# Lower-case name fragments (generic and brand names) used to recognise an
# antibiotic prescription. Matching is by substring, so an entry also matches
# any longer drug string that contains it.
antibiotic_keywords = [
    "ampicillin", "amoxicillin", "amoxicillin-clavulanate",
    "augmentin",  # brand for amox-clav
    "penicillin", "piperacillin",
    "zosyn",  # brand for pip-tazo
    "tazobactam",
    "cefazolin", "ceftriaxone", "cefepime", "ceftaroline",
    "cephalexin", "cefuroxime", "cefotaxime",
    "meropenem", "ertapenem", "imipenem", "cilastatin", "aztreonam",
    "gentamicin", "tobramycin", "amikacin",
    "ciprofloxacin", "levofloxacin", "moxifloxacin",
    "azithromycin", "erythromycin", "clarithromycin",
    "vancomycin", "daptomycin", "linezolid",
    "doxycycline", "tetracycline", "minocycline",
    "trimethoprim", "sulfamethoxazole", "bactrim", "septra",
    "metronidazole", "nitrofurantoin", "clindamycin",
    "rifampin", "colistin", "polymyxin",
]

# Single alternation pattern "kw1|kw2|...". Each keyword is regex-escaped so
# characters such as "-" are matched literally.
antibiotic_pattern = "|".join(map(re.escape, antibiotic_keywords))

def drug_flag_regex(df, pattern):
    """Return a boolean Series marking rows whose ``drug`` value matches ``pattern``.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``drug`` column of strings.
    pattern : str
        Regular expression searched for anywhere in the drug name,
        ignoring case.

    Returns
    -------
    Series of bool
        True where the pattern is found; rows with a missing drug name
        are reported as False.
    """
    drug_names = df["drug"]
    matches = drug_names.str.contains(pattern, case=False, na=False, regex=True)
    return matches

# Keep only the prescription rows whose drug name matches an antibiotic keyword.
is_antibiotic = drug_flag_regex(prescriptions, antibiotic_pattern)
abx_rows = prescriptions.loc[is_antibiotic]

# Attach the discharge time of the admission each prescription belongs to.
# The join key includes hadm_id, so a prescription is only ever compared with
# the discharge time of its own admission. Prescriptions whose admission is not
# in df keep a missing dischtime and therefore end up flagged False below.
admission_keys = ["subject_id", "hadm_id"]
abx_with_discharge = pd.merge(
    abx_rows,
    df[admission_keys + ["dischtime"]],
    how="left",
    on=admission_keys,
)

# Coerce both timestamp columns to datetimes; unparseable values become NaT.
for time_col in ("starttime", "dischtime"):
    abx_with_discharge[time_col] = pd.to_datetime(
        abx_with_discharge[time_col], errors="coerce"
    )

# "Recent" means: dischtime - 30 days <= starttime <= dischtime (both inclusive).
window = pd.Timedelta(days=30)
abx_start = abx_with_discharge["starttime"]
abx_discharge = abx_with_discharge["dischtime"]
started_by_discharge = abx_start <= abx_discharge
started_within_window = abx_start >= abx_discharge - window
abx_with_discharge["recent_antibiotic_use"] = started_by_discharge & started_within_window

# Collapse to one row per admission: True if any of its antibiotic rows is recent.
flag_by_admission = abx_with_discharge.groupby(admission_keys)["recent_antibiotic_use"]
recent_flag_df = flag_by_admission.max().reset_index()

# Merge the per-admission flag back onto the dataset.
# This cell saves to the same file it loads from, so on a re-run df already
# contains "recent_antibiotic_use". Merging without removing it first would
# produce "_x"/"_y" suffixed columns and the lookup below would raise KeyError.
# Dropping any existing copy makes the cell safe to run repeatedly.
df = (
    df
    .drop(columns=["recent_antibiotic_use"], errors="ignore")
    .merge(recent_flag_df, on=["subject_id", "hadm_id"], how="left")
)

# Admissions with no matching antibiotic row come back as missing after the
# left merge; treat them as "no recent antibiotic use".
df["recent_antibiotic_use"] = (
    df["recent_antibiotic_use"]
    .astype("boolean")   # pandas nullable boolean, so missing values survive the cast
    .fillna(False)       # missing → False
    .astype(bool)        # convert to plain bool
)

# Save: overwrites the dataset file that was read at the top of this cell.
df.to_csv(dataset_path, index=False)

# Diagnostic output: confirm the left merge neither added nor removed rows,
# then report the dataset shape and how many admissions carry the flag.
# (The dataset shape is printed once; it was previously printed twice.)
final_row_count = len(df)
print(f"\nTotal rows: {final_row_count}")
print(f"Expected total: {initial_row_count}")
if final_row_count == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {final_row_count} (difference: {final_row_count - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'recent_antibiotic_use' added. True count: {df['recent_antibiotic_use'].sum()}")



Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 73)
Feature 'recent_antibiotic_use' added. True count: 110893
Dataset shape: (158020, 73)


In [3]:
# Show the column index of df as this cell's output.
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [5]:
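# Optional cleanup, left disabled: uncomment to remove the feature column from
# df and re-save the dataset.
# cols_to_drop = [
#  "recent_antibiotic_use"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)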