In [1]:
"""
Feature: urinalysis_nitrite
Extract urinalysis nitrite (binary: positive/negative) from labevents
"""
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [2]:
# Load the cohort table and remember its size for the row-count check at the end
df = pd.read_csv(dataset_path)
initial_row_count = df.shape[0]
for time_col in ("admittime", "dischtime"):
    df[time_col] = pd.to_datetime(df[time_col])

# Read only the labevents columns this feature needs
labevents_file = os.path.join(hosp_path, "labevents.csv")
labevents = pd.read_csv(
    labevents_file,
    usecols=["subject_id", "hadm_id", "itemid", "charttime", "value"],
)
labevents["charttime"] = pd.to_datetime(labevents["charttime"])

# Lab item IDs treated as urinalysis nitrite
urinalysis_nitrite_ids = [51487, 51987]

# Keep the nitrite rows and attach each admission's time window to them
admission_windows = df[["subject_id", "hadm_id", "admittime", "dischtime"]]
is_nitrite = labevents["itemid"].isin(urinalysis_nitrite_ids)
urine_nitrite = labevents[is_nitrite].copy().merge(
    admission_windows,
    on=["subject_id", "hadm_id"],
    how="left",
)

# Keep results charted between admission and discharge (both ends inclusive).
# Lab rows that matched no admission carry missing window bounds after the
# left merge, so both comparisons are False and those rows are dropped here.
after_admit = urine_nitrite["charttime"] >= urine_nitrite["admittime"]
before_discharge = urine_nitrite["charttime"] <= urine_nitrite["dischtime"]
urine_nitrite = urine_nitrite[after_admit & before_discharge]

# Normalize to binary
def normalize_nitrite(val):
    """Translate a raw nitrite lab value into a binary flag.

    The value is converted to a string, stripped of surrounding whitespace and
    upper-cased before being looked up.

    Args:
        val: Raw entry from the ``value`` column (any scalar).

    Returns:
        1 for a positive token ("POS", "POSITIVE", "1", "Y", "YES"),
        0 for a negative token ("NEG", "NEGATIVE", "0", "N", "NO"),
        None when the value is missing or not one of the known tokens.
    """
    if pd.isna(val):
        return None

    flag_by_token = {
        "POS": 1, "POSITIVE": 1, "1": 1, "Y": 1, "YES": 1,
        "NEG": 0, "NEGATIVE": 0, "0": 0, "N": 0, "NO": 0,
    }
    token = str(val).strip().upper()
    # dict.get yields None for anything outside the known vocabulary
    return flag_by_token.get(token)

# Map every raw lab value to 1 / 0 / NaN.
# `assign` returns a new frame instead of writing into the filtered slice, and
# the float cast guarantees a numeric column even when nothing could be parsed.
urine_nitrite = urine_nitrite.assign(
    nitrite_binary=urine_nitrite["value"].apply(normalize_nitrite).astype(float)
)

# Compute nitrite features per admission (like notebook)
# - Rows are sorted by charttime first (stable sort) so that "last" is the
#   chronologically latest interpretable result, not whichever row happened
#   to come last in the labevents file.
# - "max" ignores NaN: 1 if any result was positive, 0 if only negatives were
#   seen, and NaN when no result of the admission could be interpreted, so
#   "tested but unreadable" is no longer reported as a negative.
urinalysis_nitrite_agg = (
    urine_nitrite
    .sort_values("charttime", kind="stable")
    .groupby(["subject_id", "hadm_id"])
    .agg(
        urine_nitrite_any_positive=("nitrite_binary", "max"),
        urine_nitrite_last=("nitrite_binary", "last"),
    )
    .reset_index()
)

# Use any_positive as the main feature (urinalysis_nitrite)
urinalysis_nitrite_agg["urinalysis_nitrite"] = urinalysis_nitrite_agg["urine_nitrite_any_positive"]

# Make the cell safe to re-run: the dataset file is overwritten in place, so a
# second run reads a frame that already has the feature. Merging onto it would
# produce urinalysis_nitrite_x / urinalysis_nitrite_y and break the diagnostics
# below with a KeyError. Drop the stale column before merging.
if "urinalysis_nitrite" in df.columns:
    df = df.drop(columns=["urinalysis_nitrite"])

# Merge (left join keeps every admission; admissions without a usable nitrite
# result get NaN)
df = df.merge(
    urinalysis_nitrite_agg[["subject_id", "hadm_id", "urinalysis_nitrite"]],
    on=["subject_id", "hadm_id"],
    how="left"
)

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nTotal rows: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")
print(f"Feature 'urinalysis_nitrite' added. Positive count: {(df['urinalysis_nitrite'] == 1).sum()}")
print(f"Dataset shape: {df.shape}")



Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 79)
Feature 'urinalysis_nitrite' added. Positive count: 1446
Dataset shape: (158020, 79)


In [3]:
# Display the column index of df to inspect the dataset's columns
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
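# Optional rollback (left disabled): uncomment to remove the
# "urinalysis_nitrite" column from df (presumably what drop_columns does —
# verify in utils) and overwrite the saved dataset.
# cols_to_drop = [
#  "urinalysis_nitrite"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)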
# df.to_csv(dataset_path, index=False)