In [26]:
"""
Feature bmi
Extract BMI from OMR data (direct BMI values, or compute from weight/height)
"""
import pandas as pd
import os
from config_paths import *
from utils import drop_columns

In [28]:
# Read dataset
df = pd.read_csv(dataset_path)
initial_row_count = len(df)
df["admittime"] = pd.to_datetime(df["admittime"])
df["dischtime"] = pd.to_datetime(df["dischtime"])

# Load OMR data
omr = pd.read_csv(
    os.path.join(hosp_path, "omr.csv"),
    usecols=["subject_id", "chartdate", "result_name", "result_value"]
)
omr["chartdate"] = pd.to_datetime(omr["chartdate"])

# 1. Filter OMR to BMI values
bmi_omr = omr[omr["result_name"].isin(["BMI", "BMI (kg/m2)"])].copy()
bmi_omr["result_value"] = pd.to_numeric(bmi_omr["result_value"], errors="coerce")

# Join OMR BMI to admissions
bmi_join = df.merge(bmi_omr, on="subject_id", how="left")

# Keep only chartdates inside the admission stay
bmi_join = bmi_join[
    (bmi_join["chartdate"] >= bmi_join["admittime"]) &
    (bmi_join["chartdate"] <= bmi_join["dischtime"])
]

# Average BMI in admission (if multiple)
BMI_in_admission = (
    bmi_join.groupby(["subject_id", "hadm_id"])["result_value"]
    .mean()
    .reset_index()
    .rename(columns={"result_value": "BMI_in_admission"})
)

# 2. Extract Weight + Height rows and compute BMI
vitals = omr[omr["result_name"].isin(["Weight (Lbs)", "Height (Inches)"])].copy()
vitals["result_value"] = pd.to_numeric(vitals["result_value"], errors="coerce")

# Join vitals to admission
vitals_join = df.merge(vitals, on="subject_id", how="left")

# Keep only vitals inside admission period
vitals_join = vitals_join[
    (vitals_join["chartdate"] >= vitals_join["admittime"]) &
    (vitals_join["chartdate"] <= vitals_join["dischtime"])
]

# Pivot weight & height
vitals_pivot = vitals_join.pivot_table(
    index=["subject_id", "hadm_id"],
    columns="result_name",
    values="result_value",
    aggfunc="mean"
).reset_index()

# Compute BMI from weight & height
if "Weight (Lbs)" in vitals_pivot.columns and "Height (Inches)" in vitals_pivot.columns:
    vitals_pivot["Weight_kg"] = vitals_pivot["Weight (Lbs)"] * 0.453592
    vitals_pivot["Height_m"] = vitals_pivot["Height (Inches)"] * 0.0254
    vitals_pivot["BMI_computed"] = (
        vitals_pivot["Weight_kg"] /
        (vitals_pivot["Height_m"] ** 2)
    )
    BMI_computed = vitals_pivot[["subject_id", "hadm_id", "BMI_computed"]]
else:
    BMI_computed = pd.DataFrame(columns=["subject_id", "hadm_id", "BMI_computed"])

# 3. BMI from last year (within 365 days before admission)
bmi_before = df.merge(bmi_omr, on="subject_id", how="left")
bmi_before = bmi_before[
    (bmi_before["admittime"] - bmi_before["chartdate"]).dt.days <= 365
]
BMI_last_year = (
    bmi_before.sort_values(["subject_id", "hadm_id", "chartdate"])
              .groupby(["subject_id", "hadm_id"])
              .tail(1)
              .rename(columns={"result_value": "BMI_last_year"})
)
if not BMI_last_year.empty:
    BMI_last_year = BMI_last_year[["subject_id", "hadm_id", "BMI_last_year"]]
else:
    BMI_last_year = pd.DataFrame(columns=["subject_id", "hadm_id", "BMI_last_year"])

# Merge all BMI sources
df = df.merge(BMI_in_admission, on=["subject_id", "hadm_id"], how="left")
df = df.merge(BMI_computed, on=["subject_id", "hadm_id"], how="left")
df = df.merge(BMI_last_year, on=["subject_id", "hadm_id"], how="left")

# Combine: prefer in-admission, then computed, then last year (like notebook - uses "BMI" uppercase)
df["BMI"] = (
    df["BMI_in_admission"]
    .fillna(df["BMI_computed"])
    .fillna(df["BMI_last_year"])
)

# Note: Notebook keeps intermediate columns (BMI_in_admission, BMI_computed, BMI_last_year)
# We keep them to match notebook exactly

# Save
df.to_csv(dataset_path, index=False)

# Diagnostic output
print(f"\nFeature 'BMI' added.")
missing = df['BMI'].isna().sum()
non_missing = df['BMI'].notna().sum()
print(f"Missing: {missing}, Non-missing: {non_missing}, Total: {len(df)}")
print(f"Expected total: {initial_row_count}")
if len(df) == initial_row_count:
    print(f"✓ Row count matches expected total ({initial_row_count})")
else:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {len(df)} (difference: {len(df) - initial_row_count})")
print(f"Dataset shape: {df.shape}")


Feature 'BMI' added.
Missing: 46173, Non-missing: 111847, Total: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 40)


In [29]:
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI'],

In [30]:
# cols_to_drop = [
#     'BMI_in_admission',
#     'BMI_computed',
#     'BMI_last_year',
#     'BMI'
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)