In [1]:
"""
Feature: blood_pressure
Extract blood pressure (systolic/diastolic) from chartevents and OMR
"""
import pandas as pd
import os
import numpy as np
from config_paths import *
from utils import drop_columns

In [2]:
# Load the working dataset and remember its size for the final row-count check
df = pd.read_csv(dataset_path)
initial_row_count = df.shape[0]
for time_col in ("admittime", "dischtime"):
    df[time_col] = pd.to_datetime(df[time_col])

# OMR table: read only the columns needed to extract blood pressure
omr_file = os.path.join(hosp_path, "omr.csv")
omr_columns = ["subject_id", "chartdate", "result_name", "result_value"]
omr = pd.read_csv(omr_file, usecols=omr_columns)
omr["chartdate"] = pd.to_datetime(omr["chartdate"])

# Keep the "Blood Pressure" results as an independent copy so columns can be added later
is_bp_row = omr["result_name"].eq("Blood Pressure")
bp_omr = omr.loc[is_bp_row].copy()
# Parse "120/80" format
def parse_bp(value):
    """Split a "systolic/diastolic" string into two floats.

    Parameters
    ----------
    value : object
        Raw result value, expected to look like "120/80".

    Returns
    -------
    tuple of (float, float)
        (systolic, diastolic). (nan, nan) is returned when `value` is not a
        string, contains no "/", does not split into exactly two parts, or
        when either part is not a valid number.
    """
    if not (isinstance(value, str) and "/" in value):
        return np.nan, np.nan
    try:
        systolic, diastolic = value.split("/")
        return float(systolic), float(diastolic)
    except ValueError:
        # Raised both by unpacking the wrong number of parts and by float()
        # on a non-numeric part. Anything else (e.g. KeyboardInterrupt) is
        # deliberately allowed to propagate instead of being swallowed.
        return np.nan, np.nan

# Parse every OMR result into numeric "sys" / "dia" columns.
# An explicitly shaped frame is used instead of zip(*...) unpacking because
# unpacking raises ValueError when bp_omr has no rows.
parsed_bp = pd.DataFrame(
    [parse_bp(raw_value) for raw_value in bp_omr["result_value"]],
    columns=["sys", "dia"],
    index=bp_omr.index,
    dtype="float64",
)
bp_omr["sys"] = parsed_bp["sys"]
bp_omr["dia"] = parsed_bp["dia"]

# ============================================
# ICU chartevents: stream the file in chunks, keeping only arterial BP rows
# ============================================
arterial_ids = [220050, 220051]  # systolic, diastolic

chartevents_path = os.path.join(icu_path, "chartevents.csv")

# Compact dtypes applied to each chunk; hadm_id uses a nullable integer type
compact_dtypes = {
    "subject_id": "int32",
    "hadm_id": "Int32",
    "itemid": "int32",
    "valuenum": "float32",
}

chunk_reader = pd.read_csv(
    chartevents_path,
    usecols=["subject_id", "hadm_id", "itemid", "charttime", "valuenum"],
    chunksize=500_000,
    low_memory=False,
)

arterial_parts = []
for raw_chunk in chunk_reader:
    typed_chunk = raw_chunk.astype(compact_dtypes)
    arterial_rows = typed_chunk.loc[typed_chunk["itemid"].isin(arterial_ids)]
    # Skip chunks that contain no arterial BP measurements
    if len(arterial_rows) > 0:
        arterial_parts.append(arterial_rows)

# Only the filtered rows are stitched together (far fewer than the full file)
chartevents = pd.concat(arterial_parts, ignore_index=True)

# Parse timestamps after filtering so only the retained rows are converted
chartevents["charttime"] = pd.to_datetime(chartevents["charttime"])

# Keep only rows that have both a numeric value and a hadm_id (hadm_id is the
# merge key later on). .copy() makes the result an independent frame so that
# adding "bp_type" below does not raise SettingWithCopyWarning.
chartevents = chartevents[
    chartevents["valuenum"].notna() & chartevents["hadm_id"].notna()
].copy()

# Map itemid to measurement type
chartevents["bp_type"] = chartevents["itemid"].map({
    220050: "sys",
    220051: "dia"
})

# Pivot to one row per (subject, admission, timestamp) with sys/dia columns.
# Values sharing the same timestamp and type are averaged.
# reindex guarantees both columns exist even if one measurement type has no
# rows, so the named aggregation below cannot fail on a missing column.
icu_bp = (
    chartevents.pivot_table(
        index=["subject_id", "hadm_id", "charttime"],
        columns="bp_type",
        values="valuenum",
        aggfunc="mean"
    )
    .reindex(columns=["sys", "dia"])
    .reset_index()
)

# Mean BP per admission (mean over all timestamps of that admission)
icu_bp_adm = (
    icu_bp.groupby(["subject_id", "hadm_id"])
    .agg(
        ICU_BP_sys=("sys", "mean"),
        ICU_BP_dia=("dia", "mean")
    )
    .reset_index()
)

# ============================================
# OMR BP fallback: closest before admission (≤ 365 days)
# ============================================
# OMR doesn't have hadm_id, so the join is on subject_id only. This is a
# one-to-many join (every admission x every OMR BP row of that subject), so
# only the key, time and BP columns are carried through it: the result is the
# same as merging the full dataset, without duplicating every dataset column
# for every OMR row.
omr_merge = df[["subject_id", "hadm_id", "admittime"]].merge(
    bp_omr[["subject_id", "chartdate", "sys", "dia"]],
    on="subject_id",
    how="left",
)

# Keep readings charted strictly before admittime and at most 365 days earlier.
# Rows with a missing chartdate (admissions without any OMR BP) or a missing
# admittime are dropped here.
days_before_admit = (omr_merge["admittime"] - omr_merge["chartdate"]).dt.days
omr_merge = omr_merge[
    omr_merge["chartdate"].notna()
    & omr_merge["admittime"].notna()
    & (omr_merge["chartdate"] < omr_merge["admittime"])
    & (days_before_admit <= 365)
]

# Pick closest (latest before admittime) per admission
# Note: We pick closest first, then filter invalid BP values, so an admission
# whose closest reading is unparseable gets no OMR value (no fall-through to
# an older reading).
# NOTE(review): if several readings share the latest chartdate, the one kept
# depends on row order after sorting — confirm whether a tie-breaker is needed.
closest_omr = (
    omr_merge.sort_values(["subject_id", "hadm_id", "chartdate"])
    .groupby(["subject_id", "hadm_id"])
    .tail(1)
    .rename(columns={"sys": "OMR_BP_sys", "dia": "OMR_BP_dia"})
)

# Filter out rows where sys or dia are NaN (invalid BP values) after closest selection
closest_omr = closest_omr[
    closest_omr["OMR_BP_sys"].notna() & closest_omr["OMR_BP_dia"].notna()
]

closest_omr = closest_omr[["subject_id", "hadm_id", "OMR_BP_sys", "OMR_BP_dia"]]

# ============================================
# Merge and combine: ICU first, OMR as fallback
# ============================================
# This cell overwrites dataset_path, so on a re-run the loaded dataset already
# contains the output columns. Drop them first; otherwise the new values would
# be added under the same names and the frame (and the saved CSV) would end up
# with duplicate column labels.
bp_feature_cols = ["BP_systolic", "BP_diastolic"]
df = df.drop(columns=bp_feature_cols, errors="ignore")

df = df.merge(icu_bp_adm, on=["subject_id", "hadm_id"], how="left")
df = df.merge(closest_omr, on=["subject_id", "hadm_id"], how="left")

# Prefer the ICU per-admission mean; fall back to the OMR reading when the ICU
# value is missing. Systolic and diastolic are filled independently.
df["BP_systolic"] = df["ICU_BP_sys"].fillna(df["OMR_BP_sys"])
df["BP_diastolic"] = df["ICU_BP_dia"].fillna(df["OMR_BP_dia"])

# Drop intermediate columns
df = df.drop(columns=['ICU_BP_sys', 'ICU_BP_dia', 'OMR_BP_sys', 'OMR_BP_dia'])

# Save (overwrites the input dataset in place)
df.to_csv(dataset_path, index=False)

# Diagnostics: confirm the merges preserved the row count and report missingness
final_row_count = len(df)
print(f"\nTotal rows: {final_row_count}")
print(f"Expected total: {initial_row_count}")
if final_row_count != initial_row_count:
    print(f"⚠ Row count mismatch! Expected {initial_row_count}, got {final_row_count} (difference: {final_row_count - initial_row_count})")
else:
    print(f"✓ Row count matches expected total ({initial_row_count})")
print(f"Dataset shape: {df.shape}")
print("Feature 'BP_systolic' and 'BP_diastolic' added.")
for label, column in (("Systolic", "BP_systolic"), ("Diastolic", "BP_diastolic")):
    print(f"{label} missing: {df[column].isna().sum()}")
print(f"Dataset shape: {df.shape}")



Total rows: 158020
Expected total: 158020
✓ Row count matches expected total (158020)
Dataset shape: (158020, 98)
Feature 'BP_systolic' and 'BP_diastolic' added.
Systolic missing: 55651
Diastolic missing: 55647
Dataset shape: (158020, 98)


In [3]:
# Display the column labels of the updated dataset
df.columns

Index(['subject_id', 'hadm_id', 'cauti_type', 'cauti_icd_codes',
       'other_uti_icd_codes', 'remaining_icd_codes', 'gender', 'anchor_age',
       'admittime', 'dischtime', 'admission_type', 'admission_location',
       'discharge_location', 'race', 'catheter_procedures',
       'catheter_procedures_ir', 'catheter_used_in_procedures_ir',
       'catheter_insertion_date_ir', 'catheter_removal_date_ir',
       'catheter_outputevents', 'catheter_used_in_output_events',
       'datetimeevents', 'catheter_used_in_datetime_events',
       'insertion_date_from_datetimeevents',
       'removal_date_from_datetimeevents', 'catheter_procedure_events',
       'catheter_used_in_procedure_events', 'catheter_size_from_chartevents',
       'index', 'catheter_clinical_notes', 'catheter_used_in_clinical_notes',
       'catheter_present', 'final_cauti_flag', 'final_insertion_date',
       'final_removal_date', 'catheter_duration_days', 'BMI_in_admission',
       'BMI_computed', 'BMI_last_year', 'BMI', 

In [4]:
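# Optional cleanup (disabled): uncomment to remove the two BP feature columns
# from df and re-save the dataset to dataset_path.
# cols_to_drop = [
#  "BP_systolic",
#  "BP_diastolic"
# ]
# df = drop_columns(df, cols_to_drop)
# df.to_csv(dataset_path, index=False)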