# Expanded preprocessing + smoke test

This notebook prepares a slightly larger smoke test for the 30-day readmission preprocessing pipeline.
Change the NROWS and N_HADM variables in Cell 2 to control how much data is loaded for the smoke test.

In [5]:
# Config: adjust these for local runs
DATA_DIR = '/Users/yuchenzhou/documents/duke/compsci526/final_proj/mimic-iv-3.1'
# How many chartevents / labevents rows to load (set low for quick smoke-test)
NROWS_CHARTEVENTS = 500000
NROWS_LABEVENTS = 200000
# How many hadm_ids to attempt to build (aim for ~100-300 for larger smoke-test)
N_HADM = 200
# random seed for reproducibility
RANDOM_STATE = 42
# First hours to keep from admission
FIRST_HOURS = 72
# Hourly resampling frequency
RESAMPLE_FREQ = '1H'

In [6]:
# Imports
import os, math, random, json
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
pd.options.mode.chained_assignment = None

In [7]:
# Helper: safe read with parse dates if present
def _read(path, nrows=None):
    parse_dates = []
    for col in ['admittime','dischtime','deathtime','charttime','charttime','charttime','chartdate','charttime_ts','charttime_local']:
        # presence-checked possible date columns; pandas will ignore unknown
        if col in pd.read_csv(path, nrows=0).columns:
            parse_dates.append(col)
    return pd.read_csv(path, parse_dates=parse_dates or None, nrows=nrows)

def compute_readmit_30d(adm_df):
    # expects admissions with subject_id, hadm_id, admit_dt and discharge_dt already parsed
    df = adm_df.sort_values(['subject_id','admittime'])[[ 'subject_id','hadm_id','admittime','dischtime' ]].copy()
    df['next_admit'] = df.groupby('subject_id')['admittime'].shift(-1)
    df['days_to_next_admit'] = (df['next_admit'] - df['dischtime']).dt.total_seconds() / 86400.0
    df['readmit_30d'] = (df['days_to_next_admit'] >= 0) & (df['days_to_next_admit'] <= 30)
    return df[['subject_id','hadm_id','readmit_30d']]

In [8]:
# Top-level load of admissions & patients (small) to compute labels and select hadm_ids
ads_path = os.path.join(DATA_DIR, 'hosp', 'admissions.csv')
pts_path = os.path.join(DATA_DIR, 'hosp', 'patients.csv')
print('Reading admissions (header check)...')
admissions = pd.read_csv(ads_path, parse_dates=['admittime','dischtime'], nrows=None)
patients = pd.read_csv(pts_path, parse_dates=['anchor_year']) if os.path.exists(pts_path) else pd.DataFrame()
labels_df = compute_readmit_30d(admissions)
print('Admissions rows:', len(admissions))
print('Labels computed, positive rate:', labels_df['readmit_30d'].mean())

Reading admissions (header check)...
Admissions rows: 546028
Labels computed, positive rate: 0.20025529826309274


In [9]:
# Expanded itemid lists (examples; adjust as needed)
vital_items_of_interest = [220045, 220210, 220277, 220045, 220046, 220047, 220048]
lab_items_of_interest = [50820, 51464, 50931, 50912, 50813, 50882, 50893, 51267]
print('vitals and labs lists lengths:', len(vital_items_of_interest), len(lab_items_of_interest))

vitals and labs lists lengths: 7 8


In [10]:
# Load slices of chartevents and labevents for the smoke test (may take memory)
ce_path = os.path.join(DATA_DIR, 'icu', 'chartevents.csv')
le_path = os.path.join(DATA_DIR, 'hosp', 'labevents.csv')
print('Reading chartevents nrows=', NROWS_CHARTEVENTS)
chartevents = _read(ce_path, nrows=NROWS_CHARTEVENTS)
print('Read chartevents rows:', len(chartevents))
print('Reading labevents nrows=', NROWS_LABEVENTS)
labevents = _read(le_path, nrows=NROWS_LABEVENTS)
print('Read labevents rows:', len(labevents))

Reading chartevents nrows= 500000
Read chartevents rows: 500000
Reading labevents nrows= 200000
Read labevents rows: 200000


In [11]:
# Narrow events to items of interest and create a unified time column
def prepare_events(df, time_col_candidates=('charttime','chartdate','charttime_ts')):
    # pick first available time col
    time_col = next((c for c in time_col_candidates if c in df.columns), None)
    if time_col is None:
        raise ValueError('No time column found in events')
    df = df.copy()
    df['event_time'] = pd.to_datetime(df[time_col])
    return df

In [12]:
chartevents = prepare_events(chartevents)
labevents = prepare_events(labevents)
# reduce columns to speed later ops
chartevents = chartevents[['subject_id','hadm_id','itemid','value','event_time']].dropna(subset=['hadm_id'])
labevents = labevents[['subject_id','hadm_id','itemid','value','event_time']].dropna(subset=['hadm_id'])
print('Prepared events shapes:', chartevents.shape, labevents.shape)

Prepared events shapes: (500000, 5) (100858, 5)


In [13]:
# Candidate hadm pool: admissions with labels; sample up to N_HADM balanced by label
pool = labels_df.merge(admissions[['hadm_id','subject_id','admittime','dischtime']], on=['subject_id','hadm_id'])
pos = pool[pool['readmit_30d']].sample(frac=1.0, random_state=RANDOM_STATE)
neg = pool[~pool['readmit_30d']].sample(frac=1.0, random_state=RANDOM_STATE)
n_each = max(1, N_HADM // 2)
sel = pd.concat([pos.head(n_each), neg.head(n_each)])
print('Selected hadm count:', len(sel))
hadm_ids = sel['hadm_id'].unique().tolist()

Selected hadm count: 200


In [14]:
# Build dataset: for each hadm, extract first FIRST_HOURS of events, resample hourly and pool
def build_patient_series(hadm_id):
    adm = admissions[admissions['hadm_id'] == hadm_id].iloc[0]
    sub_ce = chartevents[chartevents['hadm_id'] == hadm_id]
    sub_le = labevents[labevents['hadm_id'] == hadm_id]
    # combine and filter to FIRST_HOURS
    start = adm['admittime']
    end = adm['admittime'] + pd.Timedelta(hours=FIRST_HOURS)
    ev = pd.concat([sub_ce, sub_le], ignore_index=True)
    ev = ev[(ev['event_time'] >= start) & (ev['event_time'] <= end)]
    if ev.empty:
        return None
    # pivot by hour and itemid using mean aggregation
    ev = ev.assign(hour=lambda d: ((d['event_time'] - start).dt.total_seconds() // 3600).astype(int))
    pivot = ev.groupby(['hour','itemid'])['value'].agg('mean').unstack(fill_value=np.nan)
    # resample to ensure FIRST_HOURS rows
    pivot = pivot.reindex(range(0, FIRST_HOURS))
    return pivot

In [15]:
# Pooling & imputation: linear interp + ffill/bfill + column mean
def impute_and_pool(df_hourly):
    if df_hourly is None or df_hourly.shape[0] == 0:
        return None
    arr = df_hourly.copy()
    arr = arr.apply(pd.to_numeric, errors='coerce')
    arr = arr.interpolate(limit_direction='both', axis=0).ffill().bfill()
    arr = arr.fillna(arr.mean())
    pooled = []
    pooled.extend(arr.mean(axis=0).values.tolist())
    pooled.extend(arr.std(axis=0).values.tolist())
    pooled.extend(arr.min(axis=0).values.tolist())
    pooled.extend(arr.max(axis=0).values.tolist())
    return np.array(pooled)

# Build X, y
X = []
y = []
hadm_with_features = []
for hid in hadm_ids:
    s = build_patient_series(hid)
    vec = impute_and_pool(s)
    if vec is not None and not np.any(np.isnan(vec)):
        X.append(vec)
        y.append(labels_df[labels_df['hadm_id']==hid]['readmit_30d'].iloc[0])
        hadm_with_features.append(hid)
print('Built dataset rows:', len(X))

Built dataset rows: 0


In [16]:
# Quick baseline evaluation (train/test split)
if len(X) >= 10:
    X_arr = np.vstack(X)
    y_arr = np.array(y).astype(int)
    Xtr, Xte, ytr, yte = train_test_split(X_arr, y_arr, test_size=0.3, random_state=RANDOM_STATE, stratify=y_arr)
    clf = LogisticRegression(class_weight='balanced', max_iter=500)
    clf.fit(Xtr, ytr)
    probs = clf.predict_proba(Xte)[:,1]
    print('LogReg AUROC:', roc_auc_score(yte, probs))
    print('LogReg AUPRC:', average_precision_score(yte, probs))
else:
    print('Not enough rows to run baseline; need >=10')

Not enough rows to run baseline; need >=10


## Notes
- This notebook is a smoke test and may be memory heavy. Reduce NROWS_* and N_HADM for quick runs.
- The pipeline uses simple pooling (mean/std/min/max) — good for classical baselines. For deep models keep full time-series.
- If you want me to add StratifiedKFold CV and artifact saving, I can next.