In [143]:
# main.py
import os
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Add src folder for custom models
# Removed old path hack; using package imports))
from llm_prior_project.models.target_model import SklearnTargetModel

In [176]:
# ==================================================
# 1. Data Loading
# ==================================================
def load_all_data(base_dir="mimic_data"):
    all_data = {}
    for folder in os.listdir(base_dir):
        folder_path = os.path.join(base_dir, folder)
        if os.path.isdir(folder_path):
            all_data[folder] = {}
            for file in os.listdir(folder_path):
                if file.endswith(".csv"):
                    file_path = os.path.join(folder_path, file)
                    df_name = os.path.splitext(file)[0]
                    try:
                        df = pd.read_csv(file_path, sep=None, engine="python")
                        all_data[folder][df_name] = df
                    except Exception as e:
                        print(f"❌ Could not read {file}: {e}")
    return all_data


def load_df(df, drop_index=True):
    """Drop default index columns if present."""
    if drop_index and "Unnamed: 0" in df.columns:
        return df.drop(columns=["Unnamed: 0"])
    return df


def build_master_dataset(all_data):
    """Merge demographics, ICU, lab, and physio data into one master table."""
    demo = load_df(all_data["DEMOGRAPHIC_DATA"]["DEMO_DATA"])
    icu_los = load_df(all_data["ICU_DATA"]["ICU_LOS"])
    total_los = load_df(all_data["ICU_DATA"]["TOTAL_LOS"])
    mech_vent = load_df(all_data["ICU_DATA"]["MECH_VENT_TIME"])
    severity = load_df(all_data["ICU_DATA"]["SEVERITY_SCORES"])
    services = all_data["ICU_DATA"]["SERVICES"]
    surgery = load_df(all_data["ICU_DATA"]["SURGERY_FLAGS"])
    icd9 = load_df(all_data["ICU_DATA"]["ICD9_DIAG"])
    prev_adm = load_df(all_data["ICU_DATA"]["PREVIOUS_ADMISSION_COUNT"])

    # Merge labs
    labs = []
    for df in all_data["LAB_DATA"].values():
        df = load_df(df)
        avg_cols = [c for c in df.columns if c.startswith("avg_")]
        labs.append(df[["hadm_id"] + avg_cols].copy())
    lab_df = pd.concat(labs, axis=1).loc[:, ~pd.concat(labs, axis=1).columns.duplicated()]

    # Merge physio
    physio = []
    for df in all_data["PHYSIO_DATA"].values():
        df = load_df(df)
        avg_cols = [c for c in df.columns if c.startswith("avg_")]
        physio.append(df[["hadm_id"] + avg_cols].copy())
    physio_df = pd.concat(physio, axis=1).loc[:, ~pd.concat(physio, axis=1).columns.duplicated()]

    master = (
        demo.merge(icu_los, on="hadm_id", how="left")
        .merge(total_los, on="hadm_id", how="left")
        .merge(mech_vent, on="hadm_id", how="left")
        .merge(severity, on="hadm_id", how="left")
        .merge(services, on="hadm_id", how="left")
        .merge(surgery, on="hadm_id", how="left")
        .merge(icd9, on="hadm_id", how="left")
        .merge(prev_adm, on="hadm_id", how="left")
        .merge(lab_df, on="hadm_id", how="left")
        .merge(physio_df, on="hadm_id", how="left")
    )
    return master


# Load data
all_data = load_all_data()
master = build_master_dataset(all_data)
master_clean = master.dropna(subset=["los"])

print("✅ Master dataset ready:", master_clean.shape)

✅ Master dataset ready: (63504, 35)


In [177]:
# %%
# ==================================================
# Preprocessing Setup
# ==================================================
def build_preprocessor(X, target="los", exclude_feats=None):
    """
    Build preprocessing pipeline for numeric + categorical features,
    while excluding leakage features like LOS itself.
    """
    if exclude_feats is None:
        exclude_feats = []

    # make sure target is always excluded
    exclude_feats = set(exclude_feats + [target])

    categorical_features = [
        c for c in X.select_dtypes(include=["object"]).columns.tolist()
        if c not in exclude_feats
    ]
    numeric_features = [
        c for c in X.select_dtypes(exclude=["object"]).columns.tolist()
        if c not in exclude_feats
    ]

    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    return preprocessor, numeric_features, categorical_features


preprocessor, num_feats, cat_feats = build_preprocessor(master_clean, target="los")


In [184]:
# %%
# ==================================================
# Feature exclusions (junk / leakage)
# ==================================================
exclude_feats = ["hadm_id", "subject_id"]

# Preview dataset structure
print("Columns:", len(master_clean.columns))
print(master_clean.columns.tolist()[:40])  # peek first 40 columns

# Look at missingness
missing_summary = master_clean.isna().mean().sort_values(ascending=False)
print("Top missingness:")
print(missing_summary.head(20))

# Preview a few rows
master_clean.head()


Columns: 35
['hadm_id', 'age', 'gender', 'marital_status', 'religion', 'ethnicity', 'los', 'total_los_days', 'total_mech_vent_time', 'oasis_avg', 'sofa_avg', 'saps_avg', 'service', 'SURGERY_FLAG', 'icd9_group', 'admissions_count', 'subject_id', 'avg_creatinine', 'avg_white_blood_cells', 'avg_blood_glucose', 'avg_bicarbonate', 'avg_platelet_count', 'avg_hematrocrit', 'avg_albumin', 'avg_potasssium', 'avg_sodium', 'avg_blood_urea_nitrogen', 'avg_sys_press', 'avg_temp', 'avg_resp_rate', 'avg_hr', 'avg_cvp', 'avg_spo2', 'avg_art_ph', 'avg_dias_press']
Top missingness:
avg_temp                   0.895156
avg_art_ph                 0.683122
avg_cvp                    0.676776
total_mech_vent_time       0.541053
avg_albumin                0.466427
avg_spo2                   0.141802
avg_dias_press             0.141172
avg_sys_press              0.141172
avg_blood_glucose          0.128086
SURGERY_FLAG               0.126968
avg_bicarbonate            0.122229
avg_potasssium             0.1221

Unnamed: 0,hadm_id,age,gender,marital_status,religion,ethnicity,los,total_los_days,total_mech_vent_time,oasis_avg,...,avg_sodium,avg_blood_urea_nitrogen,avg_sys_press,avg_temp,avg_resp_rate,avg_hr,avg_cvp,avg_spo2,avg_art_ph,avg_dias_press
0,165315,64.971282,F,MARRIED,NONE,WHITE,1.1438,1.144444,5.0,41.0,...,135.263158,12.230769,141.538462,,19.9125,70.609375,,97.779221,,53.486111
1,152223,71.17891,M,MARRIED,CHRISTIAN,WHITE,1.2641,5.496528,4.666667,24.0,...,137.428571,9.0,101.340909,,22.101695,94.435103,,96.9,,52.5
2,124321,75.306343,M,MARRIED,CHRISTIAN,WHITE,1.1862,6.768056,,24.0,...,140.571429,65.2,122.18,,17.130435,150.5,5.266667,97.781022,7.413333,65.424
3,161859,39.042949,M,SINGLE,CHRISTIAN,WHITE,0.5124,2.856944,,15.0,...,139.2,14.75,122.9375,,25.890728,95.347826,,98.889273,,43.15
4,129635,58.989281,M,MARRIED,NONE,WHITE,3.5466,3.534028,,24.0,...,138.466667,11.653846,115.065934,,17.595238,137.259804,10.344828,95.238095,7.527692,59.52


In [185]:
# %%
# ==================================================
# EDA: subgroup counts and LOS distribution
# ==================================================
# Ethnicity distribution
print(master_clean["ethnicity"].value_counts(dropna=False).head(20))

# LOS summary by ethnicity
los_summary = (
    master_clean.groupby("ethnicity")["los"]
    .describe(percentiles=[0.25,0.5,0.75])
    .sort_values("count", ascending=False)
)
display(los_summary.head(10))

# Gender × Ethnicity crosstab
pd.crosstab(master_clean["gender"], master_clean["ethnicity"])


ethnicity
WHITE       44713
NONE         6441
BLACK        6120
HISPANIC     2265
ASIAN        2075
OTHER        1890
Name: count, dtype: int64


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
WHITE,44713.0,4.727738,9.002312,0.0001,1.1202,2.0897,4.3813,173.0725
NONE,6441.0,5.525501,10.2068,0.0014,1.204,2.3164,5.2386,169.4202
BLACK,6120.0,4.999474,10.91345,0.0025,1.070775,2.03735,4.0848,171.6227
HISPANIC,2265.0,4.743731,9.53755,0.0019,1.0478,1.9313,4.14,133.2542
ASIAN,2075.0,4.739803,11.397314,0.001,0.4188,1.5739,3.68725,126.8261
OTHER,1890.0,5.636142,11.584511,0.0209,1.04785,1.94985,4.878875,135.5667


ethnicity,ASIAN,BLACK,HISPANIC,NONE,OTHER,WHITE
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,898,3374,894,2636,770,19282
M,1177,2746,1371,3805,1120,25431


In [186]:
# %%
# ==================================================
# Preprocessor with exclusions
# ==================================================
preprocessor, num_feats, cat_feats = build_preprocessor(
    master_clean, target="los", exclude_feats=exclude_feats
)

print("Numeric features:", len(num_feats))
print("Categorical features:", len(cat_feats))


Numeric features: 25
Categorical features: 7


In [187]:
# %%
# ==================================================
# Subgroup Opportunity Sweep
# ==================================================
from collections import defaultdict

def subgroup_opportunity(X, y, groups, alpha=1.0, min_n=200):
    """
    Compare coefficients between subgroup-specific Ridge vs complement Ridge.
    Compute opportunity index for each feature.

    Parameters
    ----------
    X : ndarray
        Encoded feature matrix
    y : Series or ndarray
        Outcome vector
    groups : Series
        Subgroup labels
    alpha : float
        Ridge regularization strength
    min_n : int
        Minimum subgroup size

    Returns
    -------
    per_feature_df : pd.DataFrame
        Feature-level opportunity scores for this grouping
    summary : pd.DataFrame
        Group-level summary (max opp per subgroup)
    """
    feature_names = X.columns if isinstance(X, pd.DataFrame) else np.arange(X.shape[1])
    results = []
    per_feature = {}

    for gval, idx in groups.groupby(groups):
        if len(idx) < min_n or gval is None or gval != gval:  # skip NaN/low n
            continue

        mask = groups == gval
        X_sub, y_sub = X[mask], y[mask]
        X_comp, y_comp = X[~mask], y[~mask]

        # Ridge on subgroup
        ridge_sub = Ridge(alpha=alpha).fit(X_sub, y_sub)
        coef_sub = ridge_sub.coef_

        # Ridge on complement
        ridge_comp = Ridge(alpha=alpha).fit(X_comp, y_comp)
        coef_comp = ridge_comp.coef_

        # Opportunity: delta * std
        delta = coef_sub - coef_comp
        std_sub = X_sub.std(axis=0)
        opp = np.abs(delta) * std_sub

        df = pd.DataFrame({
            "feature": feature_names,
            "delta_beta": delta,
            "std_subgroup": std_sub,
            "opportunity_index": opp
        }).sort_values("opportunity_index", ascending=False)

        per_feature[(groups.name, gval)] = df
        results.append({
            "group": groups.name,
            "value": gval,
            "n": len(X_sub),
            "max_opportunity": df["opportunity_index"].max(),
            "top_feature": df.iloc[0]["feature"]
        })

    summary = pd.DataFrame(results).sort_values("max_opportunity", ascending=False)
    return per_feature, summary


def run_subgroup_sweep(
    df, subgroup_specs, exclude_feats=None, alpha_grid=[1.0], min_group_n=200
):
    """
    Wrapper: run subgroup_opportunity across multiple subgroup specs.

    Parameters
    ----------
    df : DataFrame
        Clean dataset including target + subgroup columns
    subgroup_specs : list of (col, val) or (col, None)
        Subgroups to evaluate
    exclude_feats : list
        Columns to exclude from predictors
    alpha_grid : list
        Alphas to test (report best per group)
    min_group_n : int
        Minimum subgroup size
    """
    if exclude_feats is None:
        exclude_feats = []

    results = []
    per_feature_all = {}

    # Build X, y once
    y = df["los"]
    X = df.drop(columns=exclude_feats).drop(columns=["los"])

    # One-hot encode categoricals for coefficient comparability
    X_enc = pd.get_dummies(X, drop_first=True)
    feature_names = X_enc.columns

    for col, val in subgroup_specs:
        groups = df[col]

        best_alpha, best_summary, best_feat = None, None, None
        best_score = -np.inf

        for alpha in alpha_grid:
            per_feature, summary = subgroup_opportunity(
                X_enc, y, groups, alpha=alpha, min_n=min_group_n
            )
            if not summary.empty:
                # Use average max_opportunity as score
                score = summary["max_opportunity"].mean()
                if score > best_score:
                    best_alpha = alpha
                    best_summary = summary
                    best_feat = per_feature
                    best_score = score

        if best_summary is not None:
            results.append(best_summary.assign(alpha=best_alpha))
            per_feature_all.update(best_feat)

    return pd.concat(results, ignore_index=True), per_feature_all


In [188]:
# %%
# ==================================================
# Subgroup opportunity analysis (refined)
# ==================================================
subgroup_specs = [
    ("ethnicity", None),
    ("gender", None),
    ("service", None),
]

subgroup_report, per_feature_opp = run_subgroup_sweep(
    master_clean,
    subgroup_specs=subgroup_specs,
    exclude_feats=exclude_feats,
    alpha_grid=[0.1, 1, 10, 100],
    min_group_n=200
)

display(subgroup_report.head(15))


ValueError: Input X contains NaN.
Ridge does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values