In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone


In [2]:
# Location of the modelling table, relative to the notebook's working directory.
DATA = "ml_outputs/ml_dataset.csv"

# Read the whole table once; it is kept in the module-level name `df`.
df = pd.read_csv(filepath_or_buffer=DATA)

In [3]:
# ---- features ----
# Column groups from which the per-model feature sets are assembled.
# NOTE: `joint_features` is deliberately not built here. It is defined once,
# together with the cost-only and schedule-only sets, in the next cell, so the
# three per-model feature sets live side by side instead of being duplicated.

# Categorical descriptors; the modelling pipeline one-hot encodes these.
categorical = ["size_bucket", "risk_level_bucket", "late_concentration", "tail_type", "coupling"]

# Numeric descriptors shared by every feature set.
numeric_base = ["n_tasks", "n_streams", "burn_rate_per_day", "fixed_cost", "risk_prob_sum", "late_risk_prob_sum", "avg_prob"]

# Extra columns that only the joint feature set adds on top of the shared ones.
coupling_feats = [
    "E_sched_add_total", "E_cost_lump_total", "E_mul_excess_total",
    "late_E_sched_add", "late_E_cost_lump",
    "risk_cost_per_day", "late_risk_share_cost", "late_risk_share_sched",
    "expected_delay_ratio"
]


In [4]:
# Fair-comparison feature sets.
# Every set starts from the same categorical + base numeric columns.
# The cost-only set adds just the CPM cost and the schedule-only set just the
# CPM duration, so neither single-domain baseline sees the other domain's
# predictor.
cost_only_features = [*categorical, *numeric_base, "cpm_cost"]
schedule_only_features = [*categorical, *numeric_base, "cpm_duration"]
# The joint set takes both CPM predictors and appends the coupling columns.
joint_features = [
    *categorical,
    *numeric_base,
    "cpm_cost",
    "cpm_duration",
    *coupling_feats,
]


In [5]:
# Reported target name -> name of the column that holds that target.
# Insertion order is preserved, so targets are iterated in the order listed.
TARGETS = dict([
    ("duration_p90", "y_duration"),
    ("cost_p90", "y_cost"),
])

In [6]:
def make_model(model):
    """Build an unfitted preprocessing + estimator pipeline for one model spec.

    Parameters
    ----------
    model : mapping
        Must provide ``model["features"]`` (the column names the model is
        allowed to see) and ``model["estimator"]`` (the final pipeline step).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Step ``"pre"`` one-hot encodes the requested categorical columns and
        passes the remaining requested columns through unchanged; every other
        column is dropped. Step ``"m"`` is ``model["estimator"]``, inserted
        as-is (it is not cloned here).
    """
    features = list(model["features"])

    # Encode only the categorical columns this model actually asked for.
    # Selecting the full global `categorical` list would either fail on a frame
    # restricted to `features`, or silently feed the model columns it did not
    # request. Iterating over `categorical` keeps the encoded column order the
    # same as before whenever every categorical column is requested.
    cat_cols = [c for c in categorical if c in features]
    num_cols = [c for c in features if c not in categorical]

    pre = ColumnTransformer(
        transformers=[
            # Categories unseen at fit time encode as all zeros instead of raising.
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", "passthrough", num_cols),
        ],
        remainder="drop",
    )
    return Pipeline([("pre", pre), ("m", model["estimator"])])


def loo_eval(Xcols, ycol, estimator, data=None):
    """Score an estimator with leave-one-out cross-validation.

    Each row is held out once; a fresh pipeline (one-hot encoding of the
    requested categorical columns, passthrough of the other requested columns,
    then a clone of ``estimator``) is fitted on the remaining rows and used to
    predict the held-out row. Metrics are computed once over all held-out
    predictions.

    Parameters
    ----------
    Xcols : sequence of str
        Feature columns the model may use.
    ycol : str
        Name of the target column.
    estimator : scikit-learn estimator
        Unfitted template; it is cloned for every fold and never fitted itself.
    data : pandas.DataFrame, optional
        Table to evaluate on. Defaults to the module-level ``df``, which is the
        behaviour callers had before this parameter existed.

    Returns
    -------
    tuple of float
        ``(mae, rmse, r2)`` of the held-out predictions against the targets.
    """
    frame = df if data is None else data
    feature_cols = list(Xcols)
    X = frame[feature_cols]
    y = frame[ycol].values

    # Column selections are identical for every fold, so compute them once.
    # Only categorical columns that were actually requested are encoded;
    # selecting the full global list would fail (or leak columns) for a feature
    # set that omits one of them.
    cat_cols = [c for c in categorical if c in feature_cols]
    num_cols = [c for c in feature_cols if c not in categorical]

    loo = LeaveOneOut()
    preds = np.zeros_like(y, dtype=float)

    for train_idx, test_idx in loo.split(X):
        Xtr, Xte = X.iloc[train_idx], X.iloc[test_idx]
        ytr = y[train_idx]
        # The encoder is refitted on the training rows of each fold, so the
        # held-out row never influences the learned categories.
        pipe = Pipeline([
            ("pre", ColumnTransformer([
                ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
                ("num", "passthrough", num_cols),
            ])),
            ("m", clone(estimator)),
        ])
        pipe.fit(Xtr, ytr)
        # Leave-one-out test folds contain exactly one row.
        preds[test_idx[0]] = pipe.predict(Xte)[0]

    mae = mean_absolute_error(y, preds)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)
    return mae, rmse, r2


In [7]:
# Benchmark roster: (display name, feature columns, unfitted estimator).
# Each feature set is paired with a linear baseline and a random forest that
# uses the same fixed settings (400 trees, random_state=0) throughout.
models = []

# Cost-side predictors only.
models.extend([
    ("Cost-only Linear", cost_only_features, LinearRegression()),
    (
        "Cost-only RF",
        cost_only_features,
        RandomForestRegressor(n_estimators=400, random_state=0),
    ),
])

# Schedule-side predictors only.
models.extend([
    ("Schedule-only Linear", schedule_only_features, LinearRegression()),
    (
        "Schedule-only RF",
        schedule_only_features,
        RandomForestRegressor(n_estimators=400, random_state=0),
    ),
])

# Both predictors plus the coupling columns.
models.extend([
    ("Joint Linear", joint_features, LinearRegression()),
    (
        "Joint RF",
        joint_features,
        RandomForestRegressor(n_estimators=400, random_state=0),
    ),
])


In [8]:
# Evaluate every model on every target; one result record per (target, model).
rows = []
for target_label, target_col in TARGETS.items():
    for model_label, feature_cols, template in models:
        # loo_eval returns the three scores in the order MAE, RMSE, R2.
        score_mae, score_rmse, score_r2 = loo_eval(feature_cols, target_col, template)
        record = {"target": target_label, "model": model_label}
        record.update({"MAE": score_mae, "RMSE": score_rmse, "R2": score_r2})
        rows.append(record)


In [9]:
# Collect the per-model scores into one table, ordered by target and then by
# ascending RMSE so the lowest-error model for each target is listed first.
res = pd.DataFrame(rows)
res = res.sort_values(by=["target", "RMSE"])
print(res)

          target                 model            MAE          RMSE         R2
8       cost_p90  Schedule-only Linear   52073.388600  6.285846e+04   0.900783
6       cost_p90      Cost-only Linear   66903.054837  8.992011e+04   0.796964
7       cost_p90          Cost-only RF  108140.567532  1.352436e+05   0.540703
11      cost_p90              Joint RF  108986.087695  1.374177e+05   0.525817
9       cost_p90      Schedule-only RF  146023.895652  1.810732e+05   0.176680
10      cost_p90          Joint Linear  349657.794500  1.024214e+06 -25.341543
2   duration_p90  Schedule-only Linear       5.383438  7.048850e+00   0.969310
0   duration_p90      Cost-only Linear      12.002871  1.523128e+01   0.856705
5   duration_p90              Joint RF      17.375135  2.104799e+01   0.726360
3   duration_p90      Schedule-only RF      17.103608  2.184851e+01   0.705149
1   duration_p90          Cost-only RF      20.174129  2.564960e+01   0.593632
4   duration_p90          Joint Linear      41.23445

In [10]:
# Persist the benchmark table (without the index column) for the paper tables,
# then print a confirmation line.
res.to_csv(path_or_buf="ml_outputs/model_benchmark_results.csv", index=False)
print("\nSaved: ml_outputs/model_benchmark_results.csv")


Saved: ml_outputs/model_benchmark_results.csv
