In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler


In [2]:

# Location of the modelling table, given relative to the working directory.
DATA = "ml_outputs/ml_dataset.csv"
# Read the CSV into the frame that the later cells label, model and summarise.
df = pd.read_csv(filepath_or_buffer=DATA)


In [3]:

# Binary targets (decision-oriented "high-risk" flags): a project is labelled 1
# when its source value is at or above the 75th percentile of that column
# across the whole dataset, otherwise 0.
for label_col, source_col in (
    ("y_high_cost_overrun", "p_cost_over_cpm"),
    ("y_high_sched_overrun", "p_duration_over_cpm"),
):
    cutoff = df[source_col].quantile(0.75)
    df[label_col] = (df[source_col] >= cutoff).astype(int)


In [4]:

# Columns handled as categories (one-hot encoded by loo_classification).
categorical = [
    "size_bucket",
    "risk_level_bucket",
    "late_concentration",
    "tail_type",
    "coupling",
]

# Numeric columns shared by every feature set (anything not listed in
# `categorical` is standardized by loo_classification).
numeric_base = [
    "n_tasks",
    "n_streams",
    "burn_rate_per_day",
    "fixed_cost",
    "risk_prob_sum",
    "late_risk_prob_sum",
    "avg_prob",
]


In [5]:
# Features used only by the joint model. Per the original author they are
# derived from risk-register expectations and are non-leaky.
coupling_feats = [
    "E_sched_add_total",
    "E_cost_lump_total",
    "E_mul_excess_total",
    "late_E_sched_add",
    "late_E_cost_lump",
    "risk_cost_per_day",
    "late_risk_share_cost",
    "late_risk_share_sched",
    "expected_delay_ratio",
]


In [6]:
# Three competing feature sets. All of them start with the shared categorical
# and numeric columns; they differ only in which CPM baseline(s) and extra
# columns are appended.
cost_only = [*categorical, *numeric_base, "cpm_cost"]
schedule_only = [*categorical, *numeric_base, "cpm_duration"]
joint = [*categorical, *numeric_base, "cpm_cost", "cpm_duration", *coupling_feats]


In [7]:

def loo_classification(Xcols, ycol, estimator):
    """Score a binary classifier with leave-one-out cross-validation.

    Every row of the module-level ``df`` is held out once. For each fold a
    fresh clone of the preprocessing + classifier pipeline is fitted on the
    remaining rows and used to score the held-out row; the out-of-fold scores
    are then summarised over the whole dataset.

    Parameters
    ----------
    Xcols : list of str
        Feature columns to take from ``df``. Columns that also appear in the
        module-level ``categorical`` list are one-hot encoded (unseen
        categories are ignored at predict time); every other column is
        standardized.
    ycol : str
        Name of the 0/1 label column in ``df``.
    estimator : scikit-learn classifier
        Unfitted template. It is cloned for every fold and never fitted
        itself.

    Returns
    -------
    (auc, f1, acc) : tuple of float
        ROC AUC of the out-of-fold scores (NaN unless ``y`` contains exactly
        two distinct values), and F1 / accuracy of the out-of-fold predictions
        obtained by thresholding the score at 0.5.
    """
    X = df[Xcols]
    y = df[ycol].values

    # Split the *requested* columns by kind. Handing the full global
    # `categorical` list to the encoder would make ColumnTransformer fail for
    # any feature set that leaves one of those columns out. Iterating over
    # `categorical` keeps the encoded column order of the original code.
    cat_cols = [c for c in categorical if c in Xcols]
    num_cols = [c for c in Xcols if c not in categorical]

    transformers = []
    if cat_cols:
        transformers.append(("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols))
    if num_cols:
        transformers.append(("num", StandardScaler(), num_cols))

    # The pipeline definition does not depend on the fold, so build it once
    # and clone it per fold; clone() yields unfitted copies of every step.
    template = Pipeline([
        ("pre", ColumnTransformer(transformers)),
        ("m", estimator),
    ])

    probs = np.zeros_like(y, dtype=float)
    preds = np.zeros_like(y, dtype=int)

    for tr, te in LeaveOneOut().split(X):
        Xte = X.iloc[te]

        pipe = clone(template)
        pipe.fit(X.iloc[tr], y[tr])
        clf = pipe.named_steps["m"]

        # Some classifiers expose predict_proba, others decision_function
        if hasattr(clf, "predict_proba"):
            proba = pipe.predict_proba(Xte)[0]
            # Find the positive class through classes_ rather than assuming it
            # is column 1: a fold trained on a single class has one column.
            pos = np.flatnonzero(clf.classes_ == 1)
            p = float(proba[pos[0]]) if pos.size else 0.0
        else:
            # convert decision score to probability-ish via sigmoid
            s = pipe.decision_function(Xte)[0]
            p = 1.0 / (1.0 + np.exp(-s))

        # LeaveOneOut test folds hold exactly one row index.
        probs[te[0]] = p
        preds[te[0]] = int(p >= 0.5)

    # ROC AUC is undefined unless both classes are present in y.
    auc = roc_auc_score(y, probs) if len(np.unique(y)) == 2 else np.nan
    f1 = f1_score(y, preds, zero_division=0)
    acc = accuracy_score(y, preds)
    return auc, f1, acc


In [8]:
# Each collection below is a list of (display name, value) pairs, built from
# an insertion-ordered dict so the iteration order matches the order written.

# Classifier templates; loo_classification clones them for every fold.
models = list({
    "LogReg": LogisticRegression(max_iter=5000),
    "RF": RandomForestClassifier(n_estimators=400, random_state=0),
}.items())

# Prediction targets: display name -> label column in df.
experiments = list({
    "High Cost Overrun": "y_high_cost_overrun",
    "High Schedule Overrun": "y_high_sched_overrun",
}.items())

# Feature sets to compare: display name -> list of feature columns.
feature_sets = list({
    "Cost-only": cost_only,
    "Schedule-only": schedule_only,
    "Joint": joint,
}.items())

In [9]:
# Evaluate every (target, feature set, model) combination with leave-one-out
# CV and collect one record per combination.
METRIC_NAMES = ("ROC_AUC", "F1", "Accuracy")

rows = []
for exp_name, ycol in experiments:
    for fs_name, feats in feature_sets:
        for mname, est in models:
            # loo_classification returns (auc, f1, acc), in that order.
            metrics = loo_classification(feats, ycol, est)
            record = {"experiment": exp_name, "features": fs_name, "model": mname}
            record.update(zip(METRIC_NAMES, metrics))
            rows.append(record)

# One row per combination; columns follow the key order of the records.
res = pd.DataFrame(rows)

In [10]:

# Reference point for each target: the accuracy of always predicting the more
# frequent class. ROC_AUC and F1 are left as NaN for this pseudo-model.
baseline_rows = []
for exp_name, ycol in experiments:
    y = df[ycol].values
    positive_rate = y.mean()
    # Majority-class accuracy is the larger of the two class shares.
    baseline_acc = max(positive_rate, 1 - positive_rate)
    baseline_record = {
        "experiment": exp_name,
        "features": "Baseline",
        "model": "Always-majority",
        "ROC_AUC": np.nan,
        "F1": np.nan,
        "Accuracy": baseline_acc,
    }
    baseline_rows.append(baseline_record)


In [11]:
# Combine the model results with the majority-class baseline. The table is
# rebuilt from `rows` instead of appending to `res`, so this cell is
# idempotent: re-running it does not stack another copy of the baseline rows.
res = pd.concat([pd.DataFrame(rows), pd.DataFrame(baseline_rows)], ignore_index=True)
# Best ROC_AUC first within each experiment; the baseline's NaN ROC_AUC is
# placed last in its group by sort_values.
res = res.sort_values(["experiment","ROC_AUC"], ascending=[True, False])
print(res)


               experiment       features            model   ROC_AUC        F1  \
5       High Cost Overrun          Joint               RF  0.694444  0.200000   
4       High Cost Overrun          Joint           LogReg  0.675926  0.500000   
1       High Cost Overrun      Cost-only               RF  0.643519  0.181818   
0       High Cost Overrun      Cost-only           LogReg  0.629630  0.333333   
2       High Cost Overrun  Schedule-only           LogReg  0.629630  0.333333   
3       High Cost Overrun  Schedule-only               RF  0.615741  0.200000   
12      High Cost Overrun       Baseline  Always-majority       NaN       NaN   
9   High Schedule Overrun  Schedule-only               RF  0.805556  0.181818   
10  High Schedule Overrun          Joint           LogReg  0.805556  0.500000   
11  High Schedule Overrun          Joint               RF  0.805556  0.200000   
8   High Schedule Overrun  Schedule-only           LogReg  0.787037  0.363636   
7   High Schedule Overrun   

In [12]:
# Persist the results table (without the index column) and report the path.
res.to_csv(path_or_buf="ml_outputs/classification_results.csv", index=False)
print("\nSaved: ml_outputs/classification_results.csv")

# Mean of each 0/1 label column, i.e. the share of rows labelled 1.
label_balance = df[["y_high_cost_overrun","y_high_sched_overrun"]].mean()
print("\nLabel balance:")
print(label_balance)



Saved: ml_outputs/classification_results.csv

Label balance:
y_high_cost_overrun     0.25
y_high_sched_overrun    0.25
dtype: float64
