In [1]:
import pandas as pd
import numpy as np
import os

DATA_DIR = "synthetic_data"
MC_DIR = "mc_outputs"
OUT_DIR = "ml_outputs"

os.makedirs(OUT_DIR, exist_ok=True)

In [2]:
projects = pd.read_csv(f"{DATA_DIR}/projects.csv")
risks = pd.read_csv(f"{DATA_DIR}/risks.csv")
summary = pd.read_csv(f"{MC_DIR}/project_summary.csv")

In [3]:
# ---- mark late risks ----
risks["is_late"] = risks["risk_id"].isin(["R_VULN", "R_INCI"]).astype(int)



In [4]:
# ---- basic risk aggregates (safe) ----
risk_agg = risks.groupby("project_id").agg(
    risk_count=("risk_id", "count"),
    risk_prob_sum=("probability", "sum"),
    late_risk_prob_sum=("probability", lambda s: float((s * risks.loc[s.index, "is_late"]).sum())),
    avg_prob=("probability", "mean"),
).reset_index()

In [5]:
# ---- expectation-based risk exposures (safe; no Monte Carlo outputs used) ----
# Expected value of triangular(a,m,b) is (a+m+b)/3
risks["sched_add_mean"] = (risks["sched_add_tri_o"] + risks["sched_add_tri_m"] + risks["sched_add_tri_p"]) / 3.0
risks["cost_lump_mean"] = (risks["cost_lump_tri_o"] + risks["cost_lump_tri_m"] + risks["cost_lump_tri_p"]) / 3.0

# Probability-weighted expectations
risks["E_sched_add"] = risks["probability"] * risks["sched_add_mean"]
risks["E_cost_lump"] = risks["probability"] * risks["cost_lump_mean"]

# For MUL risks: expected multiplier = exp(mu + 0.5*sigma^2)
risks["is_mul"] = (risks["risk_type"] == "MUL").astype(int)
risks["E_mul_factor"] = risks["is_mul"] * np.exp(
    risks["mul_logn_mu"].fillna(0) + 0.5 * (risks["mul_logn_sigma"].fillna(0) ** 2)
)
risks["E_mul_excess"] = risks["probability"] * (risks["E_mul_factor"] - 1.0)

risk_exp = risks.groupby("project_id").agg(
    E_sched_add_total=("E_sched_add", "sum"),
    E_cost_lump_total=("E_cost_lump", "sum"),
    E_mul_excess_total=("E_mul_excess", "sum"),
    late_E_sched_add=("E_sched_add", lambda s: float((s * risks.loc[s.index, "is_late"]).sum())),
    late_E_cost_lump=("E_cost_lump", lambda s: float((s * risks.loc[s.index, "is_late"]).sum())),
).reset_index()

In [6]:
# ---- merge into one ML table (df must be created BEFORE merging risk_exp) ----
projects_keep = projects[["project_id"]]  # optional: keep only id to avoid duplicates


In [7]:
# ---- merge into one ML table ----
df = (
    summary
    .merge(projects_keep, on="project_id", how="left")
    .merge(risk_agg, on="project_id", how="left")
    .merge(risk_exp, on="project_id", how="left")
)

In [8]:
# ---- targets (Monte Carlo outputs are targets only) ----
df["y_duration"] = df["mc_duration_p90"]
df["y_cost"] = df["mc_cost_p90"]
df["y_sched_over"] = (df["p_duration_over_cpm"] > 0.5).astype(int)
df["y_cost_over"] = (df["p_cost_over_cpm"] > 0.5).astype(int)


In [9]:
# ---- joint-only coupling features (NO Monte Carlo outputs used) ----
eps = 1e-9
df["risk_cost_per_day"] = df["E_cost_lump_total"] / (df["E_sched_add_total"] + eps)
df["late_risk_share_cost"] = df["late_E_cost_lump"] / (df["E_cost_lump_total"] + eps)
df["late_risk_share_sched"] = df["late_E_sched_add"] / (df["E_sched_add_total"] + eps)
df["expected_delay_ratio"] = df["E_sched_add_total"] / (df["cpm_duration"] + eps)


In [10]:
# Save
out_path = f"{OUT_DIR}/ml_dataset.csv"
df.to_csv(out_path, index=False)

In [11]:
print("Saved:", out_path)
print("\nNew joint-only features (first 5):")
print(df[["project_id","E_sched_add_total","E_cost_lump_total","E_mul_excess_total",
          "risk_cost_per_day","late_risk_share_cost","late_risk_share_sched","expected_delay_ratio"]].head(5))

print("\nHead (paper-friendly):")
print(df[["project_id","risk_level_bucket","size_bucket","risk_prob_sum","cpm_duration","y_duration","cpm_cost","y_cost"]].head(10))

Saved: ml_outputs/ml_dataset.csv

New joint-only features (first 5):
  project_id  E_sched_add_total  E_cost_lump_total  E_mul_excess_total  \
0        P12          16.830667       42531.166667           -1.047763   
1        P24          16.830667       42531.166667           -1.047763   
2        P08          16.830667       42531.166667           -1.047763   
3        P20          16.830667       42531.166667           -1.047763   
4        P04          16.830667       42531.166667           -1.047763   

   risk_cost_per_day  late_risk_share_cost  late_risk_share_sched  \
0        2527.004278              0.706882                0.61281   
1        2527.004278              0.706882                0.61281   
2        2527.004278              0.706882                0.61281   
3        2527.004278              0.706882                0.61281   
4        2527.004278              0.706882                0.61281   

   expected_delay_ratio  
0              0.142464  
1              0.11