In [1]:
from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
import numpy as np
import pandas as pd


In [2]:

# =========================
# Configuration
# =========================

@dataclass
class GeneratorConfig:
    """
    Settings for the synthetic project generator.

    n_projects: number of projects to generate. The original note asks for a
        minimum of 20 and suggests 24 or 30 for grouping; no check in the code
        shown here enforces that minimum.
    seed: seed handed to the NumPy random generator, so a given configuration
        reproduces the same data.
    out_dir: directory the CSV outputs are written to (created if missing).
    min_tasks / max_tasks: template controls reserved for later tuning.
        NOTE(review): not read by the generator code visible in this section.
    """
    n_projects: int = 24             # minimum 20; 24/30 is nicer for grouping
    seed: int = 42
    out_dir: str = "."
    # templates control: you can tweak later
    min_tasks: int = 14
    max_tasks: int = 22


In [3]:
# =========================
# Project Template Builder
# =========================

# We use a "phase DAG": requirements -> design -> build (parallel streams) -> test -> deploy.
# PHASES lists the phase labels in nominal lifecycle order; every phase string used in
# BASE_TASK_LIBRARY appears here.
# Each project will have a different number of build streams (streams B and C are optional).
# NOTE(review): list position is informational only in the code shown here; the actual
# task ordering written to the output comes from BASE_DEPS.
PHASES = [
    "INIT",
    "REQ",
    "DESIGN",
    "GOV",       # security governance / policy / compliance planning
    "BUILD_A",   # parallel stream A
    "BUILD_B",   # parallel stream B (optional)
    "BUILD_C",   # parallel stream C (optional)
    "INTEGRATE",
    "DOC",
    "UAT",
    "SEC_TEST",  # pen test / security testing gate
    "REMEDIATE",
    "PREPROD",
    "PILOT",
    "ROLLOUT",
]


In [4]:
# Task catalogue: one tuple per task, in the form
#   (task_code, task_name, phase, base_duration_scale_weight)
# The weight is a relative duration factor: generate_projects multiplies it by a
# 6-day anchor and by the project's size scale to obtain the task's base duration.
BASE_TASK_LIBRARY = [
    ("T_INIT", "Project Initiation", "INIT", 1.0),
    ("T_REQ", "Requirements Gathering", "REQ", 1.2),
    ("T_DES", "Solution Architecture & Design", "DESIGN", 1.1),
    ("T_GOV", "Security Policy / Governance", "GOV", 1.0),

    # Build streams: A is always included; B and C depend on project size.
    ("T_BA", "Core Implementation Stream A", "BUILD_A", 2.0),
    ("T_BB", "Implementation Stream B", "BUILD_B", 1.6),
    ("T_BC", "Implementation Stream C", "BUILD_C", 1.4),

    ("T_INT", "Systems Integration", "INTEGRATE", 1.3),
    ("T_DOC", "Compliance Documentation", "DOC", 1.0),

    ("T_UAT", "User Acceptance Testing", "UAT", 1.2),
    ("T_SEC", "Security Testing / Penetration Testing", "SEC_TEST", 1.3),
    ("T_REM", "Vulnerability Remediation", "REMEDIATE", 1.2),

    ("T_PRE", "Pre-Production Validation", "PREPROD", 1.0),
    ("T_PIL", "Pilot Deployment", "PILOT", 1.0),
    ("T_ROL", "Full Production Rollout", "ROLLOUT", 0.9),
]


In [5]:
# Core dependencies (DAG): maps each task_code to the task_codes that must finish first.
# We will add/omit certain tasks depending on project size.
# The empty lists for T_INT and T_UAT are placeholders: generate_projects overwrites
# them per project (T_INT <- the included build streams, T_UAT <- T_INT and T_DOC).
BASE_DEPS = {
    "T_INIT": [],
    "T_REQ": ["T_INIT"],
    "T_DES": ["T_REQ"],
    "T_GOV": ["T_DES"],

    # All build streams start once design and governance are done.
    "T_BA": ["T_DES", "T_GOV"],
    "T_BB": ["T_DES", "T_GOV"],
    "T_BC": ["T_DES", "T_GOV"],

    "T_INT": [],   # will be set based on included build streams
    "T_DOC": ["T_GOV"],

    "T_UAT": [],   # will depend on integration + doc
    "T_SEC": ["T_UAT"],
    "T_REM": ["T_SEC"],

    # Deployment chain is strictly sequential.
    "T_PRE": ["T_REM"],
    "T_PIL": ["T_PRE"],
    "T_ROL": ["T_PIL"],
}


In [6]:
# =========================
# Risk Model
# =========================

# We model risks as events with:
# - probability p (one base value per risk level: low / medium / high)
# - schedule impact: additive days on target tasks ("ADD") OR multiplicative factor
#   on target tasks ("MUL")
# - cost impact: lump sum + optional per-day uplift (kept simple for now)
#   NOTE(review): the generator code in this section emits only lump-sum cost parameters.

RISK_LIBRARY = [
    # risk_id, name, type, timing_bias, base_p_low, base_p_med, base_p_high
    # type is "ADD" or "MUL"; timing_bias is "EARLY", "MID" or "LATE".
    ("R_COMP", "Compliance Change (policy/regulatory)", "ADD", "EARLY", 0.10, 0.25, 0.40),
    ("R_VEND", "Vendor / third-party integration delay", "ADD", "MID",   0.15, 0.30, 0.45),
    ("R_VULN", "Critical vulnerability discovered late", "MUL", "LATE",  0.08, 0.20, 0.35),
    ("R_INCI", "Pilot security incident / rollback",     "ADD", "LATE",  0.05, 0.12, 0.22),
]

In [7]:
# Target tasks per risk: risk_id -> task_codes the risk's impact applies to.
# generate_projects keeps only the targets that are present in a given project's
# task list, so optional tasks missing from a project are dropped from the output.
RISK_TARGETS = {
    "R_COMP": ["T_GOV", "T_DOC"],
    "R_VEND": ["T_INT"],
    "R_VULN": ["T_BA", "T_INT", "T_SEC", "T_REM"],   # rework spreads
    "R_INCI": ["T_PIL", "T_ROL"],
}


In [8]:
# =========================
# Utility Functions
# =========================

def rng_for(seed: int) -> np.random.Generator:
    """Create a fresh NumPy ``Generator`` seeded with ``seed``."""
    generator = np.random.default_rng(seed)
    return generator

def triangular_params_from_scale(
    base_days: float,
    uncertainty: float,
    r: np.random.Generator
) -> Tuple[float, float, float]:
    """
    Build a three-point estimate (optimistic, most likely, pessimistic) around base_days.

    uncertainty (intended range roughly 0.15..0.60) scales the spread on both sides.
    Exactly three uniform draws are taken from r, in this order: drift of the
    most-likely value, low-side factor, high-side factor.
    Returns the three values rounded to 2 decimals, in ascending order.
    """
    # Most-likely value: base_days with up to +/-10% drift, never below one day.
    likely = max(1.0, base_days * r.uniform(0.90, 1.10))

    half_width = base_days * uncertainty

    # Optimistic side is floored at one day; pessimistic side sits at least one day above the mode.
    optimistic = max(1.0, likely - half_width * r.uniform(0.7, 1.1))
    pessimistic = max(likely + 1.0, likely + half_width * r.uniform(0.7, 1.3))

    # Safety net for unusual inputs (e.g. negative uncertainty): restore O <= M <= P.
    if not (optimistic <= likely <= pessimistic):
        ordered = sorted([optimistic, likely, pessimistic])
        optimistic, likely, pessimistic = ordered

    return (round(optimistic, 2), round(likely, 2), round(pessimistic, 2))

def pick_project_profile(i: int, r: np.random.Generator) -> Dict[str, str]:
    """
    Assign project i a profile that mixes deterministic and random traits.

    Risk level and size cycle with the project index (period 4 and 3), so the
    buckets are evenly populated. Late concentration, tail type and coupling are
    drawn from r, except that EXTREME projects are always late-heavy with a heavy
    tail and therefore consume only the coupling draw.
    """
    risk_levels = ["LOW", "MED", "HIGH", "EXTREME"]
    sizes = ["SMALL", "MEDIUM", "LARGE"]

    risk_level = risk_levels[i % 4]
    size_bucket = sizes[i % 3]

    if bucket_is_random := (risk_level != "EXTREME"):
        # Draw order matters for reproducibility: late concentration first, then tail.
        late_concentration = r.choice(["BALANCED", "LATE_HEAVY"], p=[0.6, 0.4])
        tail_type = r.choice(["NORMAL_TAIL", "HEAVY_TAIL"], p=[0.75, 0.25])
    else:
        late_concentration, tail_type = "LATE_HEAVY", "HEAVY_TAIL"

    # Coupling: how strongly cost and schedule are tied together.
    coupling_strength = r.choice(["WEAK", "STRONG"], p=[0.45, 0.55])

    profile = {}
    profile["risk_level"] = risk_level
    profile["size"] = size_bucket
    profile["late_concentration"] = late_concentration
    profile["tail"] = tail_type
    profile["coupling"] = coupling_strength
    return profile

def included_streams(size: str, r: np.random.Generator) -> List[str]:
    """
    Pick the build-stream task codes for a project of the given size.

    SMALL:  stream A, with a 30% chance of also getting B.
    MEDIUM: streams A+B, with a 30% chance of also getting C.
    Any other size (LARGE): all three streams, without consuming a random draw.
    """
    all_streams = ["T_BA", "T_BB", "T_BC"]

    if size == "SMALL":
        stream_count = 1 if r.random() < 0.7 else 2
    elif size == "MEDIUM":
        stream_count = 2 if r.random() < 0.7 else 3
    else:
        stream_count = 3

    # Slicing yields a new list on every call, so callers may keep or modify it freely.
    return all_streams[:stream_count]



In [9]:
# =========================
# Core Generator
# =========================

# Column order of the three output tables. Passing these to the DataFrame
# constructor keeps the schema intact even when no rows are generated
# (e.g. n_projects == 0), so downstream column access and CSV headers still work.
_PROJECT_COLUMNS = [
    "project_id", "size_bucket", "risk_level_bucket", "late_concentration",
    "tail_type", "coupling", "burn_rate_per_day", "fixed_cost", "n_tasks", "n_streams",
]
_TASK_COLUMNS = [
    "project_id", "task_code", "task_name", "phase",
    "opt_o", "likely_m", "pess_p", "pred",
]
_RISK_COLUMNS = [
    "project_id", "risk_id", "risk_name", "risk_type", "probability", "targets",
    "sched_add_tri_o", "sched_add_tri_m", "sched_add_tri_p",
    "mul_logn_mu", "mul_logn_sigma",
    "cost_lump_tri_o", "cost_lump_tri_m", "cost_lump_tri_p",
]

# Base anchor (days) that task weights and the project size scale are applied to.
_BASE_ANCHOR_DAYS = 6.0

# Phases whose tasks get extra spread in LATE_HEAVY projects.
_LATE_PHASES = ("SEC_TEST", "REMEDIATE", "PILOT", "ROLLOUT")


def _risk_row(project_id: str, risk_def: tuple, profile: Dict[str, str], included_tasks: List[str]) -> dict:
    """Build the risks-table row for one (project, risk) pair; draws no random numbers."""
    risk_id, rname, rtype, timing, p_low, p_med, p_high = risk_def
    rl = profile["risk_level"]
    is_late = timing == "LATE"
    heavy_tail = profile["tail"] == "HEAVY_TAIL"

    # EXTREME uses the "high" base probabilities but gets stronger impacts below.
    col = {"LOW": 0, "MED": 1, "HIGH": 2, "EXTREME": 2}[rl]
    p_base = [p_low, p_med, p_high][col]

    # If late-heavy, increase late risk probabilities slightly.
    if profile["late_concentration"] == "LATE_HEAVY" and is_late:
        p_base = min(0.60, p_base + 0.06)

    # EXTREME projects: late risks become more likely and more severe.
    impact_multiplier = 1.0
    if rl == "EXTREME" and is_late:
        p_base = min(0.65, p_base + 0.08)
        impact_multiplier = 1.35

    # Impact distributions are stored as parameters for the Phase 2 engine.
    if rtype == "ADD":
        # ADD risks: triangular additive days plus a triangular cost lump sum.
        risk_type = "ADD"
        add_tri = (1.0, 6.0, 24.0 if is_late else 16.0)
        cost_tri = (2000.0, 12000.0, 45000.0 if is_late else 30000.0)
        mul_mu = np.nan
        mul_sigma = np.nan
    else:
        # MUL risks (critical vulnerability): lognormal duration factor, plus
        # some additive days and a cost lump sum. Heavy tails widen sigma.
        risk_type = "MUL"
        add_tri = (1.0, 5.0, 18.0)
        cost_tri = (5000.0, 18000.0, 70000.0 if heavy_tail else 45000.0)
        mu = 0.12 * impact_multiplier
        sigma = 0.28 * impact_multiplier
        if heavy_tail:
            sigma = min(0.85, sigma + 0.12)
        mul_mu = round(float(mu), 4)
        mul_sigma = round(float(sigma), 4)

    add_o, add_m, add_p = (round(v * impact_multiplier, 2) for v in add_tri)
    cost_o, cost_m, cost_p = (round(v * impact_multiplier, 2) for v in cost_tri)

    return {
        "project_id": project_id,
        "risk_id": risk_id,
        "risk_name": rname,
        "risk_type": risk_type,
        "probability": round(float(p_base), 4),
        # Only targets that exist in this project's task list are kept.
        "targets": ";".join([t for t in RISK_TARGETS[risk_id] if t in included_tasks]),
        "sched_add_tri_o": add_o,
        "sched_add_tri_m": add_m,
        "sched_add_tri_p": add_p,
        "mul_logn_mu": mul_mu,
        "mul_logn_sigma": mul_sigma,
        "cost_lump_tri_o": cost_o,
        "cost_lump_tri_m": cost_m,
        "cost_lump_tri_p": cost_p,
    }


def generate_projects(cfg: GeneratorConfig) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Generate the synthetic portfolio described by cfg.

    Only cfg.n_projects and cfg.seed are read here. A single random generator
    seeded with cfg.seed drives every draw, so the same configuration always
    yields the same tables. The order of random draws per project (profile,
    size scale and uncertainty, burn rate and fixed cost, build streams,
    per-task estimates) must stay as is to keep seeded output stable; the risk
    rows are derived deterministically from the profile.

    Returns (projects_df, tasks_df, risks_df):
      projects_df: one row per project (buckets, cost parameters, task/stream counts).
      tasks_df:    one row per task with 3-point estimates and ';'-joined predecessors.
      risks_df:    one row per (project, risk) with probability and impact parameters.
    The frames always carry their full column set, even when n_projects is 0.
    """
    r = rng_for(cfg.seed)

    # Loop-invariant lookup: task_code -> (task_name, phase, weight)
    task_defs = {code: (name, phase, w) for code, name, phase, w in BASE_TASK_LIBRARY}

    projects_rows = []
    tasks_rows = []
    risks_rows = []

    for i in range(cfg.n_projects):
        project_id = f"P{i+1:02d}"
        profile = pick_project_profile(i, r)

        # Baseline scale and uncertainty by size
        size = profile["size"]
        if size == "SMALL":
            base_scale = r.uniform(0.7, 1.0)
            base_uncertainty = r.uniform(0.18, 0.32)
        elif size == "MEDIUM":
            base_scale = r.uniform(1.0, 1.35)
            base_uncertainty = r.uniform(0.22, 0.40)
        else:  # LARGE
            base_scale = r.uniform(1.35, 1.75)
            base_uncertainty = r.uniform(0.28, 0.55)

        # Tail heaviness increases uncertainty
        if profile["tail"] == "HEAVY_TAIL":
            base_uncertainty = min(0.65, base_uncertainty + 0.10)

        # Burn rate and fixed cost (synthetic but realistic ranges)
        burn_rate = float(r.uniform(1800, 6500))
        fixed_cost = float(r.uniform(15000, 120000))

        # Include build streams based on size
        streams = included_streams(size, r)

        # Task set: fixed front end, the chosen build streams, fixed back end
        included_tasks = (
            ["T_INIT", "T_REQ", "T_DES", "T_GOV"]
            + streams
            + ["T_INT", "T_DOC", "T_UAT", "T_SEC", "T_REM", "T_PRE", "T_PIL", "T_ROL"]
        )

        # Per-project dependencies; the shallow copy is enough because entries
        # are replaced, never mutated in place.
        deps = dict(BASE_DEPS)
        deps["T_INT"] = streams  # integrate waits for all streams
        deps["T_UAT"] = ["T_INT", "T_DOC"]

        late_heavy = profile["late_concentration"] == "LATE_HEAVY"

        # Generate 3-point estimates per task
        for tcode in included_tasks:
            name, phase, weight = task_defs[tcode]
            base_days = _BASE_ANCHOR_DAYS * weight * base_scale

            # Late-stage tasks tend to have more uncertainty in late-heavy projects
            uncertainty = base_uncertainty
            if late_heavy and phase in _LATE_PHASES:
                uncertainty = min(0.70, uncertainty + 0.12)

            o, m, p = triangular_params_from_scale(base_days, uncertainty, r)

            tasks_rows.append({
                "project_id": project_id,
                "task_code": tcode,
                "task_name": name,
                "phase": phase,
                "opt_o": o,
                "likely_m": m,
                "pess_p": p,
                "pred": ";".join(deps.get(tcode, [])),
            })

        # One risk row per library entry, parameterised by the project profile
        for risk_def in RISK_LIBRARY:
            risks_rows.append(_risk_row(project_id, risk_def, profile, included_tasks))

        projects_rows.append({
            "project_id": project_id,
            "size_bucket": size,
            "risk_level_bucket": profile["risk_level"],
            "late_concentration": profile["late_concentration"],
            "tail_type": profile["tail"],
            "coupling": profile["coupling"],
            "burn_rate_per_day": round(burn_rate, 2),
            "fixed_cost": round(fixed_cost, 2),
            "n_tasks": len(included_tasks),
            "n_streams": len(streams),
        })

    # Explicit columns: identical to the inferred order when rows exist, and
    # preserves the schema when there are none.
    projects_df = pd.DataFrame(projects_rows, columns=_PROJECT_COLUMNS)
    tasks_df = pd.DataFrame(tasks_rows, columns=_TASK_COLUMNS)
    risks_df = pd.DataFrame(risks_rows, columns=_RISK_COLUMNS)

    return projects_df, tasks_df, risks_df


In [10]:
def save_outputs(cfg: GeneratorConfig, projects_df: pd.DataFrame, tasks_df: pd.DataFrame, risks_df: pd.DataFrame) -> None:
    """
    Write the three tables as CSV files into cfg.out_dir and print their paths.

    The directory is created if it does not exist. Files are named projects.csv,
    tasks.csv and risks.csv and are written without the DataFrame index.
    """
    import os
    os.makedirs(cfg.out_dir, exist_ok=True)

    # Resolve all destinations first, then write in the same fixed order.
    outputs = [
        (projects_df, os.path.join(cfg.out_dir, "projects.csv")),
        (tasks_df, os.path.join(cfg.out_dir, "tasks.csv")),
        (risks_df, os.path.join(cfg.out_dir, "risks.csv")),
    ]

    for frame, destination in outputs:
        frame.to_csv(destination, index=False)

    print("Saved:")
    for _, destination in outputs:
        print(" -", destination)


if __name__ == "__main__":
    # Generate the 24-project portfolio with a fixed seed and persist it as CSV.
    cfg = GeneratorConfig(n_projects=24, seed=42, out_dir="synthetic_data")
    projects_df, tasks_df, risks_df = generate_projects(cfg)
    save_outputs(cfg, projects_df, tasks_df, risks_df)

    # quick sanity summaries
    print("\nProject bucket counts:")
    bucket_sizes = projects_df.groupby(["risk_level_bucket", "size_bucket"]).size()
    print(bucket_sizes.sort_values(ascending=False).head(12))

    print("\nExample tasks (first project):")
    task_columns = ["task_code","task_name","opt_o","likely_m","pess_p","pred"]
    first_project_tasks = tasks_df[tasks_df["project_id"] == "P01"]
    print(first_project_tasks[task_columns])

    print("\nExample risks (first project):")
    risk_columns = ["risk_id","probability","risk_type","targets"]
    first_project_risks = risks_df[risks_df["project_id"] == "P01"]
    print(first_project_risks[risk_columns])

Saved:
 - synthetic_data\projects.csv
 - synthetic_data\tasks.csv
 - synthetic_data\risks.csv

Project bucket counts:
risk_level_bucket  size_bucket
EXTREME            LARGE          2
                   MEDIUM         2
                   SMALL          2
HIGH               LARGE          2
                   MEDIUM         2
                   SMALL          2
LOW                LARGE          2
                   MEDIUM         2
                   SMALL          2
MED                LARGE          2
                   MEDIUM         2
                   SMALL          2
dtype: int64

Example tasks (first project):
   task_code                               task_name  opt_o  likely_m  pess_p  \
0     T_INIT                      Project Initiation   4.12      5.05    6.05   
1      T_REQ                  Requirements Gathering   5.89      7.11    8.61   
2      T_DES          Solution Architecture & Design   5.02      5.93    7.13   
3      T_GOV            Security Policy / Governan