In [31]:
from __future__ import annotations

from pathlib import Path
import json
import numpy as np
import pandas as pd
import yaml


In [32]:
# =============================================================================
# OLEFINS — Runner driven by config_olefins.yaml
# =============================================================================
# This cell loads the YAML configuration and exposes every setting as an
# UPPER_CASE module-level constant used by the following cells.

CONFIG_PATH = Path("config_olefins.yaml")  # <-- adjust if needed

with CONFIG_PATH.open("r", encoding="utf-8") as f:
    CFG = yaml.safe_load(f)

# An empty YAML file yields None; fail early with a readable message instead of
# a TypeError on the first subscript below.
if not isinstance(CFG, dict):
    raise ValueError(f"{CONFIG_PATH}: configuration YAML vide ou invalide (mapping attendu).")

# -----------------------------
# YAML parameters
# -----------------------------
YEAR_START = int(CFG["years"]["start"])
YEAR_END = int(CFG["years"]["end"])
# An inverted range would give an empty YEARS array and only fail much later
# with an unrelated error, so it is rejected here.
if YEAR_END < YEAR_START:
    raise ValueError(f"years.end ({YEAR_END}) doit être >= years.start ({YEAR_START}).")
YEARS = np.arange(YEAR_START, YEAR_END + 1)

INP = CFG["input"]
OLEFINS_CSV = Path(INP["csv_path"])
BASE_YEAR = int(INP["base_year"])
COUNTRIES_SEL = INP.get("countries", "ALL")
ACTIVITY_COL_CONF = str(INP.get("activity_column", "olefins_production")).strip()

OUT = CFG["output"]
OUT_BASE_DIR = Path(OUT["base_dir"])
SCEN_DIRS = {k: OUT_BASE_DIR / v for k, v in OUT["scenario_dirs"].items()}
OUT_NAME = str(OUT.get("output_csv_name", "olefins.csv"))
SCOPE_TAG = str(OUT.get("scope", "petrochem_olefins"))

# Optional sections: `or {}` also covers keys present in the YAML but left
# empty (parsed as None), which `.get(key, {})` alone does not handle.
MAP = (CFG.get("scenario_mapping") or {}).get("refinery_to_olefins") or {}
LINK = CFG.get("linkage") or {}
LINK_ENABLED = bool(LINK.get("enabled", False))

H2M = CFG["h2_demand_model"]
H2_INT_2019 = float(H2M["intensity_2019_tH2_per_activity_unit"])

INT_TR = H2M["intensity_trajectory"]
INT_START = int(INT_TR["start"])
INT_END = int(INT_TR["end"])
INT_FACT_2050 = {k: float(v) for k, v in INT_TR["scenario_factors_2050"].items()}

ACT_TR = H2M["activity_trajectory_if_no_linkage"]
ACT_START = int(ACT_TR["start"])
ACT_END = int(ACT_TR["end"])
ACT_DECL_2050 = {k: float(v) for k, v in ACT_TR["scenario_decline_2050"].items()}

CHK = CFG.get("checks") or {}
CHK_NONNEG = bool(CHK.get("non_negative", True))
CHK_COV = bool(CHK.get("full_year_coverage", True))
CHK_SCOPE = bool(CHK.get("enforce_scope_tag", True))
CHK_WARN_DIVERGE = bool(CHK.get("warn_if_diverges_from_refinery_index", True))


In [34]:
# =============================================================================
# Utils
# =============================================================================
def linear_ramp(years: np.ndarray, start: int, end: int) -> np.ndarray:
    """Return a 0→1 ramp over ``years``, rising linearly between ``start`` and ``end``.

    Values are 0.0 up to ``start`` and 1.0 from ``end`` onwards. When the
    interval is empty or inverted (``end <= start``), the ramp degenerates to a
    step function that is 1.0 for every year ``>= start``.
    """
    span = end - start
    if span <= 0:
        # Degenerate interval: step function instead of a division by zero.
        return (years >= start).astype(float)
    progress = (years - start) / span
    return np.minimum(1.0, np.maximum(0.0, progress))


def check_coverage(df: pd.DataFrame, y0: int, y1: int, keys=("country", "scenario")) -> None:
    """Check that every group of ``df`` covers each year from ``y0`` to ``y1``.

    Groups are formed on ``keys``. A group passes when its distinct years start
    at ``y0``, end at ``y1`` and number exactly ``y1 - y0 + 1``.

    Raises:
        ValueError: for the first group whose year coverage is incomplete.
    """
    n_expected = y1 - y0 + 1
    for group_key, group in df.groupby(list(keys)):
        seen = np.sort(group["year"].unique())
        first, last, n_seen = seen[0], seen[-1], len(seen)
        complete = first == y0 and last == y1 and n_seen == n_expected
        if not complete:
            raise ValueError(
                f"Couverture années KO pour {group_key}: {first}..{last} n={n_seen} (attendu {n_expected})."
            )


def detect_activity_column(df: pd.DataFrame, preferred: str) -> str:
    """Pick the column of ``df`` that holds the olefins activity.

    ``preferred`` wins when it is present. Otherwise the function looks for
    string column names containing "olefins" together with "production" or
    "activity" (case-insensitive) and returns the match if it is unique.

    Raises:
        ValueError: when several candidate columns match, or none does.
    """
    if preferred in df.columns:
        return preferred

    def _looks_like_activity(col) -> bool:
        # Non-string labels can never match the keyword heuristic.
        if not isinstance(col, str):
            return False
        lowered = col.lower()
        return "olefins" in lowered and ("production" in lowered or "activity" in lowered)

    matches = list(filter(_looks_like_activity, df.columns))
    if not matches:
        raise ValueError(f"Impossible d'identifier la colonne d'activité. Colonnes: {list(df.columns)}")
    if len(matches) > 1:
        raise ValueError(f"Colonnes d'activité ambiguës: {matches} (précisez input.activity_column).")
    return matches[0]


def read_olefins_raw(path: Path) -> pd.DataFrame:
    """Load the olefins CSV and normalise its ``country`` and ``year`` columns.

    Country codes are converted to stripped upper-case strings. Years are
    parsed as numbers; rows whose year cannot be parsed are dropped and the
    remaining years are cast to ``int``. All other columns are returned as read.

    Raises:
        ValueError: if the file lacks a ``country`` or a ``year`` column.
    """
    raw = pd.read_csv(path)
    if not {"country", "year"}.issubset(raw.columns):
        raise ValueError(f"{path}: colonnes requises manquantes (attendu au moins country, year).")

    normalised = raw.assign(
        country=raw["country"].astype(str).str.strip().str.upper(),
        year=pd.to_numeric(raw["year"], errors="coerce"),
    )
    # Unparseable years became NaN above; discard them before the integer cast.
    return normalised.dropna(subset=["year"]).astype({"year": int})


def build_base_activity(
    df_in: pd.DataFrame,
    base_year: int,
    activity_col: str,
    scenario_col: str | None = "scenario",
    dedup_policy: str = "prefer_DE",
) -> pd.DataFrame:
    """Compute one base-year activity value per country.

    Rows of ``df_in`` for ``base_year`` are kept, ``activity_col`` is coerced
    to numeric and rows without a usable value are dropped. When
    ``dedup_policy`` is ``"prefer_DE"`` and ``scenario_col`` exists, only rows
    of scenario "DE" are used, provided at least one such row exists. The
    remaining rows are averaged per country.

    Returns:
        DataFrame with columns ``country`` and ``activity_2019``.

    Raises:
        ValueError: if no row exists for ``base_year``, or if the global
            ``CHK_NONNEG`` check is on and a negative activity is found.
    """
    at_base = df_in.loc[df_in["year"] == base_year].copy()
    if at_base.empty:
        available = sorted(df_in["year"].unique().tolist())
        raise ValueError(f"Aucune ligne pour base_year={base_year}. Années dispo: {available[:10]}...")

    at_base[activity_col] = pd.to_numeric(at_base[activity_col], errors="coerce")
    at_base = at_base.dropna(subset=[activity_col])

    if CHK_NONNEG and (at_base[activity_col] < -1e-9).any():
        raise ValueError("Activité négative détectée en année de base.")

    # Anchor on the 'DE' scenario when the input carries several scenarios;
    # fall back to all rows if 'DE' is absent.
    use_de_anchor = bool(scenario_col) and scenario_col in at_base.columns and dedup_policy == "prefer_DE"
    if use_de_anchor:
        scenario_tag = at_base[scenario_col].astype(str).str.strip().str.upper()
        de_rows = at_base[scenario_tag == "DE"]
        if not de_rows.empty:
            at_base = de_rows

    return (
        at_base.groupby("country", as_index=False)[activity_col]
        .mean()
        .rename(columns={activity_col: "activity_2019"})
    )


# ---- linkage refinery -> index(t) ----
def _refinery_csv_path(
    refinery_outputs_base_dir: Path,
    refinery_scenario: str,
    refinery_file: str,
    scenario_dirs: dict[str, str] | None = None,
) -> Path:
    """
    Résout le chemin du CSV refinery pour un scenario logique (ex: 'more-molecule').

    Supporte 2 conventions :
    - défaut : <base_dir>/<refinery_scenario>/<refinery_file>
    - si mapping YAML fourni : <base_dir>/<scenario_dirs[refinery_scenario]>/<refinery_file>
    """
    if scenario_dirs and refinery_scenario in scenario_dirs:
        return refinery_outputs_base_dir / scenario_dirs[refinery_scenario] / refinery_file
    return refinery_outputs_base_dir / refinery_scenario / refinery_file


def read_refinery_activity(refinery_csv: Path, activity_col: str) -> pd.DataFrame:
    """Load a refinery output CSV and return total activity per country/scenario/year.

    Args:
        refinery_csv: path of the CSV to read.
        activity_col: name of the activity column to aggregate.

    Returns:
        DataFrame with columns ``country`` (stripped, upper-case), ``scenario``
        (stripped string), ``year`` (int) and ``activity_col``, with one row
        per (country, scenario, year). Multiple input rows for the same key
        are summed.

    Raises:
        ValueError: if one of the required columns is missing.
    """
    df = pd.read_csv(refinery_csv)
    needed = {"country", "scenario", "year", activity_col}
    missing = needed.difference(df.columns)
    if missing:
        raise ValueError(f"{refinery_csv}: colonnes manquantes {sorted(missing)}. Colonnes: {list(df.columns)}")

    out = df[["country", "scenario", "year", activity_col]].copy()
    out["country"] = out["country"].astype(str).str.strip().str.upper()
    out["scenario"] = out["scenario"].astype(str).str.strip()

    # Rows with an unparseable or missing year are dropped (same policy as
    # read_olefins_raw); casting NaN to int would otherwise raise.
    out["year"] = pd.to_numeric(out["year"], errors="coerce")
    out = out.dropna(subset=["year"])
    out["year"] = out["year"].astype(int)

    # Missing activity values count as zero in the aggregated total.
    out[activity_col] = pd.to_numeric(out[activity_col], errors="coerce").fillna(0.0)

    # If the file is broken down further (e.g. by technology), sum it up.
    out = out.groupby(["country", "scenario", "year"], as_index=False)[activity_col].sum()
    return out


def build_refinery_index(
    refinery_df: pd.DataFrame,
    refinery_scenario: str,
    activity_col: str,
    base_year: int,
    years: np.ndarray,
) -> pd.DataFrame:
    """Build, per country, a refinery activity index relative to ``base_year``.

    For each country of ``refinery_scenario``, the activity series is aligned
    on ``years``, gaps are interpolated (edges filled in both directions,
    remaining gaps set to 0.0) and the series is divided by its base-year
    value.

    The base value is read from the aligned series when ``base_year`` lies in
    ``years``; otherwise it is read from the raw input rows, so a base year
    outside the simulated range still anchors the index. When no usable base
    value exists (absent, zero or non-finite), the index is 1.0 for all years.

    Args:
        refinery_df: frame with ``country``, ``scenario``, ``year`` and
            ``activity_col`` columns, at most one row per key.
        refinery_scenario: value of ``scenario`` to select.
        activity_col: name of the activity column.
        base_year: year whose activity defines index = 1.
        years: years on which the index is produced.

    Returns:
        DataFrame with columns ``country``, ``year``, ``refinery_scenario``,
        ``refinery_activity`` and ``refinery_index``.

    Raises:
        ValueError: if the scenario has no rows, or if the global
            ``CHK_NONNEG`` check is on and a negative index is produced.
    """
    df = refinery_df[refinery_df["scenario"] == refinery_scenario].copy()
    if df.empty:
        raise ValueError(f"Aucune donnée refinery pour scenario='{refinery_scenario}'.")

    rows = []
    for c, g in df.groupby("country"):
        raw = g.set_index("year")[activity_col].sort_index()
        s = raw.reindex(years).interpolate(limit_direction="both").fillna(0.0)

        if base_year in s.index:
            base = float(s.loc[base_year])
        elif base_year in raw.index:
            # base_year is outside `years` but present in the data: use the
            # raw value instead of silently flattening the index to 1.0.
            base = float(raw.loc[base_year])
        else:
            base = 0.0

        if not np.isfinite(base) or abs(base) < 1e-12:
            # No usable anchor: keep activity flat rather than divide by ~0.
            idx = np.ones_like(years, dtype=float)
        else:
            idx = s.to_numpy() / base

        rows.append(pd.DataFrame({
            "country": c,
            "year": years,
            "refinery_scenario": refinery_scenario,
            "refinery_activity": s.to_numpy(),
            "refinery_index": idx,
        }))

    out = pd.concat(rows, ignore_index=True)
    if CHK_NONNEG and (out["refinery_index"] < -1e-9).any():
        raise ValueError("Index refinery négatif détecté.")
    return out


In [35]:

# =============================================================================
# 1) Read olefins input + base-year activity
# =============================================================================
df_ole = read_olefins_raw(OLEFINS_CSV)

# Country selection. A bare string is either the keyword "ALL" or a single
# country code; it must not be iterated character by character.
if isinstance(COUNTRIES_SEL, str):
    if COUNTRIES_SEL.strip().upper() == "ALL":
        COUNTRIES = sorted(df_ole.loc[df_ole["year"] == BASE_YEAR, "country"].unique().tolist())
    else:
        COUNTRIES = [COUNTRIES_SEL.strip().upper()]
else:
    # De-duplicate while keeping the configured order, so that a repeated code
    # does not produce duplicated output rows.
    COUNTRIES = list(dict.fromkeys(str(c).strip().upper() for c in COUNTRIES_SEL))

df_ole = df_ole[df_ole["country"].isin(COUNTRIES)].copy()

ACT_COL = detect_activity_column(df_ole, preferred=ACTIVITY_COL_CONF)
base = build_base_activity(df_ole, base_year=BASE_YEAR, activity_col=ACT_COL, scenario_col="scenario", dedup_policy="prefer_DE")

if base.empty:
    raise ValueError("Base olefins vide (après filtres).")

base_map = dict(zip(base["country"], base["activity_2019"]))

# Every selected country needs a base value; otherwise the scenario cell fails
# later with a bare KeyError on base_map.
missing_base = [c for c in COUNTRIES if c not in base_map]
if missing_base:
    raise ValueError(
        f"Pays sans activité de base exploitable pour base_year={BASE_YEAR}: {missing_base}."
    )


In [36]:
# =============================================================================
# 2) Refinery index (if enabled): one index per olefins scenario
# =============================================================================
refinery_index_by_ole_sc: dict[str, pd.DataFrame] = {}
refinery_index_map: dict[tuple[str, str, int], float] = {}

if LINK_ENABLED:
    ref_base = Path(LINK["refinery_outputs_base_dir"])
    ref_file = str(LINK.get("refinery_output_csv_name", "refinery.csv"))
    ref_act_col = str(LINK.get("refinery_activity_column", "refinery_output_total"))
    ref_base_year = int(LINK.get("base_year", BASE_YEAR))
    missing_policy = str(LINK.get("missing_country_policy", "keep_base")).strip()

    # Which refinery scenarios must be loaded?
    needed_ref_scenarios = sorted(set(MAP.keys()))
    if not needed_ref_scenarios:
        raise ValueError("scenario_mapping.refinery_to_olefins est vide : impossible de lier à refinery.")

    # An output scenario that no refinery scenario maps to has no index at all:
    # with the 'keep_base' policy it silently gets a flat activity, so say so.
    unmapped_ole_scenarios = sorted(set(SCEN_DIRS) - set(MAP.values()))
    if unmapped_ole_scenarios:
        print(f"[WARN] Scénarios olefins sans mapping refinery: {unmapped_ole_scenarios}")

    # Read one CSV per refinery scenario, then concatenate.
    ref_scenario_dirs = LINK.get("refinery_scenario_dirs", None)
    ref_frames = []
    for ref_sc in needed_ref_scenarios:
        pth = _refinery_csv_path(
            ref_base,
            refinery_scenario=ref_sc,
            refinery_file=ref_file,
            scenario_dirs=ref_scenario_dirs,
        )
        if not pth.exists():
            raise FileNotFoundError(f"Refinery CSV introuvable: {pth}")
        ref_frames.append(read_refinery_activity(pth, activity_col=ref_act_col))
    ref_all = pd.concat(ref_frames, ignore_index=True)

    # Index per olefins scenario (through the mapping).
    for ref_sc, ole_sc in MAP.items():
        idx_df = build_refinery_index(ref_all, refinery_scenario=ref_sc, activity_col=ref_act_col,
                                      base_year=ref_base_year, years=YEARS)
        refinery_index_by_ole_sc[ole_sc] = idx_df

        # Flat dict for fast lookups; zipping columns avoids the per-row Series
        # construction of iterrows().
        for country, year, value in zip(idx_df["country"], idx_df["year"], idx_df["refinery_index"]):
            refinery_index_map[(ole_sc, country, int(year))] = float(value)

    def get_index(ole_sc: str, country: str, year: int) -> float:
        """Return the refinery index for (olefins scenario, country, year).

        A key absent from the lookup map yields 1.0 under the 'keep_base'
        policy and raises ValueError under any other policy.
        """
        key = (ole_sc, country, int(year))
        if key in refinery_index_map:
            return refinery_index_map[key]
        if missing_policy == "keep_base":
            return 1.0
        raise ValueError(f"Pays manquant dans l'index refinery pour {ole_sc}/{country}/{year}.")

In [37]:
# =============================================================================
# 3) Scénarios olefins : activité(t) + intensité(t) => h2_demand(t)
# =============================================================================
def intensity_series(ole_sc: str) -> np.ndarray:
    """Return the H2 intensity for each year of ``YEARS`` under scenario ``ole_sc``.

    The intensity starts at ``H2_INT_2019`` and is scaled linearly, between
    ``INT_START`` and ``INT_END``, towards ``H2_INT_2019`` times the scenario
    factor found in ``INT_FACT_2050``.

    Raises:
        ValueError: if ``INT_FACT_2050`` has no factor for ``ole_sc``.
    """
    if ole_sc not in INT_FACT_2050:
        raise ValueError(f"Facteur intensité 2050 manquant pour scénario '{ole_sc}' dans YAML.")

    target_factor = float(INT_FACT_2050[ole_sc])
    ramp = linear_ramp(YEARS, INT_START, INT_END)
    # Multiplier goes from 1.0 (ramp = 0) to target_factor (ramp = 1).
    multiplier = 1.0 + ramp * (target_factor - 1.0)
    return H2_INT_2019 * multiplier


def activity_series(country: str, ole_sc: str) -> np.ndarray:
    """Return the olefins activity of ``country`` for each year of ``YEARS``.

    With linkage enabled, the base activity is multiplied by the refinery
    index returned by ``get_index``; a warning is printed when
    ``CHK_WARN_DIVERGE`` is on and the index exceeds 5. Without linkage, the
    base activity declines linearly between ``ACT_START`` and ``ACT_END`` by
    the share given in ``ACT_DECL_2050`` for the scenario.

    Raises:
        KeyError: if ``country`` has no entry in ``base_map``.
        ValueError: if linkage is off and ``ACT_DECL_2050`` lacks ``ole_sc``.
    """
    base_activity = float(base_map[country])

    if not LINK_ENABLED:
        # Standalone fallback when linkage is disabled.
        if ole_sc not in ACT_DECL_2050:
            raise ValueError(f"Decline 2050 manquant pour '{ole_sc}' (activity_trajectory_if_no_linkage).")
        decline_2050 = float(ACT_DECL_2050[ole_sc])
        ramp = linear_ramp(YEARS, ACT_START, ACT_END)
        factor = 1.0 - ramp * decline_2050
        return base_activity * factor

    idx = np.fromiter(
        (get_index(ole_sc, country, int(year)) for year in YEARS),
        dtype=float,
        count=len(YEARS),
    )
    if CHK_WARN_DIVERGE:
        # Simple audit warning: an exploding index is suspicious.
        peak = np.nanmax(idx)
        if peak > 5.0:
            print(f"[WARN] Index refinery très élevé pour {ole_sc}/{country}: max={peak:.2f}")
    return base_activity * idx


def _h2_demand_frame(country: str, ole_sc: str, h2_int: np.ndarray) -> pd.DataFrame:
    """Build the yearly output rows for one (country, scenario) pair."""
    act = activity_series(country, ole_sc)
    return pd.DataFrame({
        "country": country,
        "scenario": ole_sc,
        "year": YEARS,
        "olefins_activity": act,
        "h2_intensity": h2_int,
        "h2_demand_t_per_yr": act * h2_int,
        "scope": SCOPE_TAG,
    })


# One frame per (scenario, country); the intensity depends on the scenario
# only, so it is computed once per scenario.
rows = []
for ole_sc in SCEN_DIRS:
    h2_int = intensity_series(ole_sc)
    rows.extend(_h2_demand_frame(c, ole_sc, h2_int) for c in COUNTRIES)

df_out = pd.concat(rows, ignore_index=True)

In [38]:
# =============================================================================
# 4) Checks / invariants
# =============================================================================
# Each check can be switched off in the `checks` section of the YAML.

# The scope column must carry exactly one value: the configured tag.
if CHK_SCOPE and not (df_out["scope"].nunique() == 1 and df_out["scope"].iloc[0] == SCOPE_TAG):
    raise ValueError(f"Scope invalide : attendu '{SCOPE_TAG}'.")

# Small negative values within the 1e-9 tolerance are accepted.
if CHK_NONNEG and (df_out[["olefins_activity", "h2_demand_t_per_yr"]] < -1e-9).to_numpy().any():
    raise ValueError("Valeurs négatives détectées (activité ou demande H2).")

# Every (country, scenario) pair must cover all years of the range.
if CHK_COV:
    check_coverage(df_out, YEAR_START, YEAR_END, keys=("country", "scenario"))


In [39]:
# =============================================================================
# 5) Write to disk: one folder per scenario
# =============================================================================
OUT_BASE_DIR.mkdir(parents=True, exist_ok=True)

# Trace of the base-year activity per country.
base.sort_values("country").to_csv(OUT_BASE_DIR / "olefins_base_2019.csv", index=False)

# Trace of the refinery index (only when linkage is enabled).
if LINK_ENABLED:
    tagged_indexes = [
        idx_df.assign(olefins_scenario=scenario_name)
        for scenario_name, idx_df in refinery_index_by_ole_sc.items()
    ]
    idx_trace = pd.concat(tagged_indexes, ignore_index=True)
    idx_trace.to_csv(OUT_BASE_DIR / "olefins_refinery_index.csv", index=False)

# Scenario outputs: each scenario gets its own folder and CSV.
for ole_sc, out_dir in SCEN_DIRS.items():
    out_dir.mkdir(parents=True, exist_ok=True)
    df_sc = df_out.loc[df_out["scenario"] == ole_sc].copy()
    df_sc.to_csv(out_dir / OUT_NAME, index=False)

# Metadata (traceability of the run).
meta = dict(
    config_path=str(CONFIG_PATH),
    olefins_input_csv=str(OLEFINS_CSV),
    activity_column_used=ACT_COL,
    countries_n=len(COUNTRIES),
    years=dict(start=YEAR_START, end=YEAR_END),
    linkage_enabled=LINK_ENABLED,
    scope=SCOPE_TAG,
    scenario_dirs={name: str(folder) for name, folder in SCEN_DIRS.items()},
)
(OUT_BASE_DIR / "metadata.json").write_text(
    json.dumps(meta, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

# Summary of what was written.
print("OK: scénarios olefins écrits dans", OUT_BASE_DIR.resolve())
for ole_sc, out_dir in SCEN_DIRS.items():
    print(" -", ole_sc, "->", (out_dir / OUT_NAME).resolve())

OK: scénarios olefins écrits dans /Users/simonbrigode/Desktop/tp_pommes_kraft/data_country/industry/pommes-h2-network/data-pommes/scenarios-olefins
 - bau-eu -> /Users/simonbrigode/Desktop/tp_pommes_kraft/data_country/industry/pommes-h2-network/data-pommes/scenarios-olefins/bau-eu/olefins.csv
 - decarbo-eu -> /Users/simonbrigode/Desktop/tp_pommes_kraft/data_country/industry/pommes-h2-network/data-pommes/scenarios-olefins/decarbo-eu/olefins.csv
