# In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# ============ Configuration ============
# Project table written by the previous step; expected to contain the
# DC_ATTR_* flags, the teac_cost column and the zone allocation columns.
PATH_PROJECTS  = Path("./load and costs/PJM/projects_with_dc_flags.xlsx")
# Sheet of the project workbook to read: a 0-based position or a sheet name.
SHEET_PROJECTS = 0
# RU/DC share workbook: one region per sheet, each sheet holding RU and DC
# rows with one column per year.
PATH_RUDC_WB  = Path("./rider/PJM/00_PJM_share.xlsx")
# Output workbook that receives the tables produced by this script.
OUT_PATH      = Path("./rider/PJM/sankey_inputs.xlsx")
# ====================================================

# The zone allocation columns are pinned to Excel columns C..AB; they are
# selected by position and never auto-detected from the header names.
# C is the 3rd column and AB the 28th, i.e. 0-based positions 2..27, and a
# Python slice excludes its stop value, hence slice(2, 28).
ZONE_COL_SLICE = slice(2, 28)  # [2, 28)

def _norm(s):
    """Return ``str(s)`` trimmed and upper-cased, with inner newlines/tabs turned into spaces."""
    # Leading/trailing whitespace is stripped first, so only newlines and tabs
    # that sit inside the text are left to be mapped to a single space each.
    whitespace_to_space = str.maketrans({"\n": " ", "\t": " "})
    return str(s).strip().upper().translate(whitespace_to_space)

def find_col(df, candidates):
    """Return the first column of *df* that loosely matches one of *candidates*.

    Matching ignores case, all whitespace and underscores, so ``"TEAC cost"``,
    ``"teac_cost"`` and ``"TeacCost"`` are treated as the same name.
    (Only for year/owner/teac_cost; allocation columns are fixed to C..AB.)

    Args:
        df: DataFrame whose column labels are searched. Labels may be of any
            type (e.g. integer year headers); they are compared via ``str()``.
        candidates: Candidate names in priority order; the first one that
            matches a column wins.

    Returns:
        The original column label from ``df.columns`` (not the normalised key).

    Raises:
        KeyError: If none of the candidates matches any column.
    """
    def _key(label):
        # str() first: non-string labels have no .lower() and would otherwise
        # abort the lookup even when the wanted column is present.
        return ''.join(str(label).lower().split()).replace('_', '')

    # If several columns normalise to the same key, the right-most one wins.
    keymap = {_key(c): c for c in df.columns}
    for cand in candidates:
        k = _key(cand)
        if k in keymap:
            return keymap[k]
    raise KeyError(f"Column name not found; candidates={candidates}; available columns={list(df.columns)}")

def sum_alloc_by_zone_year(df_alloc, years, zone_cols, zone_map, value_name):
    """Total per-row zone allocations into one row per (zone, year).

    The rows are summed per year while the data is still wide (one column per
    zone) and only then melted to long format, which avoids a many-to-many
    blow-up.

    Args:
        df_alloc: Wide frame holding one allocation column per entry of *zone_cols*.
        years: Series with the year of each row of *df_alloc*, aligned by position.
        zone_cols: Column labels of *df_alloc* to aggregate.
        zone_map: Mapping from column label to zone name.
        value_name: Name of the value column in the result.

    Returns:
        DataFrame with columns ``zone``, ``year`` and *value_name*. Columns that
        map to the same zone are added together.
    """
    per_year = (
        df_alloc.assign(year=years.values)
        .groupby("year", as_index=False)[zone_cols]
        .sum()
    )
    melted = per_year.melt(id_vars=["year"], var_name="zone_col", value_name=value_name)
    melted["zone"] = melted["zone_col"].map(zone_map)
    return melted.groupby(["zone", "year"], as_index=False)[value_name].sum()

# ---------- 0) Read RU/DC Allocation Shares ----------
# sheet_name=None loads every sheet into a {sheet name: DataFrame} dict; the
# first column of each sheet becomes the row index and the first row the header.
rudc = pd.read_excel(PATH_RUDC_WB, sheet_name=None, header=0, index_col=0)
rows = []
for zname, df in rudc.items():
    if df is None or df.empty:
        continue
    # Normalised row label -> row position. Looking rows up by position also
    # copes with repeated labels (the last occurrence wins).
    row_pos = {_norm(label): pos for pos, label in enumerate(df.index)}
    if "RU" not in row_pos or "DC" not in row_pos:
        # Sheets without both an RU and a DC row are not share sheets.
        continue
    # Columns should be years. Parse the headers leniently so that a
    # float-formatted year ("2020.0") is accepted and a stray column
    # ("Unnamed: 3", notes, ...) is reported instead of aborting the whole run.
    header_num = pd.to_numeric(
        pd.Series([str(c).strip() for c in df.columns], dtype="object"),
        errors="coerce",
    )
    is_year = (header_num.notna() & (header_num % 1 == 0)).to_numpy()
    skipped = [c for c, ok in zip(df.columns, is_year) if not ok]
    if skipped:
        print(f"[WARN][{zname}] Ignoring non-year columns in RU/DC sheet: {skipped}")
    if not is_year.any():
        continue
    years = header_num[is_year].astype(int).tolist()
    ru_vals = df.iloc[row_pos["RU"]].to_numpy()[is_year]
    dc_vals = df.iloc[row_pos["DC"]].to_numpy()[is_year]
    rows.append(pd.DataFrame({
        "zone": _norm(zname),
        "year": years,
        # Non-numeric cells become NaN here and are zero-filled below.
        "RU_share": pd.to_numeric(ru_vals, errors="coerce"),
        "DC_share": pd.to_numeric(dc_vals, errors="coerce"),
    }))
if not rows:
    raise RuntimeError("No regions parsed from RU/DC workbook (Each sheet must contain RU/DC rows with years as columns).")

zone_shares = pd.concat(rows, ignore_index=True).fillna(0.0)
zone_shares["zone"] = zone_shares["zone"].map(_norm)
zone_shares["year"] = zone_shares["year"].astype(int)

# (zone, year) must be unique: the shares are later used as the "one" side of
# an m:1 merge. Fail here with a readable message rather than further down.
dup_mask = zone_shares.duplicated(["zone", "year"])
if dup_mask.any():
    dup_zones = sorted(set(zone_shares.loc[dup_mask, "zone"]))
    raise RuntimeError(
        f"Duplicate (zone, year) entries in RU/DC workbook for zones {dup_zones}; "
        "sheet names must be unique after normalisation."
    )

# ---------- 1) Read Project Table (Fixed C..AB for allocation columns) ----------
proj = pd.read_excel(PATH_PROJECTS, sheet_name=SHEET_PROJECTS)

# Column validation: Must have at least up to column AB
if proj.shape[1] < ZONE_COL_SLICE.stop:
    raise RuntimeError(f"Insufficient columns in project table: Need at least {ZONE_COL_SLICE.stop} columns for C..AB allocation, but only found {proj.shape[1]}.")

# —— Fixed Allocation Columns: C..AB ——
# Taken by position before any derived column is written. Derived columns
# either overwrite an existing column in place or are appended after AB, so
# the positions 2..27 are the same before and after.
zone_cols = list(proj.columns[ZONE_COL_SLICE])
zone_map_from_col = {c: _norm(c) for c in zone_cols}

owner_col = find_col(proj, ["owner_mapped", "owner mapped", "owner"])
year_col  = find_col(proj, ["year", "年份"])
cost_col  = find_col(proj, ["teac_cost", "teac cost", "cost", "TEAC cost"])

proj = proj.copy()
# NOTE(review): astype(str) turns a missing owner into the literal "NAN"
# region, so the dropna() used for ALL_OWNERS below never removes anything —
# confirm whether missing owners should be kept as a region.
proj[owner_col] = proj[owner_col].astype(str).map(_norm)
# Unparseable years become <NA>; such rows are excluded from ALL_YEARS below.
proj["year"]    = pd.to_numeric(proj[year_col], errors="coerce").astype("Int64")
proj["teac_cost"] = pd.to_numeric(proj[cost_col], errors="coerce").fillna(0.0)

# Attribution Scenario Columns (0/1), Conservative capture: Name contains DC_ATTR
attr_cols = [c for c in proj.columns if "DC_ATTR" in str(c).upper()]
if not attr_cols:
    # Fallback: If no DC_ATTR_*, try to capture columns containing only 0/1.
    # Allocation columns and the owner/year/cost columns are skipped: an
    # all-zero zone column or an all-zero cost column would otherwise be
    # mistaken for an attribution scenario.
    reserved_cols = set(zone_cols) | {owner_col, year_col, cost_col, "year", "teac_cost"}
    for c in proj.columns:
        if c in reserved_cols:
            continue
        s = pd.to_numeric(proj[c], errors="coerce").dropna()
        if s.empty:
            continue
        if s.isin([0,1]).all():
            attr_cols.append(c)
if not attr_cols:
    raise RuntimeError("No attribution scenario columns identified (DC_ATTR_* or all 0/1 columns).")

# Full sets for zero-filling
ALL_YEARS  = sorted(pd.to_numeric(proj["year"].dropna()).astype(int).unique().tolist())
ALL_ZONES  = sorted(set(zone_map_from_col.values()) | set(zone_shares["zone"].unique()))
ALL_OWNERS = sorted(proj[owner_col].dropna().unique().tolist())

# Zones that receive allocations but have no RU/DC sheet get 0 for both shares
# in the padding below, so their cost is not passed on to RU/DC. Make that visible.
zones_without_shares = sorted(set(zone_map_from_col.values()) - set(zone_shares["zone"].unique()))
if zones_without_shares:
    print(f"[WARN] No RU/DC shares found for allocation zones (shares set to 0): {zones_without_shares}")

# Pad RU/DC shares with 0 for (zone, year)
idx_sh = pd.MultiIndex.from_product([ALL_ZONES, ALL_YEARS], names=["zone","year"])
zone_shares = (zone_shares.set_index(["zone","year"])[["RU_share","DC_share"]]
               .reindex(idx_sh, fill_value=0.0).reset_index())

owner_tabs, zone_tabs, user_tabs, links1_tabs, links2_tabs, diag_tabs = [], [], [], [], [], []

# ---------- 2) Main Process (Per Scenario) ----------
# Absolute tolerance shared by the row-level and the annual conservation checks.
CONSERVATION_TOL = 1e-6

# ---- Scenario-independent inputs, computed once instead of once per scenario ----
# Full (key, year) grids used to zero-fill every output table.
idx_oy = pd.MultiIndex.from_product([ALL_OWNERS, ALL_YEARS], names=["region","year"])
idx_zy = pd.MultiIndex.from_product([ALL_ZONES, ALL_YEARS], names=["zone","year"])
idx_uy = idx_zy  # the user table uses the same (zone, year) grid

# Zone weights (Row-wise normalization; they depend only on the allocation columns)
shares_raw = proj[zone_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0).copy()

# Row-wise normalization (approx 100 treated as percentage; otherwise sum and normalize; keep 0 if sum is 0)
row_sum = shares_raw.sum(axis=1)
is_pct  = (row_sum >= 90.0) & (row_sum <= 110.0)          # Approx 100 -> Percentage
need_norm = (row_sum > 0) & (~np.isclose(row_sum, 1.0)) & (~is_pct)
norm_factor = pd.Series(1.0, index=shares_raw.index)
norm_factor[is_pct] = 100.0
norm_factor[need_norm] = row_sum[need_norm]
# NOTE(review): negative entries are clipped only after the division, so a row
# containing negative shares no longer sums to 1 — confirm this is intended.
shares = shares_raw.div(norm_factor, axis=0).clip(lower=0.0)

cost = proj["teac_cost"].values

for attr_col in attr_cols:
    scenario = str(attr_col)

    # Table B: Owner x Year Initial Cost (0/1 Attribution)
    # Missing/unparseable flags count as 0; values are truncated to int and clipped to {0, 1}.
    is_dc = pd.to_numeric(proj[attr_col], errors="coerce").fillna(0).astype(int).clip(0,1).values
    proj["cost_dc_attr_init"] = proj["teac_cost"] * (is_dc == 1)
    proj["cost_non_dc_init"]  = proj["teac_cost"] * (is_dc == 0)

    # dropna=True: rows without a usable year are left out of the owner table.
    g = proj.groupby([owner_col, "year"], dropna=True)
    owner_init = (g[["cost_dc_attr_init","cost_non_dc_init"]].sum()
                    .rename(columns={"cost_dc_attr_init":"cost_dc_attr",
                                     "cost_non_dc_init":"cost_non_dc"})
                    .reset_index().rename(columns={owner_col:"region"}))

    # Zero-fill full set (owner, year)
    owner_init = (owner_init.set_index(["region","year"])[["cost_dc_attr","cost_non_dc"]]
                    .reindex(idx_oy, fill_value=0.0).reset_index())
    owner_init["scenario"] = scenario
    owner_tabs.append(owner_init)

    # Table C: Zone x Year Cost (Cost * 0/1 * Weight; Aggregate first then merge to ensure conservation)
    dc_cost = cost * (is_dc == 1)
    nd_cost = cost * (is_dc == 0)
    dc_alloc = shares.mul(dc_cost, axis=0)
    nd_alloc = shares.mul(nd_cost, axis=0)

    # Row-level Conservation Check (Critical): the allocated amounts of each
    # row should add up to that row's cost.
    dc_row_err = np.abs(dc_alloc.sum(axis=1).values - dc_cost)
    nd_row_err = np.abs(nd_alloc.sum(axis=1).values - nd_cost)
    row_err = dc_row_err + nd_row_err
    max_row_err = float(max(dc_row_err.max(initial=0.0), nd_row_err.max(initial=0.0)))
    if max_row_err > CONSERVATION_TOL:
        # Report the result of the check instead of discarding it; the
        # per-row details are written to the Diagnostics table below.
        n_bad_rows = int((row_err > CONSERVATION_TOL).sum())
        print(f"[WARN][{scenario}] Row-level Conservation: {n_bad_rows} row(s) off, Max Row Err = {max_row_err:.6f}")

    diag = pd.DataFrame({
        "scenario": scenario,
        "row_index": np.arange(len(proj)),
        "year": proj["year"].values,
        "teac_cost": proj["teac_cost"].values,
        "sum_before": row_sum.values,
        "treated_as_percent": is_pct.astype(int).values,
        "norm_factor": norm_factor.values,
        "sum_after": shares.sum(axis=1).values,
        "is_dc": is_dc,
        "row_err": row_err
    })
    diag_tabs.append(diag)

    # Aggregate by year in wide format first, then melt, avoiding many-to-many amplification
    dc_zone_year = sum_alloc_by_zone_year(dc_alloc, proj["year"], zone_cols, zone_map_from_col, "cost_dc_attr")
    nd_zone_year = sum_alloc_by_zone_year(nd_alloc, proj["year"], zone_cols, zone_map_from_col, "cost_non_dc")
    zone_year = dc_zone_year.merge(nd_zone_year, on=["zone","year"], how="outer").fillna(0.0)

    # Zero-fill full set (zone, year)
    zone_year = (zone_year.set_index(["zone","year"])[["cost_dc_attr","cost_non_dc"]]
                    .reindex(idx_zy, fill_value=0.0).reset_index())
    zone_year["scenario"] = scenario
    zone_tabs.append(zone_year)

    # Annual Conservation Check (B vs C)
    b_tot = owner_init.groupby("year")[["cost_dc_attr","cost_non_dc"]].sum().sort_index()
    c_tot = zone_year.groupby("year")[["cost_dc_attr","cost_non_dc"]].sum().sort_index()
    diff = (c_tot - b_tot).abs()
    max_diff_year = float(diff.values.max()) if not diff.empty else 0.0
    if max_diff_year > CONSERVATION_TOL:
        print(f"[WARN][{scenario}] B <-> C Annual Conservation Max Diff = {max_diff_year:.6f}")

    # Table D: Zone -> User (Allocated by final RU/DC shares)
    # validate="m:1" makes pandas raise if zone_shares is not unique per (zone, year).
    zjoin = zone_year.merge(zone_shares, on=["zone","year"], how="left", validate="m:1").fillna(0.0)
    zjoin["ru_from_dc_growth"]  = zjoin["cost_dc_attr"] * zjoin["RU_share"]
    zjoin["dc_from_dc_growth"]  = zjoin["cost_dc_attr"] * zjoin["DC_share"]
    zjoin["ru_from_non_dc"]     = zjoin["cost_non_dc"]  * zjoin["RU_share"]
    zjoin["dc_from_non_dc"]     = zjoin["cost_non_dc"]  * zjoin["DC_share"]
    zjoin["ru_total"] = zjoin["ru_from_dc_growth"] + zjoin["ru_from_non_dc"]
    zjoin["dc_total"] = zjoin["dc_from_dc_growth"] + zjoin["dc_from_non_dc"]

    user_tbl = zjoin[["zone","year",
                      "ru_from_dc_growth","ru_from_non_dc","ru_total",
                      "dc_from_dc_growth","dc_from_non_dc","dc_total"]].copy()
    user_tbl = (user_tbl.set_index(["zone","year"])
                .reindex(idx_uy, fill_value=0.0).reset_index())
    user_tbl["scenario"] = scenario
    user_tabs.append(user_tbl)

    # Two-layer Sankey links
    # Layer 1: cost source (DC_growth / NonDC) -> zone.
    links1_tabs.append(pd.concat([
        zone_year.assign(source="DC_growth", target=zone_year["zone"], value=zone_year["cost_dc_attr"])[["year","source","target","value"]].assign(scenario=scenario),
        zone_year.assign(source="NonDC",     target=zone_year["zone"], value=zone_year["cost_non_dc"])[["year","source","target","value"]].assign(scenario=scenario),
    ], ignore_index=True))
    # Layer 2: zone -> user class (RU / DC).
    l2_ru = user_tbl.assign(source=user_tbl["zone"], target="RU", value=user_tbl["ru_total"])[["year","source","target","value"]].assign(scenario=scenario)
    l2_dc = user_tbl.assign(source=user_tbl["zone"], target="DC", value=user_tbl["dc_total"])[["year","source","target","value"]].assign(scenario=scenario)
    links2_tabs.append(pd.concat([l2_ru, l2_dc], ignore_index=True))

# ---------- 3) Export ----------
# Stack the per-scenario tables into one frame per output sheet.
owner_all, zone_all, user_all, links1_all, links2_all, diag_all = (
    pd.concat(tables, ignore_index=True)
    for tables in (owner_tabs, zone_tabs, user_tabs, links1_tabs, links2_tabs, diag_tabs)
)

# Meta info sheet, explicitly stating: Allocation columns used C..AB
meta_items = {
    "n_rows_proj": len(proj),
    "n_zone_cols": len(zone_cols),
    "zone_col_slice": "C..AB (pandas slice 2:28)",
    "attr_cols": str(attr_cols),
}
meta = pd.DataFrame({"key": list(meta_items), "value": list(meta_items.values())})

# Sheet name -> frame, listed in the order the sheets are written.
sheets = {
    "B_owner_init_costs": owner_all,
    "C_zone_alloc_costs": zone_all,
    "D_zone_user_costs": user_all,
    "L1_links_src_to_zone": links1_all,
    "L2_links_zone_to_user": links2_all,
    "Diagnostics": diag_all,
    "Meta": meta,
}
with pd.ExcelWriter(OUT_PATH, engine="openpyxl") as w:
    for sheet_name, frame in sheets.items():
        frame.to_excel(w, index=False, sheet_name=sheet_name)

print(f"Done! Saved to: {OUT_PATH.resolve()}")