In [4]:
import pandas as pd
from pathlib import Path

# ===================== Configuration Area =====================
# Input workbooks (paths are relative to the working directory).
PATH_INVEST = "./load and costs/CAISO/CAISO_cost.xlsx"         # Table 1: one row per year with the total cost (read by read_table1)
PATH_GROWTH = "./load and costs/CAISO/22_load_compare.xlsx"    # Table 2: one sheet per region; the sheet name becomes the Region label
PATH_SPLIT  = "./rider/CAISO/00_CAISO_share.xlsx"              # Table 3: one sheet per region; rows are matched via KW_RU / KW_DC, columns are years

# Table 1 column names. If a name is absent, read_table1 falls back to the
# first column (year) / second column (cost).
COL_YEAR_1  = "Year"
COL_COST_1  = "Cost"

# Table 2 column names (looked up in each region's sheet).
# The candidate lists are tried in order and must match a header exactly.
COL_YEAR_2      = None   # None -> the first column of the sheet is used as Year
COL_RES_DELTA   = ["RES DELTA", "Residential Delta", "RES_DELTA", "RES"]  # Residential growth; a negative forward mean is clipped to 0 and its magnitude is added to DC
COL_DC_DELTA    = ["DC DELTA", "DC delta", "DC_DELTA", "DC"]              # DC growth

# Special row handling (Table 2)
YEARS_TO_DROP = [2019]           # Rows with these years are removed from every Table 2 sheet before averaging
DROP_ROWS_IF_ALL_ZERO = True     # NOTE(review): in the visible part of this file the flag is only referenced from commented-out code, so it currently has no effect

# Keywords used to find the RU / DC rows in Table 3
# (case-insensitive substring match against the first column of each sheet).
KW_RU = ["RU", "Residential"]
KW_DC = ["DC"]

# Lookahead window
LOOKAHEAD_YEARS = 5              # Window length (in rows/years) of the forward rolling mean; also embedded in output file names

# Output directory; created on execution if it does not exist yet.
OUTDIR = Path("./rider/CAISO/")
OUTDIR.mkdir(parents=True, exist_ok=True)
# ===================================================

def _find_first_existing_col(df, candidates):
    """
    Resolve a column name in `df` from one or more candidate names.

    - `candidates is None`: return the first column of `df`.
    - `candidates` is a string: return it if it is a column, else None.
    - otherwise: return the first candidate (in order) that is a column,
      or None when none of them is present.
    """
    columns = df.columns
    if candidates is None:
        return columns[0]
    # Treat a single name like a one-element candidate list.
    names = [candidates] if isinstance(candidates, str) else candidates
    return next((name for name in names if name in columns), None)

def _normalize_year_series(s):
    """Convert a year-like series to nullable integers; values that cannot be parsed become <NA>."""
    numeric_years = pd.to_numeric(s, errors="coerce")
    return numeric_years.astype("Int64")

def read_table1(path):
    """
    Read Table 1 (total cost per year) from an Excel workbook.

    The year column is COL_YEAR_1 when present, otherwise the first column;
    the cost column is COL_COST_1 when present, otherwise the second column.

    Returns a DataFrame with columns ["Year", "TotalCost"]; Year is a plain
    int. Rows whose year or cost cannot be parsed as a number are dropped.

    Raises:
        ValueError: if the sheet has fewer than two columns, or if the year
            and cost columns resolve to the same column.
    """
    df = pd.read_excel(path)
    if df.shape[1] < 2:
        raise ValueError(
            f"[{path}] expected at least a year and a cost column. Existing columns: {df.columns.tolist()}"
        )

    ycol = COL_YEAR_1 if COL_YEAR_1 in df.columns else df.columns[0]
    ccol = COL_COST_1 if COL_COST_1 in df.columns else df.columns[1]
    if ycol == ccol:
        # Happens when the year header is missing and the cost column comes first.
        raise ValueError(
            f"[{path}] year and cost resolve to the same column {ycol!r}. Existing columns: {df.columns.tolist()}"
        )

    df = df.rename(columns={ycol: "Year", ccol: "TotalCost"})
    df["Year"] = _normalize_year_series(df["Year"])
    # Coerce cost as well: text cells would otherwise survive dropna and
    # break the cost arithmetic downstream.
    df["TotalCost"] = pd.to_numeric(df["TotalCost"], errors="coerce")
    df = df.dropna(subset=["Year", "TotalCost"]).copy()
    df["Year"] = df["Year"].astype(int)
    return df[["Year", "TotalCost"]]

def read_table2_all_regions_raw(path):
    """
    Read Table 2 as a long format (Region, Year, RES_raw, DC_raw).

    Every sheet of the workbook is one region; the sheet name is used as the
    Region label. Rows without a parseable year and rows whose year is in
    YEARS_TO_DROP are removed. Non-numeric or missing RES/DC cells become 0.0.
    **No sign adjustment or zero-row dropping here** (to avoid affecting
    forward averaging).

    Raises:
        ValueError: if a sheet has no column matching COL_RES_DELTA or
            COL_DC_DELTA.
    """
    out = []
    # The context manager closes the workbook handle even when a sheet
    # fails validation part-way through.
    with pd.ExcelFile(path) as xl:
        for sheet in xl.sheet_names:
            df = xl.parse(sheet)

            # Year column
            ycol = _find_first_existing_col(df, COL_YEAR_2) or df.columns[0]
            df = df.rename(columns={ycol: "Year"})
            df["Year"] = _normalize_year_series(df["Year"])
            df = df.dropna(subset=["Year"]).copy()
            df["Year"] = df["Year"].astype(int)

            # Skip specified years (e.g., 2019)
            if YEARS_TO_DROP:
                df = df[~df["Year"].isin(YEARS_TO_DROP)]

            # Capture raw RES / DC columns
            res_col = _find_first_existing_col(df, COL_RES_DELTA)
            dc_col  = _find_first_existing_col(df, COL_DC_DELTA)
            if res_col is None or dc_col is None:
                raise ValueError(f"[{sheet}] RES/DC columns not found, please check column names. Existing columns: {df.columns.tolist()}")

            part = pd.DataFrame({
                "Region": sheet,
                "Year": df["Year"].values,
                "RES_raw": pd.to_numeric(df[res_col], errors="coerce").fillna(0.0).values,
                "DC_raw":  pd.to_numeric(df[dc_col],  errors="coerce").fillna(0.0).values
            })
            out.append(part)

    full = pd.concat(out, ignore_index=True)
    return full  # cols: Region, Year, RES_raw, DC_raw

def read_table3_splits(path):
    """
    Read Table 3 (RU/DC allocation shares) into long format.

    Every sheet is one region. The first column holds the row labels; the RU
    and DC rows are the first rows whose label contains one of KW_RU / KW_DC
    (case-insensitive plain substring match, keywords tried in order).
    Columns whose header parses as an integer in 1900..2100 are the years.

    Returns a DataFrame with columns Region, Year, RU_share, DC_share.
    A missing RU or DC row, or a non-numeric cell, yields a share of 0.0.

    Raises:
        ValueError: if a sheet has neither an RU nor a DC row, or has no
            year-like columns.
    """
    def _first_matching_row(labels, keywords):
        """Return the position of the first label containing a keyword, or None."""
        for kw in keywords:
            needle = kw.lower()
            for pos, label in enumerate(labels):
                if needle in label:
                    return pos
        return None

    def _is_year_label(col):
        """Return True when the column header reads as an integer year in 1900..2100."""
        try:
            return 1900 <= int(str(col)) <= 2100
        except (TypeError, ValueError):
            return False

    long_rows = []
    # The context manager closes the workbook handle even when a sheet
    # fails validation part-way through.
    with pd.ExcelFile(path) as xl:
        for sheet in xl.sheet_names:
            df = xl.parse(sheet)
            df = df.rename(columns={df.columns[0]: "index"})
            df = df.set_index("index")

            # Normalise labels once. Blank labels never match, and going
            # through str() keeps this working for non-string labels.
            labels = ["" if pd.isna(x) else str(x).lower() for x in df.index]
            i_ru = _first_matching_row(labels, KW_RU)
            i_dc = _first_matching_row(labels, KW_DC)
            if i_ru is None and i_dc is None:
                raise ValueError(f"[{sheet}] RU/DC rows not found; please adjust KW_RU/KW_DC. Index={df.index.tolist()}")

            # Keep only year-like columns
            year_cols = [c for c in df.columns if _is_year_label(c)]
            if not year_cols:
                raise ValueError(f"[{sheet}] No year columns found; columns={df.columns.tolist()}")

            # Select rows by position so a duplicated label still yields
            # exactly one row.
            if i_ru is not None:
                ru_s = pd.to_numeric(df.iloc[i_ru][year_cols], errors="coerce")
            else:
                ru_s = pd.Series(index=year_cols, dtype=float)
            if i_dc is not None:
                dc_s = pd.to_numeric(df.iloc[i_dc][year_cols], errors="coerce")
            else:
                dc_s = pd.Series(index=year_cols, dtype=float)

            tmp = pd.DataFrame({
                "Region": sheet,
                "Year": [int(y) for y in year_cols],
                "RU_share": ru_s.values,
                "DC_share": dc_s.values
            })
            long_rows.append(tmp)

    long_df = pd.concat(long_rows, ignore_index=True)
    long_df[["RU_share", "DC_share"]] = long_df[["RU_share", "DC_share"]].fillna(0.0)
    return long_df  # cols: Region, Year, RU_share, DC_share

def forward_mean(series: pd.Series, window: int) -> pd.Series:
    """
    Forward-looking rolling mean over `window` consecutive entries.

    The series is sorted by index first. Each output value is the mean of
    the entry itself and up to `window - 1` entries after it; near the end
    of the series the mean is taken over the entries that remain. Signs are
    left untouched (clipping/merging happens later).

    Implemented as a trailing rolling mean on the reversed series.
    """
    ordered = series.sort_index(ascending=True)
    newest_first = ordered.iloc[::-1]
    trailing = newest_first.rolling(window=window, min_periods=1).mean()
    oldest_first = trailing.iloc[::-1]
    return oldest_first.reindex(ordered.index)

# ==================== Main Process ====================

# Load Table 1 (total cost per year) and Table 2 (raw growth per region/year).
inv_df = read_table1(PATH_INVEST)  # Year, TotalCost
growth_raw = read_table2_all_regions_raw(PATH_GROWTH)  # Region, Year, RES_raw, DC_raw

# Only years present in BOTH tables are processed; the forward mean below
# is computed inside this set for every region.
invest_years = set(inv_df["Year"].astype(int))
growth_years = set(growth_raw["Year"].astype(int))
years = sorted(invest_years & growth_years)
inv_df = inv_df.loc[inv_df["Year"].isin(years)].copy()

regions = sorted(growth_raw["Region"].unique())

# === Forward mean of the raw growth, per region ===
def _raw_growth_wide(value_col):
    """Pivot one raw growth column to Year x Region, restricted to `years`, gaps filled with 0."""
    wide = growth_raw.pivot(index="Year", columns="Region", values=value_col)
    return wide.reindex(years).sort_index().fillna(0.0)


def _forward_mean_by_region(wide):
    """Apply forward_mean over LOOKAHEAD_YEARS to every region column of `wide`."""
    averaged = pd.DataFrame(index=years, columns=regions, dtype=float)
    for region in regions:
        averaged[region] = forward_mean(wide[region], LOOKAHEAD_YEARS)
    return averaged


res_wide_raw = _raw_growth_wide("RES_raw")
dc_wide_raw = _raw_growth_wide("DC_raw")

res_fw_raw = _forward_mean_by_region(res_wide_raw)
dc_fw_raw = _forward_mean_by_region(dc_wide_raw)

# === Sign adjustment, applied AFTER averaging ===
# A negative averaged RES value is set to 0 and its magnitude is added to DC.
res_fw_pos = res_fw_raw.clip(lower=0.0)
negative_res_magnitude = res_fw_raw.clip(upper=0.0).abs()
dc_fw_adj = dc_fw_raw + negative_res_magnitude

# Rows where RES and DC are both 0 after averaging are kept as they are.
# The snippet that used to be commented out here (guarded by
# DROP_ROWS_IF_ALL_ZERO) only wrote 0.0 into cells that were already 0.0.

# Back to long format (Region, Year, RES_pos, DC) for the allocation steps.
res_long = res_fw_pos.stack().rename("RES_pos").to_frame()
dc_long = dc_fw_adj.stack().rename("DC")
growth_fw = res_long.join(dc_long).reset_index()
growth_fw = growth_fw.rename(columns={"level_0":"Year","level_1":"Region"})
growth_fw = growth_fw[["Region","Year","RES_pos","DC"]].sort_values(["Year","Region"])

# Read Table 3: RU/DC allocation splits
splits = read_table3_splits(PATH_SPLIT)  # Region, Year, RU_share, DC_share

# 1) Allocate annual investment to regions based on "(RES_pos+DC) share" (after averaging)
growth_fw["TotalGrow"] = growth_fw["RES_pos"] + growth_fw["DC"]

year_den = growth_fw.groupby("Year", as_index=False)["TotalGrow"].sum().rename(columns={"TotalGrow":"YearGrowSum"})
g2 = growth_fw.merge(year_den, on="Year", how="left")
# Mask zero denominators with NaN. Series.where keeps the column float64,
# whereas replacing with pd.NA can turn it into an object column.
g2["YearGrowSum"] = g2["YearGrowSum"].where(g2["YearGrowSum"] != 0)

# Each year must appear once in Table 1; a duplicated year would silently
# duplicate region rows and double-count that year's cost.
g2 = g2.merge(inv_df, on="Year", how="left", validate="many_to_one")

# Regional Total Cost (0 for years whose total growth is zero)
g2["RegionCost"] = (g2["TotalGrow"] / g2["YearGrowSum"]) * g2["TotalCost"]
g2["RegionCost"] = g2["RegionCost"].fillna(0.0)

# 2) Intra-region split (using averaged shares)
g2["inner_den"] = g2["RES_pos"] + g2["DC"]
g2["inner_den"] = g2["inner_den"].where(g2["inner_den"] != 0)

g2["RegionCost_DCgrowth"]  = (g2["DC"]      / g2["inner_den"]) * g2["RegionCost"]
g2["RegionCost_RESgrowth"] = (g2["RES_pos"] / g2["inner_den"]) * g2["RegionCost"]
# Regions with no growth at all get 0 for both parts.
g2[["RegionCost_DCgrowth","RegionCost_RESgrowth"]] = g2[["RegionCost_DCgrowth","RegionCost_RESgrowth"]].fillna(0.0)

# 3) Level 1 summary (CAISO-wide): per-year sums of the regional cost columns.
level1_sources = {
    "DC_growth_cost": "RegionCost_DCgrowth",
    "NonDC_growth_cost": "RegionCost_RESgrowth",
    "Total": "RegionCost",
}
level1 = g2.groupby("Year", as_index=False).agg(
    **{out_col: (src_col, "sum") for out_col, src_col in level1_sources.items()}
)
level1.to_csv(OUTDIR/f"caiso_level1_totals_fw{LOOKAHEAD_YEARS}y.csv", index=False)

# 4) Level 2 summary (by region): one row per (Year, Region).
level2_cols = ["Year","Region","RegionCost","RegionCost_DCgrowth","RegionCost_RESgrowth"]
level2 = g2.loc[:, level2_cols].copy().sort_values(["Year","Region"])
level2.to_csv(OUTDIR/f"caiso_level2_by_region_fw{LOOKAHEAD_YEARS}y.csv", index=False)

# 5) Level 3 summary: final RU vs DC burden per region, based on the Table 3 splits.
# (Region, Year) pairs without a split entry get a share of 0 for both RU and DC.
g3 = g2.merge(splits, on=["Region","Year"], how="left")
g3[["RU_share","DC_share"]] = g3[["RU_share","DC_share"]].fillna(0.0)

# Each charge column is (a regional cost column) x (a share column).
# The RU/DC shares are applied separately to the DC-growth and
# Non-DC-growth cost (for Sankey conservation), and also to the total
# regional cost (for reconciliation).
charge_specs = {
    "RU_charge_from_DCgrowth":  ("RegionCost_DCgrowth",  "RU_share"),
    "DC_charge_from_DCgrowth":  ("RegionCost_DCgrowth",  "DC_share"),
    "RU_charge_from_RESgrowth": ("RegionCost_RESgrowth", "RU_share"),
    "DC_charge_from_RESgrowth": ("RegionCost_RESgrowth", "DC_share"),
    "RU_charge_total":          ("RegionCost",           "RU_share"),
    "DC_charge_total":          ("RegionCost",           "DC_share"),
}
for charge_col, (cost_col, share_col) in charge_specs.items():
    g3[charge_col] = g3[cost_col] * g3[share_col]

level3_cols = [
    "Year","Region","RU_share","DC_share",
    "RU_charge_total","DC_charge_total",
    "RU_charge_from_DCgrowth","DC_charge_from_DCgrowth",
    "RU_charge_from_RESgrowth","DC_charge_from_RESgrowth"
]
level3 = g3[level3_cols].sort_values(["Year","Region"])
level3.to_csv(OUTDIR/f"caiso_level3_user_burden_fw{LOOKAHEAD_YEARS}y.csv", index=False)

# 6) Sankey links (by Year), 3-layer structure:
#    L0:  [CAISO-DC, CAISO-NonDC]
#    L1:  [Region DC growth, Region NonDC growth]
#    L2:  [Region RU, Region DC]
def _links_for_region_row(year, row):
    """Build the six Sankey links (two L0->L1, four L1->L2) for one (Year, Region) row."""
    region = row["Region"]
    dc_growth_node = f"{region} - DC growth"
    nondc_growth_node = f"{region} - NonDC growth"
    ru_node = f"{region} - RU"
    dc_node = f"{region} - DC"
    # (source node, target node, column of `row` holding the link value)
    flows = (
        ("CAISO-DC growth", dc_growth_node, "RegionCost_DCgrowth"),
        ("CAISO-NonDC growth", nondc_growth_node, "RegionCost_RESgrowth"),
        (dc_growth_node, ru_node, "RU_charge_from_DCgrowth"),
        (dc_growth_node, dc_node, "DC_charge_from_DCgrowth"),
        (nondc_growth_node, ru_node, "RU_charge_from_RESgrowth"),
        (nondc_growth_node, dc_node, "DC_charge_from_RESgrowth"),
    )
    return [
        {"Year": year, "source": src, "target": dst, "value": float(row[col])}
        for src, dst, col in flows
    ]


sankey_links = [
    link
    for year, year_rows in g3.groupby("Year")
    for _, row in year_rows.iterrows()
    for link in _links_for_region_row(year, row)
]

pd.DataFrame(sankey_links).to_csv(OUTDIR/f"caiso_sankey_links_fw{LOOKAHEAD_YEARS}y.csv", index=False)

# Extra output: the wide tables before/after averaging, for verification.
debug_sheets = {
    "RES_raw_wide": res_wide_raw,
    "DC_raw_wide": dc_wide_raw,
    "RES_fw_raw": res_fw_raw,
    "DC_fw_raw": dc_fw_raw,
    "RES_fw_after_clip": res_fw_pos,
    "DC_fw_after_merge": dc_fw_adj,
}
with pd.ExcelWriter(OUTDIR / f"caiso_forward_mean_debug_fw{LOOKAHEAD_YEARS}y.xlsx") as w:
    for sheet_name, frame in debug_sheets.items():
        frame.to_excel(w, sheet_name=sheet_name)

# List every file this script writes.
print("✅ Done! Output generated:")
for output_name in (
    f"caiso_level1_totals_fw{LOOKAHEAD_YEARS}y.csv",
    f"caiso_level2_by_region_fw{LOOKAHEAD_YEARS}y.csv",
    f"caiso_level3_user_burden_fw{LOOKAHEAD_YEARS}y.csv",
    f"caiso_sankey_links_fw{LOOKAHEAD_YEARS}y.csv",
    f"caiso_forward_mean_debug_fw{LOOKAHEAD_YEARS}y.xlsx",
):
    print("-", OUTDIR / output_name)



✅ Done! Output generated:
- rider/CAISO/caiso_level1_totals_fw5y.csv
- rider/CAISO/caiso_level2_by_region_fw5y.csv
- rider/CAISO/caiso_level3_user_burden_fw5y.csv
- rider/CAISO/caiso_sankey_links_fw5y.csv
- rider/CAISO/caiso_forward_mean_debug_fw5y.xlsx
