# In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# ========= Configuration =========
# Input workbooks and output directory. All paths are relative, so they are
# resolved against the current working directory at run time.
PATH_T1 = Path("./load and costs/ERCOT/ERCOT_cost.xlsx")   # Table 1: ERCOT Investment
PATH_T2 = Path("./load and costs/ERCOT/22_load_compare.xlsx")   # Table 2: Regional Load Increment (Multi-sheet)
PATH_T3 = Path("./rider/ERCOT/00_ERCOT_share.xlsx")   # Table 3: Regional RU/DC Share (Multi-sheet)
OUT_DIR = Path("./rider/ERCOT/")
# Create the output directory (and any missing parents) up front; an already
# existing directory is not an error.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ===== Lookahead Window Parameter =====
# Number of consecutive yearly entries (the cost year plus the following ones)
# that are summed when computing each zone's share of load growth. It also
# determines how far past the last cost year the load tables are extended.
LOOKAHEAD_YEARS = 5  # Set to 5 for a 5-year lookahead window

def _norm(s):
    return str(s).strip().upper()

def load_table2_load_diff(path):
    """
    Load the Table 2 workbook of regional load increments.

    Every sheet of the workbook is read with its first column as the row
    index (described in this file as the Year) and is treated as one region
    (e.g., 'West', 'South'...).

    Returns a dict mapping the normalized (stripped, upper-case) sheet name to
    a DataFrame in which every column has been coerced to numeric; cells that
    cannot be parsed as numbers, and empty cells, become 0.0. No rows or
    columns are dropped, and the index itself is left as read.
    """
    sheets = pd.read_excel(path, sheet_name=None, index_col=0)
    return {
        _norm(sheet_name): frame.apply(pd.to_numeric, errors="coerce").fillna(0.0)
        for sheet_name, frame in sheets.items()
    }

def load_table3_shares(path):
    """
    Load the Table 3 workbook of RU vs DC shares.

    Every sheet is read with its first column as the row index and is treated
    as one region; the remaining columns are expected to be years. Row labels
    are normalized (stripped, upper-cased) so that later lookups of the rows
    'RU' and 'DC' do not depend on how they were typed. The presence of those
    two rows is expected by the caller but is not checked here.

    Returns a dict mapping the normalized sheet name to its DataFrame.
    """
    def _with_normalized_rows(frame):
        # Replace the row labels in place with their normalized form
        frame.index = frame.index.map(_norm)
        return frame

    workbook = pd.read_excel(path, sheet_name=None, index_col=0)
    return {
        _norm(sheet_name): _with_normalized_rows(frame)
        for sheet_name, frame in workbook.items()
    }

# ================= Main Process =================
# 1) Load Data
# Table 1 (investment costs). Only rows whose 'Project Description' contains
# "Total" (case-insensitive) are kept; the working assumption is that these
# summary rows carry the cost totals to be allocated.
t1 = pd.read_excel(PATH_T1)
mask_total = (
    t1["Project Description"]
    .astype(str)
    .str.contains("Total", case=False, na=False)
)
t1 = t1.loc[mask_total].copy()
# Coerce Year to a number; entries that cannot be parsed become NaN
t1["Year"] = pd.to_numeric(t1["Year"], errors="coerce")

# Tables 2 and 3: one DataFrame per zone, keyed by the normalized sheet name
load_diffs = load_table2_load_diff(PATH_T2)  # Dict[Zone, DataFrame(Year x Col)]
shares_ru_dc = load_table3_shares(PATH_T3)   # Dict[Zone, DataFrame(RU/DC x Year)]

# Only zones that appear in BOTH workbooks can be processed
all_zones = sorted(load_diffs.keys() & shares_ru_dc.keys())
print(f"Zones to process: {all_zones}")

# Containers for results; the zone loop appends one DataFrame per zone to
# zone_tabs, user_tabs, links1_tabs and links2_tabs. owner_tabs is not filled
# anywhere in the visible code and is kept only so the name stays defined.
owner_tabs = []
zone_tabs = []
user_tabs = []
links1_tabs = []
links2_tabs = []

# 2) Build the per-zone load-growth table
# Distinct cost years found in Table 1 (NaN years are ignored)
years_all = sorted(t1["Year"].dropna().unique().astype(int))

# Fail early with an explicit message. Without these checks an empty year
# list surfaces as "min() arg is an empty sequence", and an empty zone list
# only fails later when the (empty) result lists are concatenated.
if not years_all:
    raise ValueError(
        "Table 1 contains no 'Total' rows with a usable Year; nothing to allocate."
    )
if not all_zones:
    raise ValueError(
        "No zone is present in both the load-increment and the share workbooks."
    )

min_y, max_y = min(years_all), max(years_all)

# The growth table covers every cost year plus LOOKAHEAD_YEARS extra years, so
# that a forward-looking window starting at the last cost year is complete.
year_span = range(min_y, max_y + 1 + LOOKAHEAD_YEARS)

zone_growth_map = {}
for z in all_zones:
    # Years missing from the sheet are treated as zero load increment
    df_load = load_diffs[z].reindex(year_span, fill_value=0.0)
    # Take the first column as the load increment value
    zone_growth_map[z] = df_load.iloc[:, 0].fillna(0.0)

# DataFrame: Index=Year, Columns=Zones
df_growth_all = pd.DataFrame(zone_growth_map).sort_index()
# Total System Growth (Sum of all zones); exact zeros are nudged to a tiny
# positive value so that later divisions by the total do not divide by zero.
s_total_growth = df_growth_all.sum(axis=1).replace(0.0, 1e-9)

def _lookahead_sum(series, window):
    """Return, per entry, the sum of that entry and the next ``window - 1`` entries.

    A trailing rolling sum is computed and then shifted back by ``window - 1``
    positions, which turns it into a forward-looking sum. The last
    ``window - 1`` entries of the result are NaN because their window would
    run past the end of ``series``.
    """
    trailing = series.rolling(window=window, min_periods=1).sum()
    return trailing.shift(-(window - 1))


# ---- Loop-invariant inputs (identical for every zone, so computed once) ----

# Forward-looking system-wide growth; zeros become NaN so the share division
# below yields NaN (then 0.0) instead of inf.
roll_total = _lookahead_sum(s_total_growth, LOOKAHEAD_YEARS).replace(0.0, np.nan)

# First Table 1 column whose name contains "ZONE" (case-insensitive), if any.
# str() keeps this working when a column label is not a string.
col_zone_t1 = next((c for c in t1.columns if "ZONE" in str(c).upper()), None)

if col_zone_t1 is not None:
    # Normalized zone label of every Table 1 row
    zone_labels_t1 = t1[col_zone_t1].astype(str).map(_norm)
    # Shared Rows (Marked as 'ERCOT' or 'SYSTEM' or similar)
    # Assuming 'ERCOT' represents system-wide
    mask_shared = zone_labels_t1.isin(["ERCOT", "SYSTEM", "TOTAL"])
    cost_shared_raw = t1.loc[mask_shared].groupby("Year")["Total Cost"].sum()
else:
    # Fallback: Assume ALL rows are System-wide shared and take the first
    # numeric column (other than the Year group key) as the cost.
    # Note: If T1 contains breakdown, summing everything might double count,
    # but T1 was already reduced to its 'Total' rows when it was loaded.
    cost_all = t1.groupby("Year").sum(numeric_only=True).iloc[:, 0]

for zone in all_zones:
    print(f"... Processing Zone: {zone}")

    # --- A. Zone share of forward-looking load growth ---
    s_zone = df_growth_all[zone]
    roll_zone = _lookahead_sum(s_zone, LOOKAHEAD_YEARS)

    # Share = zone window sum / system window sum, clipped to [0, 1] just in
    # case, then aligned to the cost years (missing years get a share of 0).
    share_series = (roll_zone / roll_total).fillna(0.0).clip(0, 1)
    share_series = share_series.reindex(years_all).fillna(0.0)

    # --- B. Allocate Table 1 cost to this zone ---
    cost_allocated = pd.Series(0.0, index=years_all)

    if col_zone_t1 is not None:
        # 1. Rows assigned specifically to this zone are taken in full
        mask_specific = zone_labels_t1 == zone
        cost_spec = t1.loc[mask_specific].groupby("Year")["Total Cost"].sum()  # Assuming 'Total Cost' col
        cost_allocated = cost_allocated.add(cost_spec, fill_value=0)

        # 2. System-wide rows are scaled by the zone's growth share
        cost_shared_alloc = cost_shared_raw * share_series
        cost_allocated = cost_allocated.add(cost_shared_alloc, fill_value=0)
    else:
        cost_allocated = cost_all * share_series

    # The arithmetic above aligns on index labels and may change the index
    # dtype; pin the result back to years_all so that the positional .values
    # used below are guaranteed to line up with the "Year" column.
    cost_allocated = cost_allocated.reindex(years_all, fill_value=0.0)

    # --- C. Split into RU / DC ---
    # Get RU/DC ratios for this zone
    df_share_rudc = shares_ru_dc.get(zone)
    if df_share_rudc is None:
        # Default to 0/0 if missing. all_zones is built from the zones present
        # in both input dicts, so this branch is purely defensive.
        ru_share = pd.Series(0.0, index=years_all)
        dc_share = pd.Series(0.0, index=years_all)
    else:
        missing_rows = [r for r in ("RU", "DC") if r not in df_share_rudc.index]
        if missing_rows:
            raise KeyError(
                f"Share sheet for zone {zone} is missing row(s): {missing_rows}"
            )

        # Columns are years: restrict them to the cost years, then fill gaps
        # from the nearest earlier year and, failing that, the nearest later
        # one. .ffill()/.bfill() replace the deprecated fillna(method=...).
        df_share_rudc = (
            df_share_rudc.reindex(columns=years_all).ffill(axis=1).bfill(axis=1)
        )
        ru_share = df_share_rudc.loc["RU"]
        dc_share = df_share_rudc.loc["DC"]

        # Normalize to 1 (just in case); a zero total is divided by 1 instead
        # so both shares stay 0 rather than becoming NaN.
        tot = ru_share + dc_share
        denom = tot.replace(0, 1)
        ru_share = ru_share / denom
        dc_share = dc_share / denom

    cost_ru = (cost_allocated * ru_share).reindex(years_all)
    cost_dc = (cost_allocated * dc_share).reindex(years_all)

    # --- D. Collect Results ---

    # 1. Zone Tab (Allocated Total)
    # Zone | Year | Cost_Total | Share_Load_Growth
    df_z = pd.DataFrame({
        "Zone": zone,
        "Year": years_all,
        "Allocated_Cost_Total": cost_allocated.values,
        "Load_Growth_Share_Used": share_series.values
    })
    zone_tabs.append(df_z)

    # 2. User Tab (RU vs DC)
    df_u = pd.DataFrame({
        "Zone": zone,
        "Year": years_all,
        "Cost_RU": cost_ru.values,
        "Cost_DC": cost_dc.values
    })
    user_tabs.append(df_u)

    # 3. Links
    # L1: Source(ERCOT_Invest) -> Zone
    l1 = pd.DataFrame({
        "Year": years_all,
        "source": "ERCOT_Grid",
        "target": zone,
        "value": cost_allocated.values,
        "scenario": "Base"
    })
    links1_tabs.append(l1)

    # L2: Zone -> RU / DC
    l2_ru = pd.DataFrame({
        "Year": years_all,
        "source": zone,
        "target": "RU",
        "value": cost_ru.values,
        "scenario": "Base"
    })
    l2_dc = pd.DataFrame({
        "Year": years_all,
        "source": zone,
        "target": "DC",
        "value": cost_dc.values,
        "scenario": "Base"
    })
    links2_tabs.append(pd.concat([l2_ru, l2_dc]))

# 3) Export
# Stack the per-zone pieces into one table per output sheet
df_zone_all = pd.concat(zone_tabs, ignore_index=True)
df_user_all = pd.concat(user_tabs, ignore_index=True)
df_l1_all = pd.concat(links1_tabs, ignore_index=True)
df_l2_all = pd.concat(links2_tabs, ignore_index=True)

# Keep only links with a value above 1e-3 to clean up the Sankey; this drops
# zero-valued links and also any negative or NaN values.
df_l1_all = df_l1_all.loc[df_l1_all["value"].gt(1e-3)]
df_l2_all = df_l2_all.loc[df_l2_all["value"].gt(1e-3)]

out_file = OUT_DIR / "sankey_inputs_ERCOT.xlsx"
# Sheet name -> table, written in this order into a single workbook
sheets_out = {
    "C_zone_alloc": df_zone_all,
    "D_user_split": df_user_all,
    "L1_links": df_l1_all,
    "L2_links": df_l2_all,
}
with pd.ExcelWriter(out_file, engine="openpyxl") as writer:
    for sheet_name, table in sheets_out.items():
        table.to_excel(writer, index=False, sheet_name=sheet_name)

print(f"Done! Saved to {out_file}")