In [1]:
from __future__ import annotations
import re
from pathlib import Path
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd

# ============ Path Configuration (Modify as Needed) ============
# Input/output locations and the projection horizon used by every step below.
PATH_MAP = Path("./poverty/county_to_iso_or_city.xlsx")     # County mapping table; must provide columns county_fips, iso, zone
MAP_SHEET = 0
DIR_LEAD_2022 = Path("./poverty/LEAD/")                     # Directory searched for LEAD files named "* AMI Counties 2022.csv"
PATH_INCOME_GROWTH = Path("./poverty/macroeconomic.xlsx")   # Income growth table (last row is used; rescaled so that 2022 = 1)
INCOME_SHEET = 0
DIR_ISO_PRICES = Path("./rider/equality/")                  # Directory searched for "<ISO>_zone_prices_res_capped.xlsx" (sheet name = zone)
PATH_GAS_OIL = Path("./poverty/fuel/state_fuel_price.xlsx") # State gas/oil price workbook (sheet names become the state_abbr index)
OUT_DIR = Path("./rider/equality_poverty/")                 # Destination of the per-year energy_burden_<year>.csv files
YEARS = list(range(2025, 2030 + 1))                         # Projection years, 2025 through 2030 inclusive
# ============ FIPS -> State Abbreviation ============
# Keys are the two-digit state prefix of a five-digit county FIPS code.
# The table holds 52 entries: the 50 states, DC ("11") and PR ("72").
# Any other prefix maps to NaN when used with Series.map.
STATE_FIPS_TO_ABBR = {
    "01":"AL","02":"AK","04":"AZ","05":"AR","06":"CA","08":"CO","09":"CT","10":"DE","11":"DC","12":"FL",
    "13":"GA","15":"HI","16":"ID","17":"IL","18":"IN","19":"IA","20":"KS","21":"KY","22":"LA","23":"ME",
    "24":"MD","25":"MA","26":"MI","27":"MN","28":"MS","29":"MO","30":"MT","31":"NE","32":"NV","33":"NH",
    "34":"NJ","35":"NM","36":"NY","37":"NC","38":"ND","39":"OH","40":"OK","41":"OR","42":"PA","44":"RI",
    "45":"SC","46":"SD","47":"TN","48":"TX","49":"UT","50":"VT","51":"VA","53":"WA","54":"WV","55":"WI",
    "56":"WY","72":"PR"}

# ============ Utility Functions ============
def clean_zone(z: str, iso: str | None = None) -> str:
    """Normalise a zone label into an upper-case alphanumeric key.

    Missing input (``None`` / NaN) yields ``""``. When ``iso`` is given, a
    leading ``iso`` followed by at least one separator (whitespace, ``_``,
    ``-``, ``:`` or ``/``) is removed, ignoring case. Every remaining
    character outside ``A-Z``, ``a-z`` and ``0-9`` is then dropped.
    """
    if z is None or pd.isna(z):
        return ""

    label = str(z).strip()
    if iso:
        # The prefix is stripped only when a separator follows it, so a zone
        # whose name merely starts with the ISO letters is left intact.
        iso_prefix = re.compile(rf"^\s*{re.escape(iso)}[\s_\-:/]+", re.IGNORECASE)
        label = iso_prefix.sub("", label)

    alnum_only = re.sub(r"[^A-Za-z0-9]+", "", label)
    return alnum_only.upper()

def split_and_clean_zones(series_zones: pd.Series, iso: str) -> List[str]:
    """Collect the distinct cleaned zone keys found in a zone column.

    Each cell is converted to text and split on ``|`` (surrounding whitespace
    ignored). Empty pieces and the literal text ``nan`` (any case) are
    skipped; the rest are passed through ``clean_zone`` with ``iso``. The
    result is sorted and free of duplicates.
    """
    unique_zones = set()
    for cell in series_zones.astype(str):
        for piece in re.split(r"\s*\|\s*", cell):
            if not piece or piece.lower() == "nan":
                continue
            cleaned = clean_zone(piece, iso)
            if cleaned:
                unique_zones.add(cleaned)
    return sorted(unique_zones)

def iso_from_filename(p: Path) -> str:
    """Derive the ISO code from a file name, e.g. PJM_zone_prices.xlsx -> PJM.

    Takes the part of the stem before the first underscore, drops anything
    that is not an ASCII letter, and upper-cases what is left.
    """
    leading_part, _, _ = p.stem.partition("_")
    letters_only = re.sub(r"[^A-Za-z]", "", leading_part)
    return letters_only.upper()

# ============ 1) Mapping Table ============
def load_county_map(path: Path, sheet=0) -> pd.DataFrame:
    """Read the county -> ISO/zone mapping table.

    ``.xlsx`` / ``.xls`` files are read with ``read_excel`` (using ``sheet``);
    any other suffix is read as CSV. ``county_fips`` is read as text and
    left-padded with zeros to five characters; ``iso`` is stripped and
    upper-cased. Only the columns ``county_fips``, ``iso`` and ``zone`` are
    returned.

    Raises:
        ValueError: if any of the three required columns is absent.
    """
    required = ["county_fips", "iso", "zone"]
    fips_as_text = {"county_fips": str}

    is_excel = path.suffix.lower() in (".xlsx", ".xls")
    if is_excel:
        table = pd.read_excel(path, sheet_name=sheet, dtype=fips_as_text)
    else:
        table = pd.read_csv(path, dtype=fips_as_text)

    absent = [col for col in required if col not in table.columns]
    if absent:
        raise ValueError(f"Mapping table missing columns: {absent}")

    # Restore leading zeros that spreadsheets tend to drop from FIPS codes.
    table["county_fips"] = table["county_fips"].astype(str).str.zfill(5)
    table["iso"] = table["iso"].astype(str).str.strip().str.upper()
    return table[required].copy()

# ============ 2) LEAD 2022 (Keep Segment Columns) ============
SEG_COLS = ["AMI150","TEN","TEN-YBL6","TEN-BLD","TEN-HFL","NAME"]  # Household segment columns, kept whenever a file provides them


def load_lead_2022(dir_path: Path) -> pd.DataFrame:
    """Load and stack every ``* AMI Counties 2022.csv`` file in ``dir_path``.

    Per file, the kept columns are ``FIP``, whichever ``SEG_COLS`` exist, and
    the value columns UNITS, HINCP*UNITS, ELEP*UNITS, GASP*UNITS and
    FULP*UNITS. Two columns are derived: ``county_fips`` (``FIP`` as text,
    zero-padded to five characters) and ``state_abbr`` (looked up from the
    first two FIPS digits). Value columns are coerced to numbers, with
    unparseable or missing entries set to 0.0.

    Files lacking ``FIP`` or any value column are skipped without error.

    Raises:
        RuntimeError: if no file in the directory qualifies.
    """
    value_cols = ["UNITS","HINCP*UNITS","ELEP*UNITS","GASP*UNITS","FULP*UNITS"]
    required = ["FIP", *value_cols]

    frames = []
    for csv_path in sorted(dir_path.glob("* AMI Counties 2022.csv")):
        try:
            table = pd.read_csv(csv_path, low_memory=False)
        except UnicodeDecodeError:
            # Retry with a single-byte encoding when the default decode fails.
            table = pd.read_csv(csv_path, low_memory=False, encoding="latin1")
        table.columns = [name.strip() for name in table.columns]

        if not set(required).issubset(table.columns):
            # Not a usable LEAD table; ignore the file.
            continue

        segments = [col for col in SEG_COLS if col in table.columns]
        table = table[["FIP", *segments, *value_cols]].copy()

        fips = table["FIP"].astype(str).str.zfill(5)
        table["county_fips"] = fips
        table["state_abbr"] = fips.str[:2].map(STATE_FIPS_TO_ABBR)

        for col in value_cols:
            numeric = pd.to_numeric(table[col], errors="coerce")
            table[col] = numeric.fillna(0.0)

        frames.append(table)

    if len(frames) == 0:
        raise RuntimeError("No valid 2022 CSV found in LEAD directory.")
    return pd.concat(frames, ignore_index=True)

# ============ 3) Income Growth (Last Row; 2022=1) ============
def load_income_growth(path: Path, sheet=0, years: List[int] = YEARS) -> Dict[int, float]:
    """Return income growth factors for ``years``, expressed relative to 2022.

    The factors come from the last row of the sheet (after dropping columns
    that are entirely empty). Columns are looked up by the year value itself,
    and a 2022 column is always required. If the 2022 entry is not already 1
    (within 1e-9), every factor is divided by it.

    Raises:
        KeyError: if a requested year, or 2022, is not a column of the sheet.
    """
    table = pd.read_excel(path, sheet_name=sheet)
    final_row = table.dropna(how="all", axis=1).iloc[-1]

    factors = {}
    for year in years + [2022]:
        if year not in table.columns:
            raise KeyError(f"Income growth table missing year column {year}")
        factors[year] = float(final_row[year])

    # Rescale so that the 2022 factor is exactly 1.
    reference = factors[2022]
    if abs(reference - 1.0) > 1e-9:
        factors = {year: value / reference for year, value in factors.items()}

    return {year: factors[year] for year in years}

# ============ 4) ISO Price Ratios: total vs total_res_capped ============
def _pick_item_row(table: pd.DataFrame, name_candidates: List[str]) -> pd.Series | None:
    """Return the first row whose index label matches a candidate, else None.

    Exact label matches are tried for every candidate first; only then is a
    case-insensitive substring search attempted. If several rows share the
    matching label, the first of them is used.
    """
    for nm in name_candidates:
        if nm in table.index:
            hit = table.loc[nm]
            return hit.iloc[0] if isinstance(hit, pd.DataFrame) else hit
    labels = table.index.to_series()
    for nm in name_candidates:
        m = labels.str.contains(re.escape(nm), case=False, na=False)
        if m.any():
            # Positional mask: avoids label alignment, which fails on duplicate labels.
            return table.loc[m.to_numpy()].iloc[0]
    return None


def _zone_price_ratios(sheet: pd.DataFrame, years: List[int]) -> Dict[str, float]:
    """Compute the price ratios of one zone sheet; ``{}`` if the sheet is unusable."""
    # Headers such as "2025" or " 2025 " become the integer 2025.
    year_labels = {}
    for col in sheet.columns:
        text = str(col).strip()
        if re.fullmatch(r"\d{4}", text):
            year_labels[col] = int(text)
    sheet = sheet.rename(columns=year_labels)

    item_col = next((c for c in sheet.columns if str(c).strip().lower() == "item"), None)
    if item_col is None:
        return {}
    sheet[item_col] = sheet[item_col].astype(str).str.strip().str.lower()
    sheet = sheet.set_index(item_col)

    s_total = _pick_item_row(sheet, ["total"])
    s_cap = _pick_item_row(sheet, ["total_res_capped", "total res capped", "res_capped"])
    if s_total is None or s_cap is None:
        return {}

    # Baseline year: 2022 when the sheet has it, otherwise the earliest year column.
    ycols = [c for c in s_total.index if isinstance(c, (int, np.integer))]
    if not ycols:
        return {}
    base_year = 2022 if 2022 in ycols else min(ycols)

    try:
        base_total = float(s_total.loc[base_year])
        base_cap = float(s_cap.loc[base_year])
    except (KeyError, TypeError, ValueError):
        return {}
    # A zero or non-finite baseline cannot produce meaningful ratios.
    if not (np.isfinite(base_total) and np.isfinite(base_cap)):
        return {}
    if base_total == 0 or base_cap == 0:
        return {}

    out: Dict[str, float] = {}
    for y in years:
        if y not in s_total.index or y not in s_cap.index:
            continue
        out[f"with_dc_{y}"] = float(s_total.loc[y]) / base_total
        # Each series is scaled by its OWN baseline, as the contract states.
        out[f"res_cap_{y}"] = float(s_cap.loc[y]) / base_cap
    return out


def load_iso_price_ratios(dir_path: Path, years: List[int]) -> Dict[str, Dict[str, Dict[str, float]]]:
    """Build per-zone electricity price ratios from the ISO price workbooks.

    Every ``*_zone_prices_res_capped.xlsx`` file in ``dir_path`` is read. The
    ISO code comes from the file name and each sheet name, once cleaned, is a
    zone key. A sheet needs an ``item`` column containing a ``total`` row and
    a ``total_res_capped`` row (a few spelling variants are accepted).

    Returns:
      ratios[ISO][ZONE]['with_dc_YYYY']   = total(y)/total(base)
      ratios[ISO][ZONE]['res_cap_YYYY']   = total_res_capped(y)/total_res_capped(base)
    where base is 2022 when present, otherwise the earliest year column.

    Workbooks that cannot be opened are skipped. Sheets without the required
    rows, without year columns, or with a zero or non-finite baseline are
    also skipped. An ISO whose sheets are all skipped still gets an empty
    entry.
    """
    ratios: Dict[str, Dict[str, Dict[str, float]]] = {}
    for xlsx in sorted(dir_path.glob("*_zone_prices_res_capped.xlsx")):
        iso = iso_from_filename(xlsx)
        ratios.setdefault(iso, {})
        try:
            book = pd.ExcelFile(xlsx)
        except Exception:
            # Deliberate best-effort: the error type depends on the Excel
            # engine, and one unreadable workbook should not abort the load.
            continue

        # Open the workbook once and close it deterministically.
        with book:
            for sh in book.sheet_names:
                zone_clean = clean_zone(sh, iso)
                if not zone_clean:
                    continue
                zone_ratios = _zone_price_ratios(pd.read_excel(book, sheet_name=sh), years)
                if zone_ratios:
                    ratios[iso].setdefault(zone_clean, {}).update(zone_ratios)
    return ratios

# ============ 5) State Gas/Oil Price Ratios (vs 2022) ============
def _pick_fuel_row(table: pd.DataFrame, cands: List[str]) -> pd.Series | None:
    """Return the first row whose index label matches a candidate, else None.

    Exact label matches are tried for every candidate first, then a
    case-insensitive substring search. If several rows share the matching
    label, the first of them is used.
    """
    for nm in cands:
        if nm in table.index:
            hit = table.loc[nm]
            return hit.iloc[0] if isinstance(hit, pd.DataFrame) else hit
    labels = table.index.to_series()
    for nm in cands:
        m = labels.str.contains(re.escape(nm), case=False, na=False)
        if m.any():
            # Positional mask: avoids label alignment, which fails on duplicate labels.
            return table.loc[m.to_numpy()].iloc[0]
    return None


def load_state_fuel_price_ratios(path: Path, years: List[int]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build natural-gas and distillate-fuel-oil price ratios relative to 2022.

    Each sheet of the workbook is one entry; its sheet name becomes the row
    label of the returned frames (index name ``state_abbr``). The first column
    of a sheet holds the row labels, and the other columns are years. Year
    headers stored as text (e.g. ``"2022"``) are converted to integers.

    A sheet is skipped when it has no columns, lacks either fuel row, has no
    2022 column, or has a zero or non-finite 2022 price for either fuel.

    Returns:
        ``(gas_df, oil_df)`` with one row per usable sheet (sorted by label)
        and one column per requested year present in that sheet. Each value
        is ``price(year) / price(2022)``.
    """
    gas_ratios = {}
    oil_ratios = {}
    # Open the workbook once and close it deterministically.
    with pd.ExcelFile(path) as xls:
        for sheet in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet)
            if df.shape[1] == 0:
                continue  # empty sheet: there is no label column to index by

            # Headers such as "2025" or " 2025 " become the integer 2025.
            year_labels = {}
            for col in df.columns:
                text = str(col).strip()
                if re.fullmatch(r"\d{4}", text):
                    year_labels[col] = int(text)
            df = df.rename(columns=year_labels)

            idx_col = df.columns[0]
            df[idx_col] = df[idx_col].astype(str).str.strip()
            df = df.set_index(idx_col)

            s_ng = _pick_fuel_row(df, ["NG (Natural Gas)", "Natural Gas"])
            s_dfo = _pick_fuel_row(df, ["DFO (Distillate Fuel Oil)", "Distillate Fuel Oil"])
            if s_ng is None or s_dfo is None:
                continue
            if 2022 not in s_ng.index:
                continue  # no baseline year, so no ratio can be formed

            base_ng = float(s_ng.loc[2022])
            base_dfo = float(s_dfo.loc[2022])
            if not (np.isfinite(base_ng) and np.isfinite(base_dfo)):
                continue
            if base_ng == 0 or base_dfo == 0:
                continue

            gas_ratios[sheet] = {y: float(s_ng.loc[y]) / base_ng for y in years if y in s_ng.index}
            oil_ratios[sheet] = {y: float(s_dfo.loc[y]) / base_dfo for y in years if y in s_dfo.index}

    gas_df = pd.DataFrame.from_dict(gas_ratios, orient="index").sort_index()
    gas_df.index.name = "state_abbr"
    oil_df = pd.DataFrame.from_dict(oil_ratios, orient="index").sort_index()
    oil_df.index.name = "state_abbr"
    return gas_df, oil_df

In [2]:
# Progress bar is optional: if tqdm cannot be imported, fall back to a
# pass-through that returns the iterable unchanged and ignores keyword options.
try:
    from tqdm.auto import tqdm
except Exception:
    def tqdm(x, **k): return x
OUT_DIR.mkdir(parents=True, exist_ok=True)
# 1. Mapping Table
mapping = load_county_map(PATH_MAP, MAP_SHEET)
# ISOs with price tables (ISO code taken from each workbook's file name)
iso_available = {iso_from_filename(p) for p in DIR_ISO_PRICES.glob("*_zone_prices_res_capped.xlsx")}
mapping = mapping[mapping["iso"].isin(iso_available)].copy()   # Filter: keep only counties whose ISO has a price workbook

In [3]:
# 2. LEAD 2022
lead = load_lead_2022(DIR_LEAD_2022)
# Only process counties present in mapping
# NOTE(review): if the mapping holds more than one row for a county_fips, this
# inner merge repeats every LEAD row of that county once per mapping row --
# confirm the mapping table is unique per county.
base = lead.merge(mapping, on="county_fips", how="inner")
# Add state abbreviation (defensive fallback; load_lead_2022 already adds state_abbr)
if "state_abbr" not in base.columns:
    base["state_abbr"] = base["county_fips"].str[:2].map(STATE_FIPS_TO_ABBR)
# 3. Income Growth Ratios (dict: year -> factor relative to 2022)
income_growth = load_income_growth(PATH_INCOME_GROWTH, INCOME_SHEET, YEARS)
# 4. Electricity Price Ratios (nested dict: ISO -> zone -> "with_dc_<year>" / "res_cap_<year>")
price_ratios = load_iso_price_ratios(DIR_ISO_PRICES, YEARS)
# 5. Gas/Oil Ratios (frames indexed by state_abbr, one column per year)
gas_ratio_df, oil_ratio_df = load_state_fuel_price_ratios(PATH_GAS_OIL, YEARS)
# One list of output records per projection year, filled by the county loop
results_by_year = {y: [] for y in YEARS}

In [4]:
# Process per county
def _mean_zone_ratio(zone_ratios, key: str, year: int) -> float | None:
    """Average ``<key>_<year>`` over the zones that provide it; None if none do."""
    label = f"{key}_{year}"
    vals = [r[label] for r in zone_ratios if r.get(label) is not None]
    return float(np.mean(vals)) if vals else None


def _state_fuel_ratio(ratio_df: pd.DataFrame, state, year: int) -> float | None:
    """Look up a state's fuel price ratio for ``year``; None when unavailable."""
    try:
        return float(ratio_df.loc[state, year])
    except (KeyError, TypeError, ValueError):
        # Unknown state or year (KeyError) or a non-scalar/non-numeric cell.
        return None


n_counties = base["county_fips"].nunique()
# Household segment columns (carry over if present in LEAD); every group has the columns of base
seg_cols = [c for c in SEG_COLS if c in base.columns]

for county, g in tqdm(base.groupby("county_fips", sort=False), total=n_counties, desc="Processing counties"):
    iso = g["iso"].iloc[0]
    state_abbr = g["state_abbr"].iloc[0]
    # Split, clean, and deduplicate zones
    zones = split_and_clean_zones(g["zone"], iso)
    # Get available price ratios for zones in this county
    iso_ratios = price_ratios.get(iso, {})
    zrat = [iso_ratios[z] for z in zones if iso_ratios.get(z)]
    if not zrat:
        continue  # No matching zones, skip county
    zone_list = "|".join(zones)

    # Everything here depends only on (county, year), so it is computed once
    # per county instead of once per LEAD row.
    year_factors = {}
    for y in YEARS:
        r_with = _mean_zone_ratio(zrat, "with_dc", y)     # total
        r_cap  = _mean_zone_ratio(zrat, "res_cap", y)     # total_res_capped
        if r_with is None or r_cap is None:
            continue
        # State Gas/Oil Ratios; a year is dropped when either ratio is missing
        gas_ratio = _state_fuel_ratio(gas_ratio_df, state_abbr, y)
        oil_ratio = _state_fuel_ratio(oil_ratio_df, state_abbr, y)
        if gas_ratio is None or oil_ratio is None:
            continue
        year_factors[y] = (float(income_growth[y]), r_with, r_cap, gas_ratio, oil_ratio)
    if not year_factors:
        continue  # No year has a complete set of ratios for this county

    for _, row in g.iterrows():
        # 2022 aggregates for this household segment
        income0 = float(row["HINCP*UNITS"])
        elec0   = float(row["ELEP*UNITS"])
        gas0    = float(row["GASP*UNITS"])
        fuel0   = float(row["FULP*UNITS"])
        units   = float(row["UNITS"])

        for y, (growth, r_with, r_cap, gas_ratio, oil_ratio) in year_factors.items():
            # Income Projection
            income = income0 * growth

            # Electricity Cost Scenarios
            elec_with = elec0 * r_with
            elec_cap  = elec0 * r_cap

            gas_cost  = gas0  * gas_ratio
            fuel_cost = fuel0 * oil_ratio

            # Burden (Percentage); undefined when there is no positive income
            if income <= 0:
                burden_with = np.nan
                burden_cap  = np.nan
            else:
                burden_with = 100.0 * (elec_with + gas_cost + fuel_cost) / income
                burden_cap  = 100.0 * (elec_cap  + gas_cost + fuel_cost) / income

            out = {
                "county_fips": county,
                "iso": iso,
                "UNITS": units,
                "zone_list": zone_list,
                "state": state_abbr,
                "income_total": income,
                "elec_with_dc": elec_with,
                "elec_res_capped": elec_cap,
                "gas": gas_cost,
                "fuel": fuel_cost,
                "energy_burden_with_dc_%": burden_with,
                "energy_burden_res_capped_%": burden_cap,
            }
            for c in seg_cols:
                out[c] = row[c]
            results_by_year[y].append(out)

# 6. Export to CSV (One per Year)
OUT_DIR.mkdir(parents=True, exist_ok=True)
for y in YEARS:
    dfy = pd.DataFrame(results_by_year[y])
    if dfy.empty:
        continue  # Nothing was produced for this year, so no file is written
    # No sorting, keep traversal order (add sort_values if needed)
    dfy.to_csv(OUT_DIR / f"energy_burden_{y}.csv", index=False, encoding="utf-8-sig")

print(f"Done! Output directory: {OUT_DIR.resolve()}")


Processing counties:   0%|          | 0/1591 [00:00<?, ?it/s]

Done! Output directory: /home/cfeng/LCA/rider/equality_poverty
