# In [None]:
from __future__ import annotations
import re
from pathlib import Path
from typing import List
import numpy as np
import pandas as pd

# ============ Path Configuration (adjust to the local layout) ============
# Input: folder holding the energy_burden_YYYY.csv files saved by the previous step.
DIR_YEARLY = Path("./poverty")
# Input: folder holding the LEAD state files, e.g. "AK AMI Counties 2022.csv".
DIR_LEAD_2022 = Path("./poverty/LEAD")
# Output: folder that receives every CSV written by this script.
OUT_DIR = Path("./energy_poverty_outputs")
# Years to process (2025 through 2030, both ends included).
YEARS = [*range(2025, 2031)]

# Segment columns used to align the yearly tables with LEAD. The same names are
# expected on both sides; a column that is absent is filled with 'NA'.
SEG_COLS: List[str] = [
    "AMI150",
    "TEN",
    "TEN-YBL6",
    "TEN-BLD",
    "TEN-HFL",
    "NAME",
]

# ============ Utility Functions ============
def ensure_cols_and_key(df: pd.DataFrame, seg_cols=SEG_COLS) -> pd.DataFrame:
    """Normalize the segment columns of *df* and add a ``seg_key`` join column.

    Every column named in *seg_cols* is created if absent, cast to string, and
    has its missing-value spellings ('nan', 'none', 'null', empty string,
    compared case-insensitively) replaced with 'NA'. ``seg_key`` is the
    '||'-joined concatenation of those columns, in *seg_cols* order.

    The frame is modified in place and also returned.
    """
    missing_tokens = ["nan", "none", "null", ""]
    for col in seg_cols:
        if col not in df.columns:
            # Absent segment column: start from NaN so it ends up as 'NA' below.
            df[col] = np.nan
        as_text = df[col].astype(str)
        is_missing = as_text.str.lower().isin(missing_tokens)
        df[col] = as_text.mask(is_missing, "NA")
    # '||' is used as the separator so it does not collide with real values.
    df["seg_key"] = df[seg_cols].agg("||".join, axis=1)
    return df

def load_lead_units(dir_lead: Path) -> pd.DataFrame:
    """Load UNITS per county and segment from every LEAD 2022 state file.

    Reads each file matching ``* AMI Counties 2022.csv`` in *dir_lead* (falling
    back to latin1 when the default decoding fails), keeps ``FIP``, ``UNITS``
    and whichever ``SEG_COLS`` are present, and concatenates the results.

    Args:
        dir_lead: Directory containing the LEAD state CSV files.

    Returns:
        A frame with ``county_fips`` (zero-padded to 5 characters), ``units``
        (numeric, non-parsable values set to 0.0), ``seg_key`` and every column
        in ``SEG_COLS``.

    Raises:
        RuntimeError: If no file yields usable rows (no files matched, or all
            of them lack the ``FIP``/``UNITS`` columns).
    """
    frames = []
    for csv_path in sorted(dir_lead.glob("* AMI Counties 2022.csv")):
        try:
            t = pd.read_csv(csv_path, low_memory=False)
        except UnicodeDecodeError:
            t = pd.read_csv(csv_path, low_memory=False, encoding="latin1")
        t.columns = [c.strip() for c in t.columns]
        if "FIP" not in t.columns or "UNITS" not in t.columns:
            # Report the skip so a state dropping out of the weights is visible.
            print(f"[Skip] LEAD file missing FIP/UNITS columns: {csv_path}")
            continue
        keep = ["FIP", "UNITS"] + [c for c in SEG_COLS if c in t.columns]
        t = t[keep].copy()
        # A FIP column parsed as float (e.g. when it contains blanks) renders as
        # "1001.0"; strip that suffix before padding so the key is "01001".
        t["county_fips"] = (
            t["FIP"].astype(str).str.replace(r"\.0+$", "", regex=True).str.zfill(5)
        )
        t = t.rename(columns={"UNITS": "units"})
        t = ensure_cols_and_key(t)
        frames.append(t[["county_fips", "units", "seg_key"] + SEG_COLS])
    if not frames:
        raise RuntimeError("No state files read from LEAD directory or missing FIP/UNITS columns.")
    units = pd.concat(frames, ignore_index=True)
    # Convert to numeric; anything that cannot be parsed counts as zero units.
    units["units"] = pd.to_numeric(units["units"], errors="coerce").fillna(0.0)
    return units

def compute_poverty_shares(df_year: pd.DataFrame, units_ref: pd.DataFrame, year: int) -> pd.DataFrame:
    """Compute per-county, unit-weighted shares above the 6% and 10% burden thresholds.

    Each row of *df_year* is matched to its ``units`` weight in *units_ref* on
    ``county_fips`` + ``seg_key``. Rows are then dropped when they have no
    positive weight, when both burden percentages are missing, or when either
    percentage exceeds 100. For every remaining county the share of units with
    burden > 6 and > 10 is computed for both scenarios (with / without "dc").

    Args:
        df_year: One year's table; must contain ``county_fips``,
            ``energy_burden_with_dc_%`` and ``energy_burden_no_dc_%``.
            ``iso`` and the ``SEG_COLS`` columns are used when present.
        units_ref: Weights with ``county_fips``, ``seg_key`` and ``units``.
        year: Value written to the ``year`` column of the result.

    Returns:
        One row per county with the total units counted, the four shares and
        the two (with - no) share differences. The column set is the same even
        when no county survives the filters.

    Raises:
        KeyError: If *df_year* lacks one of the required columns.
        pandas.errors.MergeError: If *units_ref* has duplicate
            (``county_fips``, ``seg_key``) pairs (``validate="m:1"``).
    """
    burden_cols = ["energy_burden_with_dc_%", "energy_burden_no_dc_%"]
    out_cols = [
        "county_fips", "iso", "year", "total_units_counted",
        "share_with_dc_gt6", "share_with_dc_gt10",
        "share_no_dc_gt6", "share_no_dc_gt10",
        "share_diff_gt6", "share_diff_gt10",
    ]

    # Fail early with a readable message instead of a bare KeyError further down.
    missing = [c for c in ["county_fips"] + burden_cols if c not in df_year.columns]
    if missing:
        raise KeyError(f"df_year is missing required column(s): {missing}")

    df = df_year.copy()
    # A float-typed FIPS column renders as "1001.0"; strip that suffix before
    # padding so the key matches the 5-character form.
    df["county_fips"] = (
        df["county_fips"].astype(str).str.replace(r"\.0+$", "", regex=True).str.zfill(5)
    )
    df = ensure_cols_and_key(df)
    need_cols = ["county_fips", "iso", "seg_key"] + burden_cols + SEG_COLS
    df = df[[c for c in need_cols if c in df.columns]].copy()

    # Merge UNITS (household count weight)
    merged = df.merge(
        units_ref[["county_fips", "seg_key", "units"]],
        on=["county_fips", "seg_key"],
        how="left",
        validate="m:1",
    )

    # Convert to numeric; rows without a matching weight get 0 and are dropped below.
    merged["units"] = pd.to_numeric(merged["units"], errors="coerce").fillna(0.0)
    for c in burden_cols:
        merged[c] = pd.to_numeric(merged[c], errors="coerce")

    # ===== Exception and Missing Value Exclusion Rules =====
    # 1) Units must be > 0
    mask_units = merged["units"] > 0
    # 2) At least one scenario has a valid percentage (avoid counting if both are NaN)
    mask_has_burden = merged[burden_cols[0]].notna() | merged[burden_cols[1]].notna()
    # 3) Percentage > 100% in any scenario is considered anomalous, exclude entire row
    mask_anomaly = (merged[burden_cols[0]] > 100) | (merged[burden_cols[1]] > 100)

    valid = merged[mask_units & mask_has_burden & (~mask_anomaly)].copy()
    has_iso = "iso" in valid.columns

    out_rows = []
    for county, g in valid.groupby("county_fips", sort=False):
        total_units = g["units"].sum()
        if total_units <= 0:
            continue

        # Threshold determination (percentage units). A NaN burden compares as
        # False, so such a row stays in the denominator only.
        shares = {}
        for label, col in (("with_dc", burden_cols[0]), ("no_dc", burden_cols[1])):
            for threshold in (6, 10):
                above = g.loc[g[col] > threshold, "units"].sum()
                shares[f"share_{label}_gt{threshold}"] = float(above / total_units)

        # ISO for this county (take first non-null)
        iso = g["iso"].dropna().astype(str).iloc[0] if has_iso and g["iso"].notna().any() else ""

        out_rows.append({
            "county_fips": county,
            "iso": iso,
            "year": year,
            "total_units_counted": float(total_units),
            **shares,
            # Difference in share (with - no)
            "share_diff_gt6": shares["share_with_dc_gt6"] - shares["share_no_dc_gt6"],
            "share_diff_gt10": shares["share_with_dc_gt10"] - shares["share_no_dc_gt10"],
        })

    # Passing the column list keeps the schema stable when out_rows is empty.
    return pd.DataFrame(out_rows, columns=out_cols)


# ============ Main Process ============
# Runs the whole pipeline: load the LEAD unit weights once, write one share
# table per year, then stack the per-year tables into a master CSV.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) Read LEAD UNITS (once)
units_ref = load_lead_units(DIR_LEAD_2022)

# 2) Process by year
for y in YEARS:
    in_csv = DIR_YEARLY / f"energy_burden_{y}.csv"
    if not in_csv.exists():
        print(f"[Skip] Previous output not found: {in_csv}")
        continue
    dfy = pd.read_csv(in_csv, low_memory=False)
    res = compute_poverty_shares(dfy, units_ref, y)
    if res.empty:
        print(f"[Warning] No valid records for year {y} (data might be filtered out).")
        continue
    out_csv = OUT_DIR / f"energy_poverty_share_{y}.csv"
    res.to_csv(out_csv, index=False, encoding="utf-8-sig")
    print(f"[OK] {y} -> {out_csv}")

# 3) Optional: Merge into one master table
# Note: this picks up every per-year file present in OUT_DIR, including files
# left there by an earlier run.
parts = []
for y in YEARS:
    p = OUT_DIR / f"energy_poverty_share_{y}.csv"
    if p.exists():
        # Read county_fips as text: default dtype inference would parse it as an
        # integer and drop the leading zeros ("01001" -> 1001) in the master table.
        # The encoding matches the utf-8-sig used when the file was written.
        t = pd.read_csv(p, low_memory=False, dtype={"county_fips": str}, encoding="utf-8-sig")
        parts.append(t)
if parts:
    master_csv = OUT_DIR / "energy_poverty_share_2025_2030_all_years.csv"
    big = pd.concat(parts, ignore_index=True)
    big.to_csv(master_csv, index=False, encoding="utf-8-sig")
    print(f"[OK] Merged master table: {master_csv.resolve()}")

