In [None]:
import pandas as pd
import re
from pathlib import Path

# ======= To be configured =======
# Input table without a header row. Files ending in .xlsx/.xls are read as
# Excel; any other extension is read as CSV.
ESIID_FILE = "./load and costs/ERCOT/oncor.xlsx"

# Excel sheet to read; ignored for CSV.
SHEET_NAME = 0

# [Critical] Position of the ZIP column, counted from 1.
ZIP_COL_INDEX_1BASED = 11

# [Fallback] Position of the City column, counted from 1 (modify if incorrect).
CITY_COL_INDEX_1BASED = 10

# Set a filename to export results if needed; None means no export.
OUT_CSV = None
# ======================

def read_noheader(path, sheet=None):
    """Load a header-less table from Excel or CSV with every cell kept as text.

    Args:
        path: Location of the file. A ``.xlsx`` or ``.xls`` suffix
            (case-insensitive) is read with ``pd.read_excel``; anything else
            is read with ``pd.read_csv``.
        sheet: Excel sheet name or 0-based position; ignored for CSV.
            ``None`` selects the first sheet.

    Returns:
        A DataFrame whose columns are renamed by position to ``c0``, ``c1``, ...
    """
    p = Path(path)
    if p.suffix.lower() in {".xlsx", ".xls"}:
        # read_excel(sheet_name=None) returns a dict of ALL sheets, which has
        # no .shape and would crash below; fall back to the first sheet.
        sheet_name = 0 if sheet is None else sheet
        df = pd.read_excel(p, sheet_name=sheet_name, header=None, dtype=str)
    else:
        df = pd.read_csv(p, header=None, dtype=str)
    # Positional names keep later column references independent of file content.
    df.columns = [f"c{i}" for i in range(df.shape[1])]
    return df

def detect_esiid_col(df):
    """Pick the column that looks most like a list of ESIIDs.

    A cell counts as ESIID-like when, once all whitespace is removed, it
    consists of 15 to 25 digits and nothing else. Each column is scored by the
    fraction of its cells that qualify; the best score wins and, on a tie, the
    left-most column is kept.

    Returns:
        The label of the winning column.

    Raises:
        ValueError: If no column reaches a score of 0.5.
    """
    scores = {}
    for name in df.columns:
        compact = df[name].astype(str).str.replace(r"\s+", "", regex=True)
        scores[name] = compact.str.fullmatch(r"\d{15,25}").mean()

    # Scores that are not strictly positive (zero, or NaN) are never candidates.
    contenders = {name: score for name, score in scores.items() if score > 0}
    # max() keeps the first maximal key, i.e. the left-most column on a tie.
    winner = max(contenders, key=contenders.get, default=None)
    if contenders.get(winner, 0) < 0.5:
        raise ValueError("No plausible ESIID column found (>=50% are 15-25 digit numbers).")
    return winner

def mask_contains(df, patterns):
    """Flag rows in which any cell contains any of the given regex patterns.

    Every column is converted to text and lower-cased before searching, so
    patterns should be written in lower case.

    Args:
        df: Table to search; all columns are scanned.
        patterns: Regular expressions handed to ``Series.str.contains``.

    Returns:
        Boolean Series aligned to ``df.index``; True where at least one cell of
        the row matched at least one pattern.
    """
    hits = pd.Series(False, index=df.index)
    for col_name in df.columns:
        lowered = df[col_name].astype(str).str.lower()
        for pattern in patterns:
            hits = hits | lowered.str.contains(pattern)
    return hits

def norm_token(series):
    """Reduce each value to its lower-case letters and digits only.

    Missing values become the empty string; every other character (spaces,
    hyphens, punctuation, ...) is removed outright,
    e.g. 'Non-Residential ' -> 'nonresidential'.
    """
    filled = series.fillna("")
    lowered = filled.str.lower()
    return lowered.str.replace(r"[^a-z0-9]+", "", regex=True)

def mask_equals_any(df, targets):
    """Flag rows in which any cell equals one of the targets after normalization.

    Both the targets and the cells are compared in normalized form (lower
    case, letters and digits only), so 'Residential ' matches 'residential'
    but 'Non-Residential' does not.

    Args:
        df: Table to search; all columns are scanned.
        targets: Iterable of strings to look for.

    Returns:
        Boolean Series aligned to ``df.index``.
    """
    wanted = set()
    for raw in targets:
        wanted.add(re.sub(r"[^a-z0-9]+", "", raw.lower()))

    found = pd.Series(False, index=df.index)
    for col_name in df.columns:
        found = found | norm_token(df[col_name]).isin(wanted)
    return found

# ---- Main Process ----
# Flat, notebook-style pipeline: load the table, flag residential/active rows,
# assign each row a zone, then count unique ESIIDs per zone.
df = read_noheader(ESIID_FILE, SHEET_NAME)

# Fail early on a bad column setting: a 1-based index of 0 would otherwise
# wrap around to the LAST column via df.columns[-1] and go unnoticed.
for _name, _pos in (("ZIP_COL_INDEX_1BASED", ZIP_COL_INDEX_1BASED),
                    ("CITY_COL_INDEX_1BASED", CITY_COL_INDEX_1BASED)):
    if not 1 <= _pos <= df.shape[1]:
        raise ValueError(f"{_name}={_pos} is out of range; the file has {df.shape[1]} columns.")

esi_col = detect_esiid_col(df)
is_res = mask_equals_any(df, {"residential"})
# A bare \b treats "-" as a word boundary, so "de-energized" / "non-active"
# would be flagged as active. Require start-of-cell or a character that is
# neither a word character nor a hyphen in front of the keyword instead.
# NOTE(review): assumes hyphenated negations must not count as active --
# confirm against the status values actually present in the file.
is_act = mask_contains(df, [r"(?:^|[^\w-])active\b", r"(?:^|[^\w-])energized\b"])
if not is_act.any():  # Assume all active if no status text is found anywhere
    is_act = pd.Series(True, index=df.index)

# ZIP (Column index from config) -> Extract first 5 digits
zip_col = df.columns[ZIP_COL_INDEX_1BASED - 1]
zip5 = df[zip_col].astype(str).str.extract(r"(\d{5})")[0]

# City Fallback (Column index from config): upper-case, letters and spaces only
city_col = df.columns[CITY_COL_INDEX_1BASED - 1]
city_raw = df[city_col].astype(str).str.upper().str.replace(r"[^A-Z\s]", "", regex=True).str.strip()

# List of West Texas cities (add more as needed)
WEST_CITY_SET = {
    "MIDLAND", "ODESSA", "ANDREWS", "MONAHANS", "KERMIT", "PECOS", "STANTON",
    "BIG SPRING", "WINK", "FORT STOCKTON", "CRANE", "GARDENDALE", "GOLDSMITH",
    "PENWELL", "ECTOR", "REAGAN", "MARTIN", "HOWARD", "WARD", "WINKLER",
}

# Zone classification: 797xx -> WEST; else if city matches -> WEST; others NORTH
is_west_city = city_raw.isin(WEST_CITY_SET)
zone = pd.Series("NORTH", index=df.index)
zone[zip5.str.startswith("797", na=False)] = "WEST"
zone[is_west_city] = "WEST"

# astype(str) turns a missing cell into the literal text "nan", which dropna()
# cannot see; mask missing and blank IDs back to NA so they are really dropped
# instead of being counted as one extra ESIID.
esiid_clean = df[esi_col].astype(str).str.replace(r"\s+", "", regex=True)
esiid_clean = esiid_clean.where(df[esi_col].notna() & (esiid_clean != ""))

# Count unique ESIIDs for Residential & Active & zone in {NORTH, WEST} only
sub = pd.DataFrame({
    "esiid": esiid_clean,
    "zone": zone, "is_res": is_res, "is_act": is_act
})
sub = sub[sub["is_res"] & sub["is_act"] & sub["zone"].isin(["NORTH", "WEST"])].copy()
sub = sub.dropna(subset=["esiid"]).drop_duplicates(subset=["esiid"])

counts = sub.groupby("zone")["esiid"].nunique().rename("esiid_count")
total = int(counts.sum())
share = (counts / total).rename("share")
# reindex guarantees both zones appear; a zone with no rows shows as 0.
result = pd.concat([counts, share], axis=1).reindex(["NORTH", "WEST"]).fillna(0)

print(result)
print(f"\nTotal residential active ESIIDs counted: {total}")

# Honor the OUT_CSV setting, which was previously configured but never used.
if OUT_CSV:
    result.to_csv(OUT_CSV)
    print(f"[Info] Results written to {OUT_CSV}")

# Every row starts out as NORTH, so testing zone membership can never flag
# anything. Report instead the rows that stayed NORTH only by default, i.e.
# with no parsable ZIP and no match in the West city list.
unknown = int((zip5.isna() & ~is_west_city).sum())
if unknown:
    print(f"[Info] {unknown} rows had no 5-digit ZIP and no West city match; they defaulted to NORTH.")
