In [None]:
#!/usr/bin/env python3
# store_clean_encode.py
# Run from terminal: python store_clean_encode.py --csv store_data.csv
# Run in VS Code/Jupyter: just run the cell (it will ignore injected args)

import argparse
import re
from pathlib import Path

# --- Robust imports (auto-install pgeocode if missing; sklearn is optional) ---
try:
    import pgeocode
except Exception:
    import sys, subprocess
    subprocess.run([sys.executable, "-m", "pip", "install", "pgeocode", "-q"], check=False)
    import pgeocode

try:
    from sklearn.preprocessing import LabelEncoder  # optional
except Exception:
    LabelEncoder = None

import pandas as pd
import numpy as np

# ---------- Config ----------
# Country code handed to pgeocode.Nominatim; every postal-code and city
# lookup in this file goes through the resulting geocoder.
COUNTRY_CODE = "US"
# Single shared geocoder, created once when the module is loaded and used by
# query_zip() and query_city().
# NOTE(review): constructing it presumably loads the pgeocode dataset for the
# country (possibly over the network on first use) — confirm.
nomi = pgeocode.Nominatim(COUNTRY_CODE)

# ---------- Helpers ----------
def safe_read_csv_local(path: Path) -> pd.DataFrame:
    """Load a CSV file, retrying with Latin-1 if the default decoding fails.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed file contents as a DataFrame.
    """
    fallback_encoding = "latin1"
    try:
        frame = pd.read_csv(path)
    except UnicodeDecodeError:
        # The default decoding rejected the bytes; Latin-1 accepts any byte
        # sequence, so the second attempt cannot fail for the same reason.
        frame = pd.read_csv(path, encoding=fallback_encoding)
    return frame

def clean_postal(pc):
    """Normalise a raw postal code to a 5-digit ZIP string.

    Args:
        pc: Raw postal code; may be a string, a number, or a missing value.

    Returns:
        The first run of five digits in the value (``"12345-6789"`` ->
        ``"12345"``); otherwise a trailing run of one to four digits
        left-padded with zeros (``"501"`` -> ``"00501"``); ``None`` when the
        value is missing or holds no usable digits.
    """
    if pd.isna(pc):
        return None
    text = str(pc).strip()
    # A ZIP column that contains blanks is parsed as float, so 02134 arrives
    # as 2134.0. Drop the ".0" tail first; otherwise the short-code rule below
    # would match only the final "0" and return "00000".
    text = re.sub(r"\.0+$", "", text)
    five_digits = re.search(r"\d{5}", text)
    if five_digits:
        return five_digits.group(0)
    # Shorter codes usually lost their leading zeros in a numeric round trip.
    short_digits = re.search(r"\d{1,4}$", text)
    if short_digits:
        return short_digits.group(0).zfill(5)
    return None

def norm_state(s):
    """Reduce a raw state value to a two-letter upper-case code.

    Returns ``None`` for missing values, blank text, the literal "NAN", and
    text that does not begin with two A-Z letters once upper-cased. Longer
    text is cut down to its first two letters.
    """
    if pd.isna(s):
        return None
    text = str(s).strip().upper()
    if text == "" or text == "NAN":
        return None
    # Only the leading pair of letters is kept, whatever follows.
    prefix = re.match(r"([A-Z]{2})", text)
    if prefix is None:
        return None
    return prefix.group(1)

def norm_city(c):
    """Trim a raw city value; return ``None`` for missing, blank or "nan" text."""
    if pd.isna(c):
        return None
    text = str(c).strip()
    if not text or text.lower() == "nan":
        return None
    return text

def safe_attr(obj, key):
    """Fetch ``key`` from a Series/dict entry or from an object attribute.

    Series and dict inputs are read with ``.get``; anything else is read with
    ``getattr``. A ``None`` input, a missing key/attribute, or any exception
    raised during the lookup all yield ``None``.
    """
    if obj is None:
        return None
    try:
        mapping_like = isinstance(obj, (pd.Series, dict))
        return obj.get(key, None) if mapping_like else getattr(obj, key, None)
    except Exception:
        return None

# pgeocode caches (avoid repeated lookups)
# Module-level memo tables filled by query_zip() and query_city(): each
# distinct key is looked up once. A lookup that raised is stored as None, so
# it is not retried for that key.
ZIP_CACHE = {}
CITY_CACHE = {}

def query_zip(zip5):
    """Return the geocoder record for ``zip5``, memoised in ``ZIP_CACHE``.

    A falsy ``zip5`` returns ``None`` without touching the cache. If the
    geocoder call raises, ``None`` is cached and returned for that ZIP.
    """
    if not zip5:
        return None
    try:
        return ZIP_CACHE[zip5]
    except KeyError:
        pass  # first time this ZIP is seen; fall through to the lookup
    try:
        record = nomi.query_postal_code(zip5)
    except Exception:
        record = None
    ZIP_CACHE[zip5] = record
    return record

def query_city(city):
    """Return the geocoder matches for ``city``, memoised in ``CITY_CACHE``.

    A falsy ``city`` returns ``None`` without touching the cache. If the
    geocoder call raises, ``None`` is cached and returned for that city.
    """
    if not city:
        return None
    try:
        return CITY_CACHE[city]
    except KeyError:
        pass  # first time this city is seen; fall through to the lookup
    try:
        matches = nomi.query_location(city)
    except Exception:
        matches = None
    CITY_CACHE[city] = matches
    return matches

def dominant_map(frame, key, val, min_share=0.90):
    """Map each ``key`` value to its dominant ``val`` value.

    Rows missing either column are ignored. Within each key the most frequent
    value is kept only when its share of that key's rows is at least
    ``min_share``.

    Args:
        frame: DataFrame containing both columns.
        key: Name of the column to group by.
        val: Name of the column whose dominant value is wanted.
        min_share: Minimum fraction (0-1) the top value must reach.

    Returns:
        A dict ``{key value: dominant val value}``.
    """
    complete_rows = frame.dropna(subset=[key, val])
    shares = complete_rows.groupby(key)[val].value_counts(normalize=True)
    share_table = shares.rename("share").reset_index()

    # Highest share first, then keep the leading row of every key.
    ranked = share_table.sort_values(["share"], ascending=False)
    best_per_key = ranked.groupby(key).head(1)

    dominant = best_per_key[best_per_key["share"] >= min_share]
    return dominant.set_index(key)[val].to_dict()

def build_final(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the store location columns and return the encoded output table.

    Pipeline:
      1. Treat common "not available" spellings as missing, create any of
         STORE_NUMBER / CITY / STATE / POSTAL_CODE that are absent, and
         normalise each column.
      2. Fill gaps row by row: ZIP -> city/state and city -> ZIP/state via
         the pgeocode-backed helpers, then dataset-derived dominant mappings
         for anything still missing.
      3. Upper-case cities, trim states to two upper-case letters, re-clean
         ZIPs, and only then label rows that have no location information at
         all with the ``Unknown`` / ``UNK`` placeholders.
      4. Integer-encode STORE_NUMBER and one-hot encode STATE and CITY.

    Args:
        df: Raw store table. It is not modified; work happens on a copy.

    Returns:
        A DataFrame with STORE_NUMBER, STORE_NUMBER_ENC and STATE followed by
        the ``state_*`` and ``city_*`` indicator columns.
    """
    # Placeholders shared by the "no location at all" marking and the one-hot
    # fill, so every unknown row lands in the same indicator column.
    unknown_state = "UNK"
    unknown_city = "Unknown"

    # Normalize inputs
    na_values = ["NA", "N/A", "nan", "NaN", ""]
    df = df.replace(na_values, np.nan).copy()

    for col in ["STORE_NUMBER", "CITY", "STATE", "POSTAL_CODE"]:
        if col not in df.columns:
            df[col] = np.nan

    df["STORE_NUMBER"] = df["STORE_NUMBER"].astype(str)
    df["CITY"] = df["CITY"].map(norm_city)
    df["STATE"] = df["STATE"].map(norm_state)
    df["POSTAL_CODE"] = df["POSTAL_CODE"].map(clean_postal)

    # Dataset-driven fallbacks (only used if pgeocode fails)
    zip2state_ds = dominant_map(df, "POSTAL_CODE", "STATE", min_share=0.90)
    zip2city_ds = dominant_map(df, "POSTAL_CODE", "CITY", min_share=0.90)
    city2state_ds = dominant_map(df, "CITY", "STATE", min_share=0.90)

    def fill_row(row):
        """Fill the missing CITY / STATE / POSTAL_CODE values of one row."""
        city = norm_city(row.get("CITY"))
        state = norm_state(row.get("STATE"))
        zip5 = clean_postal(row.get("POSTAL_CODE"))

        # ZIP -> CITY/STATE via pgeocode (existing values are never overwritten)
        if zip5:
            z = query_zip(zip5)
            place = norm_city(safe_attr(z, "place_name"))
            stcode = norm_state(safe_attr(z, "state_code"))
            if city is None and place:
                city = place
            if state is None and stcode:
                state = stcode
            row["POSTAL_CODE"] = zip5  # keep normalized

        # CITY -> ZIP/STATE via pgeocode
        if city:
            cand = query_city(city)
            if isinstance(cand, pd.DataFrame) and not cand.empty:
                # A known state narrows same-named cities to the right one.
                if "state_code" in cand.columns and state:
                    st_series = cand["state_code"].astype(str).str.upper().fillna("")
                    cand = cand[st_series == state]
                # Prefer the most populous match when that column is present.
                if "population" in cand.columns:
                    cand = cand.sort_values("population", ascending=False)
                # First candidate carrying a ZIP or a state code wins.
                chosen = None
                for _, r in cand.iterrows():
                    pc = r.get("postal_code", None)
                    st = r.get("state_code", None)
                    if pd.notnull(pc) or pd.notnull(st):
                        chosen = r
                        break
                if chosen is not None:
                    if pd.isna(row.get("POSTAL_CODE")):
                        pc = chosen.get("postal_code", None)
                        if pd.notnull(pc):
                            row["POSTAL_CODE"] = clean_postal(pc)
                    if state is None and pd.notnull(chosen.get("state_code", None)):
                        state = norm_state(chosen.get("state_code", None))

        # Dataset fallbacks (only when still missing)
        if state is None and zip5 and zip5 in zip2state_ds:
            state = norm_state(zip2state_ds[zip5])
        if city is None and zip5 and zip5 in zip2city_ds:
            city = norm_city(zip2city_ds[zip5])
        if state is None and city and city in city2state_ds:
            state = norm_state(city2state_ds[city])

        row["CITY"] = city if city is not None else row.get("CITY")
        row["STATE"] = state if state is not None else row.get("STATE")
        return row

    df = df.apply(fill_row, axis=1)

    # Final normalization. This runs BEFORE the placeholders are written:
    # upper-casing / two-letter truncation would otherwise turn "UNK" into
    # "UN" and "Unknown" into "UNKNOWN", splitting unknown rows across two
    # different one-hot columns.
    df["CITY"] = df["CITY"].map(lambda x: str(x).upper() if pd.notnull(x) else x)
    df["STATE"] = df["STATE"].map(lambda x: str(x).upper()[:2] if pd.notnull(x) else x)
    df["POSTAL_CODE"] = df["POSTAL_CODE"].map(lambda x: clean_postal(x) if pd.notnull(x) else x)

    # Mark completely empty location rows as Unknown/UNK
    mask_all_null = df[["CITY", "STATE", "POSTAL_CODE"]].isnull().all(axis=1)
    df.loc[mask_all_null, "STATE"] = unknown_state
    df.loc[mask_all_null, "CITY"] = unknown_city

    # Encode + OHE (original output schema)
    store_ids = df["STORE_NUMBER"].astype(str)
    codes = None
    if LabelEncoder is not None:
        try:
            codes = LabelEncoder().fit_transform(store_ids)
        except Exception:
            codes = None
    if codes is None:
        # sort=True numbers the stores in sorted order, matching LabelEncoder,
        # so the codes do not depend on whether scikit-learn is installed.
        codes = pd.factorize(store_ids, sort=True)[0]
    df["STORE_NUMBER_ENC"] = codes

    state_ohe = pd.get_dummies(df["STATE"].fillna(unknown_state), prefix="state").astype(int)
    city_ohe = pd.get_dummies(df["CITY"].fillna(unknown_city), prefix="city").astype(int)

    final_df = pd.concat([
        df[["STORE_NUMBER", "STORE_NUMBER_ENC", "STATE"]],
        state_ohe,
        city_ohe,
    ], axis=1)

    return final_df

def main():
    """Command-line entry point: read the input CSV, clean and encode it, write the result.

    The input name comes from ``--csv`` and is resolved against the script's
    folder (or the current working directory when ``__file__`` is not
    defined). The output is written next to it as
    ``store_data_cleaned_and_encoded.csv``.

    Raises:
        FileNotFoundError: If the resolved input path does not exist.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--csv",
        default="store_data.csv",
        help="Input CSV filename located in the same folder as this script",
    )

    # parse_known_args tolerates the extra arguments VS Code/Jupyter inject
    # (e.g. --f=...) instead of exiting with a usage error.
    options, _extra = cli.parse_known_args()

    # Anchor paths at the script location; notebooks have no __file__, so use CWD.
    if "__file__" in globals():
        base_dir = Path(__file__).resolve().parent
    else:
        base_dir = Path.cwd()

    in_path = (base_dir / options.csv).resolve()
    if not in_path.exists():
        raise FileNotFoundError(f"Input file not found at: {in_path}")

    print(f"Reading: {in_path}")
    final_df = build_final(safe_read_csv_local(in_path))

    out_path = base_dir / "store_data_cleaned_and_encoded.csv"
    final_df.to_csv(out_path, index=False)
    print(f"Written {out_path} with shape {final_df.shape}")
    print("✅ store_data_cleaning: Completed successfully.")

# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()