
# main_pipeline.ipynb

Pipeline to:
1) Identify failure years from `year_trends_top_stress.csv` per crop.  
2) Build year-level feature matrices from wide county files.  
3) Rank analog years to `TARGET_YEAR` (2023 by default) by cosine similarity and save the Top-N and Elbow selections.

**Requirements:** `pandas`, `numpy`, `scikit-learn`, `matplotlib`

Set the paths in the **Config** cell and run all cells top-to-bottom.


In [None]:

# ---- Config ----
# Root results folder; the trends folder and the wide CSVs below are joined onto it.
BASE = "PSI Hackathon/Kashish_results"

# Sub-folder of BASE holding <CROP>/year_trends_top_stress.csv
TRENDS_DIR = "stress_analysis_v2"

# Crop name -> wide county-level feature CSV (path relative to BASE).
WIDE_DIR_MAP = dict(
    SOYBEANS="soybeans/soybean_stage_features_wide_plus.csv",
    CORN="corn/corn_stage_features_wide_plus.csv",
    COTTON="cotton/cotton_stage_features_wide_plus.csv",
    PEANUTS="peanuts/peanut_stage_features_wide_plus.csv",
    WHEAT="wheat/wheat_stage_features_wide_plus.csv",
)

# Year whose analogs are ranked, and how many analogs the Top-N table keeps.
TARGET_YEAR, TOP_N = 2023, 12

# Folder that receives every CSV/PNG written by this notebook.
OUTDIR = BASE + "/failure_years_and_analogs"

# Failure thresholds as (Yield_z <=, Stress_z >=) pairs.
Z_HARD = (-1.0, 1.0)
Z_SOFT = (-0.5, 0.5)


In [None]:

import json, os
from pathlib import Path
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

def ensure_dir(p: Path):
    """Create directory ``p`` together with any missing parents.

    Calling it on a directory that already exists is a no-op.
    """
    mkdir_options = {"parents": True, "exist_ok": True}
    p.mkdir(**mkdir_options)

def find_trends_csv(base: Path, trends_dir_name: str, crop: str) -> Path:
    """Return the path of the per-crop trends CSV.

    The file is expected at
    ``<base>/<trends_dir_name>/<crop>/year_trends_top_stress.csv``.

    Raises:
        FileNotFoundError: if that file does not exist.
    """
    candidate = base.joinpath(trends_dir_name, crop, "year_trends_top_stress.csv")
    if candidate.exists():
        return candidate
    raise FileNotFoundError(f"Missing trends CSV for {crop}: {candidate}")

def build_year_features_from_wide(wide_csv: Path) -> pd.DataFrame:
    """Collapse a wide row-level CSV into one mean feature vector per year.

    Rows whose ``Year`` cannot be parsed as a number are dropped. Every
    numeric column except the identifier-like ones (``Year``, ``COUNTY``,
    ``Name``, ``Commodity``, ``Data Item``) is treated as a feature and
    averaged within each year.

    Returns:
        DataFrame indexed by integer ``Year`` (ascending), one column per feature.

    Raises:
        ValueError: if the CSV has no ``Year`` column or no numeric feature columns.
    """
    raw = pd.read_csv(wide_csv)
    if "Year" not in raw.columns:
        raise ValueError(f"'Year' column not found in {wide_csv}")

    # Coerce Year to numeric, discard unparseable rows, then store it as int.
    table = raw.assign(Year=pd.to_numeric(raw["Year"], errors="coerce"))
    table = table[table["Year"].notna()].copy()
    table["Year"] = table["Year"].astype(int)

    non_features = {"Year", "COUNTY", "Name", "Commodity", "Data Item"}
    feat_cols = [
        col
        for col in table.select_dtypes(include="number").columns
        if col not in non_features
    ]
    if not feat_cols:
        raise ValueError(f"No numeric feature columns found in {wide_csv}")

    yearly = table.groupby("Year", as_index=True)[feat_cols].mean()
    yearly = yearly.sort_index()
    yearly.index.name = "Year"
    return yearly

def detect_failures(trends_csv: Path, z_hard=(-1.0, 1.0), z_soft=(-0.5, 0.5)):
    """Flag failure years in a trends CSV using yield/stress z-score thresholds.

    A year is flagged for a threshold pair ``(yield_max, stress_min)`` when
    ``Yield_z <= yield_max`` and ``Stress_z >= stress_min``.

    Args:
        trends_csv: CSV with ``Year``, ``Yield_z`` and ``Stress_z`` columns.
        z_hard: threshold pair for "hard" failures.
        z_soft: threshold pair for "soft" failures.

    Returns:
        ``(soft_years, hard_years, trends_df)`` -- note that soft comes first,
        although ``z_hard`` precedes ``z_soft`` in the signature.

    Raises:
        ValueError: if a required column is missing (the first missing one is named).
    """
    trends = pd.read_csv(trends_csv)

    required = ("Year", "Yield_z", "Stress_z")
    absent = [name for name in required if name not in trends.columns]
    if absent:
        raise ValueError(f"{trends_csv} missing column: {absent[0]}")

    def years_beyond(limits):
        # limits[0] caps Yield_z from above, limits[1] bounds Stress_z from below.
        flagged = (trends["Yield_z"] <= limits[0]) & (trends["Stress_z"] >= limits[1])
        return trends.loc[flagged, "Year"].astype(int).tolist()

    hard_years = years_beyond(z_hard)
    soft_years = years_beyond(z_soft)
    return soft_years, hard_years, trends

def plot_trends(df: pd.DataFrame, crop: str, fail_soft, fail_hard, out_png: Path):
    """Save a line chart of yield and top-stress z-scores over the years.

    Each failure year is marked with a vertical line; hard-failure lines are
    drawn more opaque (alpha 0.35) than soft-failure lines (alpha 0.15). The
    figure is written to ``out_png`` at 200 dpi and then closed.
    """
    fig, ax = plt.subplots(figsize=(9, 4.5))

    ax.plot(df["Year"], df["Yield_z"], label="Yield (z)")
    ax.plot(df["Year"], df["Stress_z"], label="Top-stress (z)", linestyle="--")

    # Soft markers first, then hard ones on top, matching the drawing order.
    for marked_years, opacity in ((fail_soft, 0.15), (fail_hard, 0.35)):
        for yr in marked_years:
            ax.axvline(int(yr), alpha=opacity)

    ax.axhline(0, linewidth=0.8)  # z = 0 reference line
    ax.set_xlabel("Year")
    ax.set_ylabel("z-score")
    ax.set_title(f"{crop}: Yield vs Top Stress (z)")
    ax.legend(loc="upper left")

    fig.tight_layout()
    fig.savefig(out_png, dpi=200)
    plt.close(fig)

def rank_analogs(Y: pd.DataFrame, target_year: int) -> pd.DataFrame:
    """Rank every other year by cosine similarity to ``target_year``.

    Feature columns are standardized across years with ``StandardScaler``
    before the target year's row is compared against all rows.

    Args:
        Y: year-indexed feature matrix (one row per year).
        target_year: year that must be present in ``Y.index``.

    Returns:
        DataFrame with columns ``Year`` and ``cosine_sim``, most similar first,
        with the target year itself removed and a fresh 0..n-1 index.

    Raises:
        ValueError: if ``target_year`` is not in ``Y.index``.
    """
    if target_year not in Y.index:
        raise ValueError(f"{target_year} not in year features index.")

    years = Y.index.to_numpy()
    standardized = StandardScaler().fit_transform(Y.values)

    # Position of the (first) row belonging to the target year.
    target_pos = np.where(years == target_year)[0][0]
    target_row = standardized[target_pos:target_pos + 1]
    similarity = cosine_similarity(target_row, standardized).ravel()

    table = pd.DataFrame({"Year": years, "cosine_sim": similarity})
    table = table.sort_values("cosine_sim", ascending=False)
    others = table[table["Year"] != target_year]
    return others.drop_duplicates(subset=["Year"]).reset_index(drop=True)

def elbow_k(rank: pd.DataFrame, top_n: int) -> int:
    """Pick how many analogs to keep by cutting just before the steepest drop.

    ``rank["cosine_sim"]`` is read in its given order. With at least three
    entries the result is the number of rows preceding the most negative
    consecutive difference. ``top_n`` is consulted only in the fallback for
    fewer than three entries, where ``min(len(rank), top_n)`` is returned.
    """
    sims = rank["cosine_sim"].to_numpy()
    if len(sims) >= 3:
        step_changes = np.diff(sims)
        # argmin -> index of the largest fall; +1 turns it into a row count.
        return int(step_changes.argmin() + 1)
    return min(len(rank), top_n)


In [None]:

# Resolve the configured locations once; the output folder is created up front.
base = Path(BASE)
outdir = Path(OUTDIR)
ensure_dir(outdir)

for crop, rel_path in WIDE_DIR_MAP.items():
    crop_key = crop.lower()  # prefix shared by every file written for this crop
    print(f"\n=== {crop} ===")

    # --- 1) failure years from the per-crop trends table ---
    trends_csv = find_trends_csv(base, TRENDS_DIR, crop)
    soft, hard, trends_df = detect_failures(trends_csv, Z_HARD, Z_SOFT)

    # One row per failure year; a year present in both lists is tagged "hard".
    hard_years = set(hard)
    fail_df = pd.DataFrame({"Year": sorted(hard_years.union(soft))})
    fail_df["tag"] = fail_df["Year"].apply(
        lambda y: "hard" if y in hard_years else "soft"
    )
    fail_csv = outdir / f"{crop_key}_failure_years.csv"
    fail_df.to_csv(fail_csv, index=False)
    print(f"[{crop}] Failures saved -> {fail_csv.name} | hard={hard} | soft={soft}")

    # --- 2) yield vs. stress trend plot ---
    plot_png = outdir / f"{crop_key}_yield_stress_z_trend.png"
    plot_trends(trends_df, crop, soft, hard, plot_png)
    print(f"[{crop}] Trend plot -> {plot_png.name}")

    # --- 3) year-level feature matrix from the wide county file ---
    wide_csv = base / rel_path
    Y = build_year_features_from_wide(wide_csv)
    yf_csv = outdir / f"{crop_key}_year_features.csv"
    Y.to_csv(yf_csv)
    print(f"[{crop}] Year features -> {yf_csv.name} ({Y.shape[0]} years × {Y.shape[1]} feats)")

    # --- 4) analog years: fixed Top-N and elbow-based selections ---
    rank = rank_analogs(Y, TARGET_YEAR)
    k_elbow = elbow_k(rank, TOP_N)

    # head(n) already stops at the end of the table, so no explicit clamp is needed.
    topN = rank.head(TOP_N).copy()
    topE = rank.head(k_elbow).copy()

    out_topN = outdir / f"{crop_key}_analogs_{TARGET_YEAR}_top{len(topN)}.csv"
    out_elb = outdir / f"{crop_key}_analogs_{TARGET_YEAR}_elbow{len(topE)}.csv"
    topN.to_csv(out_topN, index=False)
    topE.to_csv(out_elb, index=False)
    print(f"[{crop}] Saved Top-{len(topN)} -> {out_topN.name} | Elbow({k_elbow}) -> {out_elb.name}")


In [None]:

"""
Post-processing on top of main_pipeline.py outputs:
- Load {crop}_year_features.csv and {crop}_failure_years.csv
- Rank analog years to TARGET_YEAR (Top-N and Elbow)
- Write the per-crop Top-N and Elbow analog tables, optionally dropping failure years (--exclude)
- (Optional, --export_rows) export the matching row-level subsets from a WIDE file

Usage:
python analysis_pipeline.py \
  --base "PSI Hackathon/Kashish_results" \
  --out  "PSI Hackathon/Kashish_results/failure_years_and_analogs" \
  --wide_dir_map '{"SOYBEANS":"soybeans/soybean_stage_features_wide_plus.csv","CORN":"corn/corn_stage_features_wide_plus.csv"}' \
  --target_year 2023 --top_n 12 --exclude soft
"""
import json
import argparse
from pathlib import Path
import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

def ensure_dir(p: Path):
    """Make sure directory ``p`` exists, creating missing parents as needed."""
    if not p.is_dir():
        p.mkdir(parents=True, exist_ok=True)

def load_year_features(base_out: Path, crop: str) -> pd.DataFrame:
    """Load ``<crop>_year_features.csv`` from ``base_out`` as a year-indexed matrix.

    The first CSV column becomes the index. Rows sharing an index value are
    averaged, the index is cast to int, and the result is sorted by year.
    """
    csv_path = base_out / f"{crop.lower()}_year_features.csv"
    features = pd.read_csv(csv_path, index_col=0)
    if features.index.has_duplicates:
        # Collapse repeated years into a single averaged row.
        features = features.groupby(level=0).mean()
    features.index = features.index.astype(int)
    return features.sort_index()

def load_fail_years(base_out: Path, crop: str, exclude: str) -> set[int]:
    """Return the failure years selected by ``exclude`` for one crop.

    Reads ``<crop>_failure_years.csv`` from ``base_out``.

    * ``"none"`` or a missing file -> empty set.
    * ``"any"`` or a file without a ``tag`` column -> every listed year.
    * ``"soft"`` / ``"hard"`` -> years whose lower-cased tag contains that word.
    * any other value -> empty set.
    """
    csv_path = base_out / f"{crop.lower()}_failure_years.csv"
    if exclude == "none" or not csv_path.exists():
        return set()

    table = pd.read_csv(csv_path)

    def as_year_set(values):
        # Unparseable year entries are dropped rather than raising.
        return set(pd.to_numeric(values, errors="coerce").dropna().astype(int))

    all_years = as_year_set(table["Year"])
    if exclude == "any" or "tag" not in table.columns:
        return all_years

    tags = table["tag"].astype(str).str.lower()
    if exclude in ("soft", "hard"):
        return as_year_set(table.loc[tags.str.contains(exclude), "Year"])
    return set()

def rank_analogs(Y: pd.DataFrame, target_year: int) -> pd.DataFrame:
    """Rank every other year by cosine similarity to ``target_year``.

    Feature columns are standardized across years with ``StandardScaler``;
    the target year's standardized row is then compared with all rows.

    Args:
        Y: year-indexed feature matrix (one row per year).
        target_year: year that must be present in ``Y.index``.

    Returns:
        DataFrame with columns ``Year`` and ``cosine_sim``, sorted by
        descending similarity, without the target year, index reset to 0..n-1.

    Raises:
        ValueError: if ``target_year`` is not in ``Y.index``.
    """
    # Explicit raise instead of ``assert``: assertions are stripped under
    # ``python -O``, which would turn a missing year into an opaque IndexError.
    if target_year not in Y.index:
        raise ValueError(f"{target_year} not in year features.")

    scaler = StandardScaler()
    X = scaler.fit_transform(Y.values)
    years = Y.index.to_numpy()
    i = np.where(years == target_year)[0][0]  # row position of the target year
    sims = cosine_similarity(X[i:i + 1], X).ravel()
    rank = (pd.DataFrame({"Year": years, "cosine_sim": sims})
              .sort_values("cosine_sim", ascending=False))
    return rank[rank["Year"] != target_year].drop_duplicates("Year").reset_index(drop=True)

def elbow_k(rank: pd.DataFrame, top_n: int) -> int:
    """Return the elbow cut-off of a similarity ranking.

    The cut is placed right before the largest fall between consecutive
    ``cosine_sim`` values. Rankings with fewer than three rows have no usable
    elbow, so ``min(len(rank), top_n)`` is returned instead; ``top_n`` plays
    no role otherwise.
    """
    similarities = rank["cosine_sim"].to_numpy()
    too_short = len(similarities) < 3
    if too_short:
        return min(len(rank), top_n)

    deltas = np.diff(similarities)
    steepest = np.argmin(deltas)  # position of the most negative step
    return int(steepest + 1)

def build_year_matrix_from_wide(wide_csv: Path) -> pd.DataFrame:
    """Aggregate a wide row-level CSV into one mean feature vector per year.

    Rows whose ``Year`` cannot be parsed as a number are dropped. Every
    numeric column other than the identifier-like ones (``Year``, ``COUNTY``,
    ``Name``, ``Commodity``, ``Data Item``) is averaged within each year.

    Returns:
        DataFrame indexed by integer ``Year`` (ascending), one column per feature.

    Raises:
        ValueError: if the CSV has no ``Year`` column, or contains no numeric
            feature columns (previously this silently produced an empty matrix).
    """
    df = pd.read_csv(wide_csv)
    if "Year" not in df.columns:
        raise ValueError(f"'Year' column not found in {wide_csv}")
    df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
    df = df.dropna(subset=["Year"]).copy()
    df["Year"] = df["Year"].astype(int)

    id_like = {"Year", "COUNTY", "Name", "Commodity", "Data Item"}
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    feat_cols = [c for c in num_cols if c not in id_like]
    if not feat_cols:
        # Fail loudly: an empty feature matrix cannot be ranked downstream.
        raise ValueError(f"No numeric feature columns found in {wide_csv}")

    Y = df.groupby("Year", as_index=True)[feat_cols].mean().sort_index()
    Y.index.name = "Year"
    return Y

def main(argv=None):
    """Command-line entry point: rank analog years per crop and export tables.

    For every crop in ``--wide_dir_map`` this loads the year features and
    failure years from ``--out``, ranks analogs to ``--target_year``, removes
    the failure years selected by ``--exclude`` and writes the Top-N and Elbow
    tables back into ``--out``. With ``--export_rows`` the rows of the WIDE
    file belonging to those years are exported as well.

    Args:
        argv: argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``. Passing a list allows calling ``main`` from
            a notebook or a test.

    Invalid ``--wide_dir_map`` input is reported through ``ap.error`` (exit
    status 2) before any directory is created.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--base", required=True, help="Base dir (e.g., PSI Hackathon/Kashish_results)")
    ap.add_argument("--out", required=True, help="Folder with {crop}_year_features.csv and failures")
    ap.add_argument("--wide_dir_map", required=True,
                    help='JSON dict crop->relative WIDE csv path (from --base)')
    ap.add_argument("--target_year", type=int, default=2023)
    ap.add_argument("--top_n", type=int, default=12)
    ap.add_argument("--exclude", choices=["none","soft","hard","any"], default="none",
                    help="Exclude which failure years from analog ranking")
    ap.add_argument("--export_rows", action="store_true",
                    help="Also export row-level subsets for Top-N and Elbow")
    args = ap.parse_args(argv)

    # Validate the crop map first so a bad argument has no filesystem side effects.
    try:
        wide_map = json.loads(args.wide_dir_map)
    except json.JSONDecodeError as err:
        ap.error(f"--wide_dir_map is not valid JSON: {err}")
    if not isinstance(wide_map, dict):
        ap.error("--wide_dir_map must be a JSON object mapping crop -> relative WIDE csv path")

    base = Path(args.base)
    out = Path(args.out)
    ensure_dir(out)

    for crop, rel_path in wide_map.items():
        print(f"\n=== {crop} ===")
        # 1) Load year features and failures
        Y = load_year_features(out, crop)
        excl = load_fail_years(out, crop, args.exclude)
        # 2) Rank analogs, dropping excluded failure years before choosing cut-offs
        rank = rank_analogs(Y, args.target_year)
        if excl:
            rank = rank[~rank["Year"].isin(excl)].reset_index(drop=True)
        k = elbow_k(rank, args.top_n)
        topN = rank.head(min(args.top_n, len(rank))).copy()
        topE = rank.head(min(k, len(rank))).copy()
        topN.to_csv(out / f"{crop.lower()}_analogs_{args.target_year}_top{len(topN)}_{args.exclude}.csv", index=False)
        topE.to_csv(out / f"{crop.lower()}_analogs_{args.target_year}_elbow{len(topE)}_{args.exclude}.csv", index=False)
        print(f"[{crop}] Top-{len(topN)} saved | Elbow({k}) saved")

        # 3) Optional: export row-level subsets from WIDE for training
        if args.export_rows:
            wide_csv = base / rel_path
            df = pd.read_csv(wide_csv)
            # Nullable Int64 keeps rows with an unparseable Year as <NA>; they match no analog year.
            df["Year"] = pd.to_numeric(df["Year"], errors="coerce").astype("Int64")
            yrs_top = set(topN["Year"].tolist())
            yrs_elb = set(topE["Year"].tolist())
            rows_top = df[df["Year"].isin(yrs_top)].copy()
            rows_elb = df[df["Year"].isin(yrs_elb)].copy()
            rows_top.to_csv(out / f"{crop.lower()}_rows_top{len(topN)}_{args.exclude}.csv", index=False)
            rows_elb.to_csv(out / f"{crop.lower()}_rows_elbow{len(topE)}_{args.exclude}.csv", index=False)
            print(f"[{crop}] Exported rows: topN={rows_top.shape} elbow={rows_elb.shape}")

if __name__ == "__main__":
    main()
