In [10]:
import os
import json
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd


In [11]:
def parse_month_to_ym(m):
    """Normalise a month-like value to a ``'YYYY-MM'`` string.

    The value is stringified and passed to ``pd.to_datetime`` with
    ``errors="coerce"``. Missing input, or anything that does not parse to a
    timestamp, yields ``None``.
    """
    stamp = pd.NaT if pd.isna(m) else pd.to_datetime(str(m), errors="coerce")
    if pd.isna(stamp):
        return None
    return "{:04d}-{:02d}".format(stamp.year, stamp.month)


def run_step1_audit(features_path: str, out_dir: str = ".", preview_rows: int = 15) -> dict:
    """Audit a features CSV and write the step-1 summary artifacts.

    Reads the CSV, summarises the month distribution, the missing and
    negative values of the numeric columns, a few totals and a row preview,
    then writes four CSV files plus ``step1_validation_report.json`` into
    ``out_dir`` and prints a short console summary.

    Numeric columns are every ``trs_*`` / ``trf_*`` column plus
    ``income_total``, ``spending_total``, ``avg_monthly_balance_KZT`` and
    ``age`` when present. Negative counts and totals are computed on a copy
    coerced with ``pd.to_numeric(errors="coerce")`` so that a stray text cell
    in one of those columns does not abort the audit; missing-value counts
    are still taken from the raw columns.

    Args:
        features_path: Path of the input CSV.
        out_dir: Directory for the artifacts; created if it does not exist.
        preview_rows: Number of leading rows written to ``step1_preview.csv``.

    Returns:
        The report dictionary that is also dumped to the JSON file.

    Raises:
        AssertionError: If ``features_path`` does not exist.
    """
    features_path = Path(features_path)
    out_dir = Path(out_dir)
    assert features_path.exists(), f"File not found: {features_path}"
    out_dir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(features_path)

    # Identify numeric columns: transactions/transfers + common totals
    known_totals = ("income_total", "spending_total", "avg_monthly_balance_KZT", "age")
    num_cols = sorted(
        {c for c in df.columns if c.startswith(("trs_", "trf_"))}
        | {c for c in known_totals if c in df.columns}
    )

    # Numeric view used for comparisons and sums. Columns that are already
    # numeric come through unchanged; unparseable cells become NaN instead
    # of raising on `< 0` or being concatenated by `.sum()`.
    numeric = df[num_cols].copy()
    for c in num_cols:
        numeric[c] = pd.to_numeric(numeric[c], errors="coerce")

    # Month distribution
    if "month" in df.columns:
        month_parsed = df["month"].apply(parse_month_to_ym)
        month_counts = (
            month_parsed.value_counts(dropna=False)
            .rename_axis("month")
            .reset_index(name="rows")
            .sort_values("month")
            .reset_index(drop=True)
        )
    else:
        month_counts = pd.DataFrame({"month": ["<absent>"], "rows": [len(df)]})

    # Basic stats
    n_rows = len(df)
    n_clients = df["client_code"].nunique() if "client_code" in df.columns else None
    n_cities = df["city"].nunique() if "city" in df.columns else None

    # Missing values in numeric columns (raw cells, before coercion)
    nan_summary = (
        df[num_cols].isna().sum().sort_values(ascending=False)
        .rename_axis("column").reset_index(name="n_missing")
    )

    # Negative values in numeric columns (should be >=0 for spends/income here)
    neg_summary = (
        (numeric < 0).sum().sort_values(ascending=False)
        .rename_axis("column").reset_index(name="n_negative")
    )

    # Totals sanity (only if columns exist)
    totals = {
        "sum_spending_total": float(numeric["spending_total"].sum()) if "spending_total" in df.columns else None,
        "sum_income_total": float(numeric["income_total"].sum()) if "income_total" in df.columns else None,
        "avg_balance_mean": float(numeric["avg_monthly_balance_KZT"].mean()) if "avg_monthly_balance_KZT" in df.columns else None,
    }

    # Preview
    preview_cols = [c for c in [
        "client_code", "name", "status", "city", "age",
        "month", "avg_monthly_balance_KZT", "income_total", "spending_total", "product"
    ] if c in df.columns]
    preview = df[preview_cols].head(preview_rows)

    # ---- Save artifacts
    preview.to_csv(out_dir / "step1_preview.csv", index=False)
    month_counts.to_csv(out_dir / "step1_month_distribution.csv", index=False)
    nan_summary.to_csv(out_dir / "step1_missing_numeric.csv", index=False)
    neg_summary.to_csv(out_dir / "step1_negative_numeric.csv", index=False)

    # A missing month key is normalised to None so it is written as JSON
    # `null` rather than the non-standard `NaN` token.
    months_detected = [
        {"month": None if pd.isna(rec["month"]) else rec["month"], "rows": int(rec["rows"])}
        for rec in month_counts.to_dict(orient="records")
    ]

    report = {
        "shape": [int(df.shape[0]), int(df.shape[1])],
        "n_unique_clients": int(n_clients) if n_clients is not None else None,
        "n_unique_cities": int(n_cities) if n_cities is not None else None,
        "months_detected": months_detected,
        "nan_top30": nan_summary.head(30).to_dict(orient="records"),
        "neg_top30": neg_summary.head(30).to_dict(orient="records"),
        "totals": totals,
        "notes": [
            "Ignore 'product' for labels — it's current ownership, not the optimal product.",
            "Assume KZT for amounts; currency normalization can be added later if needed."
        ],
    }
    with open(out_dir / "step1_validation_report.json", "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    # ---- Console summary
    print("=== STEP 1: Data Audit Summary ===")
    print(f"Rows: {n_rows}, Columns: {df.shape[1]}")
    if n_clients is not None:
        print(f"Unique clients: {n_clients}")
    if n_cities is not None:
        print(f"Unique cities: {n_cities}")
    if "month" in df.columns:
        print("Months detected:", ", ".join(month_counts['month'].astype(str).tolist()))
    print("Saved:")
    print(" - step1_preview.csv")
    print(" - step1_month_distribution.csv")
    print(" - step1_missing_numeric.csv")
    print(" - step1_negative_numeric.csv")
    print(" - step1_validation_report.json")

    return report


if __name__ == "__main__":
    # Script entry point for the step-1 audit.
    # Intended invocation: python step1_audit.py
    # NOTE: the paths below are hard-coded and command-line arguments are
    # not read (``sys`` is imported but not used in this block); edit
    # ``features`` / ``out`` to audit a different file or output directory.
    import sys

    features = "features.csv"
    out = '.'
    run_step1_audit(features, out)


=== STEP 1: Data Audit Summary ===
Rows: 178, Columns: 51
Unique clients: 59
Unique cities: 9
Months detected: 2025-06, 2025-07, 2025-08
Saved:
 - step1_preview.csv
 - step1_month_distribution.csv
 - step1_missing_numeric.csv
 - step1_negative_numeric.csv
 - step1_validation_report.json


In [12]:
import os
import json
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd


def parse_month_to_ym(m):
    """Return the 'YYYY-MM' key of a month-like value, or None if it is unknown.

    Used to count distinct months: the input is stringified, parsed with
    ``pd.to_datetime(..., errors="coerce")`` and reduced to year and month.
    """
    if not pd.isna(m):
        parsed = pd.to_datetime(str(m), errors="coerce")
        if not pd.isna(parsed):
            return "%04d-%02d" % (parsed.year, parsed.month)
    return None


def run_step2_aggregate(features_path: str, out_dir: str = ".",
                        aggregated_filename: str = "aggregated_clients.csv",
                        schema_filename: str = "aggregated_feature_columns.json") -> pd.DataFrame:
    """
    Reads features.csv with monthly rows and returns a per-client aggregated dataframe.

    Flow columns (``trs_*``, ``trf_*``, income/spending totals, balance) are
    coerced to numeric with NaN replaced by 0; ``age`` is coerced but not
    filled. Per client the function keeps the first non-null metadata
    values, counts distinct parsed months, averages the balance, sums the
    flows and derives turnover, spend-cluster, share and top-3 columns.

    Category names are obtained by stripping only the leading ``trs_``
    prefix, so a column such as ``trs_a_trs_b`` yields category ``a_trs_b``.

    Outputs:
      - aggregated_clients.csv  (one row per client)
      - aggregated_feature_columns.json  (ordered list of numeric feature columns for modeling)

    Args:
        features_path: Path of the monthly features CSV.
        out_dir: Directory for the outputs; created if it does not exist.
        aggregated_filename: File name of the aggregated CSV.
        schema_filename: File name of the feature-column JSON.

    Returns:
        The aggregated dataframe that was written to ``aggregated_filename``.

    Raises:
        AssertionError: If ``features_path`` does not exist.
        KeyError: If the CSV has no ``client_code`` column.
    """
    features_path = Path(features_path)
    out_dir = Path(out_dir)
    assert features_path.exists(), f"File not found: {features_path}"
    out_dir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(features_path)
    if "client_code" not in df.columns:
        # Fail early with an explicit message instead of a bare KeyError
        # from groupby after all the preprocessing has run.
        raise KeyError(f"'client_code' column is required but missing from {features_path}")

    # ---- Identify numeric columns
    trs_cols = [c for c in df.columns if c.startswith("trs_")]
    trf_cols = [c for c in df.columns if c.startswith("trf_")]
    base_nums = [c for c in ["income_total", "spending_total", "avg_monthly_balance_KZT", "age"] if c in df.columns]

    # Coerce numerics and fill NaN with 0 (for flows); age is handled separately
    num_to_fill = sorted(set(trs_cols + trf_cols + [c for c in base_nums if c != "age"]))
    for c in num_to_fill:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0.0)
    if "age" in df.columns:
        df["age"] = pd.to_numeric(df["age"], errors="coerce")

    # Parse month for counting
    if "month" in df.columns:
        df["_month_ym"] = df["month"].apply(parse_month_to_ym)
    else:
        df["_month_ym"] = None

    # Category name per spending column: drop the leading prefix only
    # (str.replace would also remove any later "trs_" inside the name).
    prefix_len = len("trs_")
    category_of = {c: c[prefix_len:] for c in trs_cols}

    # Product-oriented spend clusters (loop-invariant, so defined once)
    travel_cols = ("trs_Путешествия", "trs_Такси", "trs_Отели")
    online_cols = ("trs_Играем дома", "trs_Смотрим дома", "trs_Едим дома")
    premium_boost_cols = ("trs_Ювелирные украшения", "trs_Косметика и Парфюмерия", "trs_Кафе и рестораны")

    # ---- Aggregate to one row per client
    rows = []
    for client, grp in df.groupby("client_code"):
        row = {"client_code": client}

        # Keep metadata (first non-null)
        for col in ["name", "status", "city"]:
            if col in grp.columns:
                vals = grp[col].dropna()
                row[col] = vals.iloc[0] if not vals.empty else None

        # Age: take first non-null
        if "age" in grp.columns:
            vals = grp["age"].dropna()
            row["age"] = float(vals.iloc[0]) if not vals.empty else None

        # Months observed
        row["n_months"] = int(grp["_month_ym"].nunique())

        # Balances: average across months; totals: sum across months
        if "avg_monthly_balance_KZT" in grp.columns:
            row["avg_monthly_balance_KZT"] = float(grp["avg_monthly_balance_KZT"].mean())
        row["income_total_3m"] = float(grp["income_total"].sum()) if "income_total" in grp.columns else 0.0
        row["spending_total_3m"] = float(grp["spending_total"].sum()) if "spending_total" in grp.columns else 0.0

        # Sum all trs_/trf_ across months
        for c in trs_cols + trf_cols:
            row[c] = float(grp[c].sum())

        # ---- Derived aggregates useful for downstream scoring
        spend = row["spending_total_3m"]

        # Withdrawals / cash-like outflows
        withdrawals_total = row.get("trf_atm_withdrawal", 0.0) + row.get("trf_card_out", 0.0)
        row["withdrawals_total"] = withdrawals_total
        row["withdrawals_to_spend_ratio"] = withdrawals_total / spend if spend > 0 else 0.0

        # FX / Invest / Deposits
        row["fx_turnover"] = row.get("trf_fx_buy", 0.0) + row.get("trf_fx_sell", 0.0)
        row["invest_turnover"] = row.get("trf_invest_in", 0.0) + row.get("trf_invest_out", 0.0)
        row["deposit_topups"] = row.get("trf_deposit_topup_out", 0.0) + row.get("trf_deposit_fx_topup_out", 0.0)
        row["cc_repayments"] = row.get("trf_cc_repayment_out", 0.0)
        row["loan_payments"] = row.get("trf_loan_payment_out", 0.0)

        row["travel_spend"] = float(sum(row.get(c, 0.0) for c in travel_cols))
        row["online_spend"] = float(sum(row.get(c, 0.0) for c in online_cols))
        row["premium_boost_spend"] = float(sum(row.get(c, 0.0) for c in premium_boost_cols))

        # Shares per spending category
        for c in trs_cols:
            row["share_" + category_of[c]] = (row[c] / spend) if spend > 0 else 0.0

        # Top-3 spending categories (names without the 'trs_' prefix);
        # the sort is stable, so ties keep the CSV column order.
        cat_pairs = [(category_of[c], row[c]) for c in trs_cols if row[c] > 0]
        cat_pairs.sort(key=lambda kv: kv[1], reverse=True)
        row["top_cat1"] = cat_pairs[0][0] if len(cat_pairs) > 0 else None
        row["top_cat2"] = cat_pairs[1][0] if len(cat_pairs) > 1 else None
        row["top_cat3"] = cat_pairs[2][0] if len(cat_pairs) > 2 else None

        rows.append(row)

    agg = pd.DataFrame(rows)

    # ---- Column ordering: id/meta, core totals, derived, raw sums, shares, tops
    id_meta = [c for c in ["client_code", "name", "status", "city", "age", "n_months"] if c in agg.columns]
    core = [c for c in ["avg_monthly_balance_KZT", "income_total_3m", "spending_total_3m"] if c in agg.columns]
    derived = [
        "fx_turnover", "invest_turnover", "withdrawals_total", "withdrawals_to_spend_ratio",
        "deposit_topups", "cc_repayments", "loan_payments",
        "travel_spend", "online_spend", "premium_boost_spend"
    ]
    raw_sums = trs_cols + trf_cols
    share_cols = [c for c in agg.columns if c.startswith("share_")]
    tops = ["top_cat1", "top_cat2", "top_cat3"]

    ordered_cols = id_meta + core + derived + raw_sums + share_cols + tops
    # Keep only columns that exist
    ordered_cols = [c for c in ordered_cols if c in agg.columns]
    agg = agg[ordered_cols]

    # ---- Save outputs
    out_csv = out_dir / aggregated_filename
    agg.to_csv(out_csv, index=False)

    feature_cols = core + derived + raw_sums + share_cols  # numeric set for modeling later
    schema = {
        "feature_columns": feature_cols,
        "notes": [
            "Do NOT use 'name/city/status' as model inputs unless explicitly desired.",
            "We ignore the original 'product' column from features.csv for labels; labels will come from the EV-teacher.",
        ],
    }
    with open(out_dir / schema_filename, "w", encoding="utf-8") as f:
        json.dump(schema, f, ensure_ascii=False, indent=2)

    # Console summary
    print("=== STEP 2: Aggregation Summary ===")
    print(f"Clients: {agg.shape[0]}")
    print(f"Output columns: {agg.shape[1]}")
    print(f"Saved: {out_csv.name}, {schema_filename}")

    return agg


if __name__ == "__main__":
    # Script entry point for the step-2 aggregation.
    # Intended invocation: python step2_aggregate.py
    # NOTE: input and output paths are hard-coded below; command-line
    # arguments are not read (``sys`` is imported but unused in this block).
    import sys
    features = "features.csv"
    out = "."
    run_step2_aggregate(features, out)


=== STEP 2: Aggregation Summary ===
Clients: 59
Output columns: 80
Saved: aggregated_clients.csv, aggregated_feature_columns.json


In [13]:
import json
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd


# Product catalogue. The order is used for the columns of the EV scores
# table written by step 3.
PRODUCTS = [
    "Карта для путешествий",
    "Премиальная карта",
    "Кредитная карта",
    "Обмен валют",
    "Кредит наличными",
    "Депозит Мультивалютный",
    "Депозит Сберегательный",
    "Депозит Накопительный",
    "Инвестиции",
    "Золотые слитки",
]

# Parameters read by the ev_* scoring functions below. The whole dict is
# also dumped to the step-3 config JSON for reproducibility.
CFG = {
    # Cashback / interest parameters
    "travel_cb_rate": 0.04,                     # 4% on travel+taxi+hotels
    # Scanned top-down by ev_premium; the first threshold met by either the
    # average balance or the deposit top-ups decides the base rate.
    "premium_base_thresholds": [                # (min_balance_or_topups, base_rate)
        (6_000_000, 0.04),
        (1_000_000, 0.03),
        (0,         0.02),
    ],
    "premium_boost_rate": 0.04,                 # boosted categories up to 4%
    "premium_monthly_cap": 100_000,             # per month cap
    "credit_card_rate": 0.10,                   # top-3 categories + online services
    "fx_saving_rate": 0.003,                    # 0.3% of FX turnover
    # ev_deposit multiplies these by the average balance and n_months / 12.
    "dep_rates": {                              # annual nominal
        "Депозит Сберегательный": 0.165,
        "Депозит Накопительный": 0.155,
        "Депозит Мультивалютный": 0.145,
    },
    "invest_saving_rate": 0.002,                # 0.2% of invest turnover (proxy for fee save)
    "gold_saving_rate": 0.001,                  # 0.1% of gold turnover (very conservative)
    # Gating/penalties (apply if behavioral signals are weak)
    "dep_penalties": {
        # multipliers applied if condition is NOT met
        "Депозит Сберегательный": 0.30,         # needs stable large balance & low withdrawals
        "Депозит Накопительный": 0.50,          # prefers positive cashflow (income > spend)
        "Депозит Мультивалютный": 0.70,         # prefers FX activity
    },
    # Value returned by ev_cash_loan when a need signal is detected (else 0.0).
    "loan_ev_base": 1.0,                        # tiny EV so loan wins only with strong need
    "loan_need": {                              
        "low_balance_threshold": 300_000,
        "outflow_multiplier": 1.10,             # spend > 1.1 * income
    },
}

# Helper: safe getter
def g(row, key, default=0.0):
    """Return ``row[key]`` as a float, falling back to ``default``.

    The fallback is used when the key is absent, the value is ``None``, the
    value cannot be converted to ``float``, or the value is NaN. NaN is what
    an empty CSV cell becomes after numeric coercion; letting it through
    would turn every sum it touches into NaN and make the EV ranking
    meaningless.

    Args:
        row: Mapping with a ``get`` method (for example a row dict).
        key: Key to look up.
        default: Fallback value; converted with ``float`` only when used.

    Returns:
        The value as a float, or ``float(default)``.
    """
    v = row.get(key, default)
    if v is None:
        return float(default)
    try:
        out = float(v)
    except (TypeError, ValueError, OverflowError):
        return float(default)
    # NaN is the only float that differs from itself.
    if out != out:
        return float(default)
    return out

def months_frac(n_months: float) -> float:
    """Return the observed period as a fraction of a year.

    A missing month count is treated as 3 months, and the count is floored
    at 1 before dividing by 12.
    """
    observed = 3.0 if pd.isna(n_months) else float(n_months)
    return max(1.0, observed) / 12.0

def top3_spending_categories(row: dict) -> List[Tuple[str, float]]:
    """Return up to three ``(trs_column, spend)`` pairs with the largest positive spend.

    Only keys starting with ``trs_`` are considered. Pairs are ordered by
    spend descending; ties keep the row's key order.
    """
    positive = []
    for col in row:
        if not col.startswith("trs_"):
            continue
        amount = g(row, col)
        if amount > 0:
            positive.append((col, amount))
    ranked = sorted(positive, key=lambda pair: pair[1], reverse=True)
    return ranked[:3]

def ev_travel(row: dict) -> float:
    """Expected travel-card value: the travel cashback rate applied to travel + taxi + hotel spend."""
    trips = g(row, "trs_Путешествия")
    taxi = g(row, "trs_Такси")
    hotels = g(row, "trs_Отели")
    eligible = trips + taxi + hotels
    return CFG["travel_cb_rate"] * eligible

def ev_premium(row: dict) -> float:
    """Expected premium-card cashback over the observed months.

    The base rate is taken from the first configured threshold met by either
    the average balance or the deposit top-ups (0.02 if none matches).
    Boosted categories earn the difference up to the boost rate on top. The
    resulting monthly cashback is capped, then scaled back to the number of
    observed months.
    """
    balance = g(row, "avg_monthly_balance_KZT")
    topups = g(row, "trf_deposit_topup_out") + g(row, "trf_deposit_fx_topup_out")
    base_rate = next(
        (
            rate
            for floor, rate in CFG["premium_base_thresholds"]
            if balance >= floor or topups >= floor
        ),
        0.02,
    )

    months = max(1.0, g(row, "n_months", 3.0))
    monthly_spend = g(row, "spending_total_3m") / months

    # Boosted categories, averaged per month
    boosted_total = (
        g(row, "trs_Ювелирные украшения")
        + g(row, "trs_Косметика и Парфюмерия")
        + g(row, "trs_Кафе и рестораны")
    )
    monthly_boosted = boosted_total / months
    uplift = max(0.0, CFG["premium_boost_rate"] - base_rate)

    monthly_cashback = min(
        base_rate * monthly_spend + uplift * monthly_boosted,
        CFG["premium_monthly_cap"],
    )
    return monthly_cashback * months

def ev_credit_card(row: dict) -> float:
    """Expected credit-card cashback: the card rate on the top-3 categories plus online services.

    Online-service columns already counted in the top-3 are skipped so their
    spend is not counted twice.
    """
    top3 = top3_spending_categories(row)  # list of (trs_col, spend)
    top3_sum = sum(amount for _, amount in top3)
    already_counted = {col for col, _ in top3}

    online_extra = 0.0
    for col in ("trs_Играем дома", "trs_Смотрим дома", "trs_Едим дома"):
        if col in already_counted:
            continue
        online_extra += g(row, col)

    return CFG["credit_card_rate"] * (top3_sum + online_extra)

def ev_fx(row: dict) -> float:
    """Expected FX saving: the configured saving rate applied to buy + sell turnover."""
    bought = g(row, "trf_fx_buy")
    sold = g(row, "trf_fx_sell")
    return CFG["fx_saving_rate"] * (bought + sold)

def ev_deposit(row: dict, dep_name: str) -> float:
    """Expected interest for deposit ``dep_name``, penalised when behaviour does not fit.

    Base value is the annual rate times the average balance times the
    observed fraction of a year. Each known deposit has one alignment check;
    when it fails, the configured penalty multiplier is applied.
    """
    balance = g(row, "avg_monthly_balance_KZT")
    base_ev = CFG["dep_rates"][dep_name] * balance * months_frac(g(row, "n_months", 3.0))

    # Behavioural gate per deposit type
    if dep_name == "Депозит Сберегательный":
        # large balance and few withdrawals
        aligned = balance >= 2_000_000 and g(row, "withdrawals_to_spend_ratio") < 0.20
    elif dep_name == "Депозит Накопительный":
        # income exceeds spending
        aligned = g(row, "income_total_3m") > g(row, "spending_total_3m")
    elif dep_name == "Депозит Мультивалютный":
        # any FX activity
        aligned = (g(row, "trf_fx_buy") + g(row, "trf_fx_sell")) > 0
    else:
        aligned = True

    penalty = 1.0 if aligned else CFG["dep_penalties"][dep_name]
    return base_ev * penalty

def ev_invest(row: dict) -> float:
    """Expected investment saving: the configured rate applied to invest-in + invest-out turnover."""
    paid_in = g(row, "trf_invest_in")
    paid_out = g(row, "trf_invest_out")
    return CFG["invest_saving_rate"] * (paid_in + paid_out)

def ev_gold(row: dict) -> float:
    """Expected gold saving: the configured rate applied to gold buy + sell turnover.

    Reads ``trf_gold_buy_out`` and ``trf_gold_sell_in``; when those keys are
    absent the getter defaults make the turnover 0.
    """
    turnover = g(row, "trf_gold_buy_out") + g(row, "trf_gold_sell_in")
    return CFG["gold_saving_rate"] * turnover

def ev_cash_loan(row: dict) -> float:
    """Return the base loan EV when a liquidity-stress signal is present, else 0.0.

    Two signals count as need: spending above the configured multiple of a
    positive income, or existing loan/credit-card repayments combined with a
    balance below the configured threshold.
    """
    need_cfg = CFG["loan_need"]

    income = g(row, "income_total_3m")
    spend = g(row, "spending_total_3m")
    if income > 0 and spend > need_cfg["outflow_multiplier"] * income:
        return CFG["loan_ev_base"]

    has_repayments = (g(row, "trf_loan_payment_out") + g(row, "trf_cc_repayment_out")) > 0
    low_balance = g(row, "avg_monthly_balance_KZT") < need_cfg["low_balance_threshold"]
    if has_repayments and low_balance:
        return CFG["loan_ev_base"]

    return 0.0


def score_row(row: dict) -> Dict[str, float]:
    """Compute the EV of every product for one aggregated client row.

    Returns a dict keyed by product name, in the order the products are
    scored here.
    """
    return {
        "Карта для путешествий": ev_travel(row),
        "Премиальная карта": ev_premium(row),
        "Кредитная карта": ev_credit_card(row),
        "Обмен валют": ev_fx(row),
        "Кредит наличными": ev_cash_loan(row),
        "Депозит Мультивалютный": ev_deposit(row, "Депозит Мультивалютный"),
        "Депозит Сберегательный": ev_deposit(row, "Депозит Сберегательный"),
        "Депозит Накопительный": ev_deposit(row, "Депозит Накопительный"),
        "Инвестиции": ev_invest(row),
        "Золотые слитки": ev_gold(row),
    }


def run_step3_ev_teacher(aggregated_path: str, out_dir: str = ".",
                         ev_scores_filename="ev_scores.csv",
                         ev_labels_filename="ev_labels.csv",
                         config_filename="ev_teacher_config.json") -> None:
    """Score every aggregated client with the EV rules and write labels.

    Reads the aggregated CSV, coerces the scoring inputs to numeric, computes
    the EV of each product per client and writes three files into
    ``out_dir``: the per-product EV table, the four highest-EV products per
    client, and the configuration used. A winners distribution is printed at
    the end.

    Raises:
        AssertionError: If ``aggregated_path`` does not exist.
    """
    aggregated_path = Path(aggregated_path)
    out_dir = Path(out_dir)
    assert aggregated_path.exists(), f"File not found: {aggregated_path}"
    out_dir.mkdir(parents=True, exist_ok=True)

    agg = pd.read_csv(aggregated_path)

    # Columns the scoring functions read as numbers
    named_numeric = {
        "avg_monthly_balance_KZT", "spending_total_3m", "income_total_3m",
        "withdrawals_to_spend_ratio", "withdrawals_total",
        "fx_turnover", "invest_turnover", "deposit_topups",
        "cc_repayments", "loan_payments", "n_months", "age",
    }
    numeric_cols = [
        col for col in agg.columns
        if col.startswith(("trs_", "trf_")) or col in named_numeric
    ]
    for col in numeric_cols:
        agg[col] = pd.to_numeric(agg[col], errors="coerce")

    # One EV row (soft labels) and one label row (hard labels) per client
    ev_rows: List[Dict[str, float]] = []
    label_rows: List[Dict[str, object]] = []

    for _, record in agg.iterrows():
        client = record["client_code"]
        ev = score_row(record.to_dict())  # product -> EV

        ev_row = {"client_code": client}
        for product in PRODUCTS:
            ev_row[product] = float(ev.get(product, 0.0))
        ev_rows.append(ev_row)

        # Best product (argmax) followed by ranks 2-4
        ranked = sorted(ev.items(), key=lambda kv: kv[1], reverse=True)[:4]
        label = {
            "client_code": client,
            "label_best": ranked[0][0],
            "ev_best": float(ranked[0][1]),
        }
        for rank in (2, 3, 4):
            product, value = ranked[rank - 1]
            label[f"label_{rank}"] = product
            label[f"ev_{rank}"] = float(value)
        label_rows.append(label)

    ev_df = pd.DataFrame(ev_rows, columns=["client_code"] + PRODUCTS)
    labels_df = pd.DataFrame(label_rows)

    ev_path = out_dir / ev_scores_filename
    labels_path = out_dir / ev_labels_filename
    ev_df.to_csv(ev_path, index=False)
    labels_df.to_csv(labels_path, index=False)

    # Persist the configuration next to the outputs for reproducibility
    cfg_out = {
        "products": PRODUCTS,
        "config": CFG,
        "notes": [
            "EV are 3-month expected gains (cashback/interest/savings).",
            "Deposit EVs are gated by behavioral signals via penalties (see dep_penalties).",
            "Loan EV is tiny unless liquidity stress signals are present.",
            "Gold EV is conservative and only activates with gold turnover in transfers.",
        ],
        "inputs_expected": [
            "avg_monthly_balance_KZT", "spending_total_3m", "income_total_3m", "n_months",
            "withdrawals_to_spend_ratio", "trf_*", "trs_*"
        ],
        "columns_optional": ["trf_gold_buy_out", "trf_gold_sell_in"],  # only if present in your data
    }
    with open(out_dir / config_filename, "w", encoding="utf-8") as f:
        json.dump(cfg_out, f, ensure_ascii=False, indent=2)

    print("=== STEP 3: EV Teacher Summary ===")
    print(f"Saved: {ev_path.name}, {labels_path.name}, {config_filename}")
    # Quick sanity check: how often each product wins
    winners = labels_df["label_best"].value_counts().sort_values(ascending=False)
    print("Winners distribution:")
    for product, count in winners.items():
        print(f" - {product}: {int(count)}")


if __name__ == "__main__":
    # Script entry point for the step-3 EV teacher.
    # Intended invocation: python step3_ev_teacher.py
    # NOTE: the aggregated CSV path and output directory are hard-coded
    # below; command-line arguments are not read (``sys`` is imported but
    # unused in this block).
    import sys
    agg_path = "aggregated_clients.csv"
    out_dir = "."
    run_step3_ev_teacher(agg_path, out_dir)


=== STEP 3: EV Teacher Summary ===
Saved: ev_scores.csv, ev_labels.csv, ev_teacher_config.json
Winners distribution:
 - Кредитная карта: 47
 - Премиальная карта: 12


In [16]:
# step4_train_classifier.py
# STEP 4 — Train a baseline classifier from EV-teacher labels (FIXED).
#
# Inputs:
#   - aggregated_clients.csv            (from step 2)
#   - aggregated_feature_columns.json   (from step 2; optional but preferred)
#   - ev_labels.csv                     (from step 3; hard labels)
#   - ev_scores.csv                     (from step 3; soft labels / EV per product)
#
# Outputs:
#   - reco_model.pkl                    (scikit-learn model + metadata)
#   - reco_eval.csv                     (CV metrics per fold: top1, top3, EV-regret)
#   - reco_predictions.csv              (per-client out-of-fold predictions w/ EV regrets)
#
# Notes:
#   - Handles folds where the training set has only 1 class (falls back to DummyClassifier).
#   - Maps fold-local predict_proba to the GLOBAL class space before metrics (fixes label mismatch).

from pathlib import Path
import json
from typing import List

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, top_k_accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.dummy import DummyClassifier


# -------------------- Utils --------------------
# Identifier / descriptive columns that load_feature_list skips when it
# builds the fallback feature list.
META_COLS = [
    "client_code", "name", "status", "city", "age", "n_months",
    "top_cat1", "top_cat2", "top_cat3"
]

def load_feature_list(schema_path: Path, agg_cols: List[str]) -> List[str]:
    """Choose the model feature columns.

    When the schema file exists and lists at least one column that is also
    in ``agg_cols``, those columns are returned in schema order. Otherwise
    the fallback keeps, in ``agg_cols`` order, every non-meta column that is
    prefixed ``trs_`` / ``trf_`` / ``share_`` or is one of the known
    aggregate names.
    """
    if schema_path.exists():
        schema = json.loads(schema_path.read_text(encoding="utf-8"))
        preferred = [name for name in schema.get("feature_columns", []) if name in agg_cols]
        if preferred:
            return preferred

    prefixes = ("trs_", "trf_", "share_")
    known_aggregates = (
        "avg_monthly_balance_KZT", "income_total_3m", "spending_total_3m",
        "fx_turnover", "invest_turnover", "withdrawals_total", "withdrawals_to_spend_ratio",
        "deposit_topups", "cc_repayments", "loan_payments",
        "travel_spend", "online_spend", "premium_boost_spend",
    )
    return [
        name for name in agg_cols
        if name not in META_COLS
        and (name.startswith(prefixes) or name in known_aggregates)
    ]


# -------------------- Main --------------------
def run_step4_train(
    aggregated_path: str,
    ev_labels_path: str,
    ev_scores_path: str,
    out_dir: str = ".",
    schema_path: str = "aggregated_feature_columns.json",
    model_filename: str = "reco_model.pkl",
    eval_filename: str = "reco_eval.csv",
    preds_filename: str = "reco_predictions.csv",
) -> None:
    """Train the baseline classifier on EV-teacher labels and save it with CV results.

    Joins the aggregated features with the teacher's hard labels and
    per-product EV scores, evaluates a gradient-boosting classifier with
    grouped K-fold cross-validation (top-1 accuracy, top-3 accuracy, mean EV
    regret), fits a final model on all rows and writes three artifacts into
    ``out_dir``: the model pack, the per-fold metrics and the out-of-fold
    predictions.

    Args:
        aggregated_path: CSV with one row per client (step 2 output).
        ev_labels_path: CSV with ``client_code``, ``label_best``, ``ev_best``.
        ev_scores_path: CSV with ``client_code`` plus one EV column per product.
        out_dir: Output directory; created if it does not exist.
        schema_path: Optional JSON listing the preferred feature columns.
        model_filename: File name of the joblib model pack.
        eval_filename: File name of the per-fold metrics CSV.
        preds_filename: File name of the out-of-fold predictions CSV.

    Raises:
        AssertionError: If any of the three input CSVs does not exist.
        RuntimeError: If no feature columns could be selected.
    """
    out_dir = Path(out_dir); out_dir.mkdir(parents=True, exist_ok=True)
    agg_path = Path(aggregated_path)
    labels_path = Path(ev_labels_path)
    scores_path = Path(ev_scores_path)
    schema_path = Path(schema_path)

    assert agg_path.exists(), f"Not found: {agg_path}"
    assert labels_path.exists(), f"Not found: {labels_path}"
    assert scores_path.exists(), f"Not found: {scores_path}"

    # Load data
    agg = pd.read_csv(agg_path)
    labels = pd.read_csv(labels_path)   # client_code, label_best, ev_best, label_2..label_4...
    scores = pd.read_csv(scores_path)   # client_code + EV per product

    # Global product classes from ev_scores columns (order is fixed and global)
    classes = [c for c in scores.columns if c != "client_code"]
    class_to_idx = {c: i for i, c in enumerate(classes)}

    # Merge features + labels + scores (inner joins: only clients present in all three)
    df = agg.merge(labels[["client_code", "label_best", "ev_best"]], on="client_code", how="inner")
    # NOTE(review): empty suffixes assume agg and scores share no column
    # besides client_code — confirm against the step-2/3 schemas.
    df = df.merge(scores, on="client_code", how="inner", suffixes=("", ""))

    # Feature columns
    feature_cols = load_feature_list(schema_path, df.columns.tolist())
    if not feature_cols:
        raise RuntimeError("No feature columns selected. Check aggregated_feature_columns.json or your input schema.")
    for c in feature_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # Target indices: position of each label in `classes`; rows whose label
    # is not a known class are dropped.
    y_labels = df["label_best"].astype(str)
    y_idx = y_labels.map(class_to_idx)
    mask = y_idx.notna()
    df = df[mask].reset_index(drop=True)
    # y_idx keeps its pre-filter index labels; it is only accessed
    # positionally (.iloc) below.
    y_idx = y_idx[mask].astype(int)

    groups = df["client_code"].values
    X = df[feature_cols].copy()

    # Base estimator. The same instance is re-fitted in every fold and again
    # for the final model (presumably each fit starts from scratch — verify
    # the estimator's warm_start default).
    base_clf = HistGradientBoostingClassifier(
        max_depth=4,
        learning_rate=0.10,
        max_iter=300,
        l2_regularization=0.05,
        random_state=42
    )

    # CV splits: between 3 and 5 folds, depending on the number of rows
    n = len(df)
    n_splits = min(5, max(3, n // 15))  # heuristic for small data
    gkf = GroupKFold(n_splits=n_splits)

    records = []
    fold_preds = []

    for fold, (tr, te) in enumerate(gkf.split(X, y_idx, groups=groups), start=1):
        Xtr, Xte = X.iloc[tr], X.iloc[te]
        ytr, yte = y_idx.iloc[tr], y_idx.iloc[te]
        # df was reset to a 0..n-1 index above, so these labels equal the positions in `te`
        te_idx = df.index[te]

        # Fallback if only one class present in the training fold
        unique_train = np.unique(ytr)
        if len(unique_train) < 2:
            clf = DummyClassifier(strategy="most_frequent")
        else:
            clf = base_clf

        clf.fit(Xtr, ytr)

        # --- Map fold-local proba to GLOBAL class space ---
        # Some estimators don't have predict_proba (HGB does). For Dummy, it does as well.
        proba_local = clf.predict_proba(Xte)
        local_classes = clf.classes_.astype(int)              # indices in GLOBAL classes
        proba_full = np.zeros((Xte.shape[0], len(classes)))   # (n_test, n_global_classes)
        proba_full[:, local_classes] = proba_local            # place local probs into global columns

        # Global predictions
        yhat_global = proba_full.argmax(axis=1)

        # Metrics
        acc = accuracy_score(yte, yhat_global)
        top3 = top_k_accuracy_score(
            yte,
            proba_full,
            k=3,
            labels=list(range(len(classes)))  # [0..len-1], matches proba_full columns
        )

        # EV-regret: teacher's best EV minus the teacher EV of the product
        # the classifier picked, averaged over the test rows.
        ev_best = df.loc[te_idx, "ev_best"].values.astype(float)
        ev_pred = []
        for i, row_i in enumerate(te_idx):
            pred_class = classes[yhat_global[i]]              # global class -> product name
            ev_pred.append(float(df.loc[row_i, pred_class]))
        ev_pred = np.array(ev_pred, dtype=float)
        ev_regret = float((ev_best - ev_pred).mean())

        records.append({
            "fold": fold,
            "top1_acc": float(acc),
            "top3_acc": float(top3),
            "mean_ev_regret": ev_regret,
            "n_test": int(len(te_idx)),
        })

        # Save out-of-fold predictions
        fold_pred_df = pd.DataFrame({
            "client_code": df.loc[te_idx, "client_code"].values,
            "label_best_teacher": df.loc[te_idx, "label_best"].values,
            "recommended_product": [classes[i] for i in yhat_global],
            "ev_best_teacher": ev_best,
            "ev_predicted": ev_pred,
            "ev_regret": ev_best - ev_pred,
        })
        fold_preds.append(fold_pred_df)

    eval_df = pd.DataFrame(records)
    eval_df.to_csv(out_dir / eval_filename, index=False)

    # Train final model on ALL data (fallback to Dummy if only 1 class globally)
    if len(np.unique(y_idx)) < 2:
        final_clf = DummyClassifier(strategy="most_frequent")
    else:
        final_clf = base_clf
    final_clf.fit(X, y_idx)

    # Save artifacts: the fitted model plus what inference needs to map
    # class indices back to product names and to order the features.
    model_pack = {
        "model": final_clf,
        "classes": classes,           # ordered product names
        "feature_cols": feature_cols,
        "metadata": {
            "n_clients": int(len(df)),
            "n_features": int(len(feature_cols)),
            "products": classes,
            "cv_summary": {
                "top1_acc_mean": float(eval_df["top1_acc"].mean()),
                "top3_acc_mean": float(eval_df["top3_acc"].mean()),
                "mean_ev_regret": float(eval_df["mean_ev_regret"].mean()),
                "n_splits": int(n_splits),
            }
        }
    }
    joblib.dump(model_pack, out_dir / model_filename)

    # Consolidated out-of-fold predictions
    preds_df = pd.concat(fold_preds, ignore_index=True).sort_values("client_code")
    preds_df.to_csv(out_dir / preds_filename, index=False)

    # Console summary
    print("=== STEP 4: Classifier Training Summary ===")
    print(f"Splits: {n_splits}")
    print(f"Top-1 mean: {eval_df['top1_acc'].mean():.3f} | "
          f"Top-3 mean: {eval_df['top3_acc'].mean():.3f} | "
          f"EV-regret mean (₸): {eval_df['mean_ev_regret'].mean():.1f}")
    print(f"Saved: {model_filename}, {eval_filename}, {preds_filename}")


if __name__ == "__main__":
    # Script entry point for the step-4 training run.
    # Intended invocation: python step4_train_classifier.py
    # NOTE: all input paths and the output directory are hard-coded below;
    # command-line arguments are not read (``sys`` is imported but unused
    # in this block).
    import sys
    agg = "aggregated_clients.csv"
    evl = "ev_labels.csv"
    evs = "ev_scores.csv"
    out = "."
    run_step4_train(agg, evl, evs, out)


=== STEP 4: Classifier Training Summary ===
Splits: 3
Top-1 mean: 0.744 | Top-3 mean: 1.000 | EV-regret mean (₸): 5496.4
Saved: reco_model.pkl, reco_eval.csv, reco_predictions.csv


In [None]:
# step5_infer.py
# STEP 5 — Inference stub & Top-N API.
#
# What it does:
#   • Loads reco_model.pkl (from step 4)
#   • Accepts an aggregated client CSV (from step 2) OR a single JSON row
#   • Aligns features, maps fold-local classes → global classes
#   • Returns Top-N recommendations per client (+ per-class probabilities)
#
# Outputs:
#   - reco_inference.csv  (client_code, top1, top1_conf, topN list, per-class probs)
#
# Usage examples:
#   python step5_infer.py /mnt/data/aggregated_clients.csv /mnt/data --topk 3
#   # Or read a single-row JSON:
#   echo '{"client_code":999, "avg_monthly_balance_KZT":1200000, ...}' | \
#     python step5_infer.py - - --topk 5
#
# Notes:
#   - This uses ONLY the trained classifier. Blending with EV-teacher is step 6+ (optional).

import sys
import json
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import joblib


def load_model(model_path: str):
    """
    Load the model bundle saved by step 4 and unpack it.

    Returns a 4-tuple ``(model, classes, feature_cols, metadata)`` where
    ``classes`` is the global, ordered list of product names, ``feature_cols``
    is the ordered list of features the model expects, and ``metadata`` is the
    bundle's "metadata" entry (an empty dict when the bundle has none).

    Raises KeyError if "model", "classes" or "feature_cols" is missing.

    NOTE: joblib.load unpickles the file, so only load bundles you trust.
    """
    bundle = joblib.load(model_path)
    return (
        bundle["model"],
        bundle["classes"],
        bundle["feature_cols"],
        bundle.get("metadata", {}),
    )


def ensure_dataframe(input_path: str) -> pd.DataFrame:
    """
    Load the inference input as a DataFrame.

    If input_path == '-', read JSON from stdin:
      • a single JSON object becomes a 1-row DataFrame;
      • a JSON array of objects becomes one row per object (batch input).
    Else, read the CSV file at input_path.

    Raises:
      • json.JSONDecodeError (a ValueError) when stdin does not hold valid JSON;
      • AssertionError when the CSV path does not exist.
    """
    if input_path == "-":
        payload = json.loads(sys.stdin.read().strip())
        # Wrap a lone object so both shapes go through the same constructor.
        # Without this distinction an array would be nested one level too deep
        # and yield a single row whose cells are whole dicts.
        records = payload if isinstance(payload, list) else [payload]
        return pd.DataFrame(records)

    p = Path(input_path)
    # Kept as an assert to match the rest of the file and preserve the exception
    # type; under `python -O` it is stripped and pd.read_csv raises instead.
    assert p.exists(), f"Input not found: {p}"
    return pd.read_csv(p)


def align_features(df: pd.DataFrame, feature_cols: List[str]) -> pd.DataFrame:
    """
    Build the model input matrix: exactly `feature_cols`, in that order.

    • A feature column absent from `df` is created and set to 0.0.
    • Every feature column is passed through pd.to_numeric(errors="coerce"),
      so values that cannot be parsed as numbers become NaN.
    • NaN values (pre-existing or produced by coercion) are NOT filled.
    • Columns of `df` that are not in `feature_cols` are dropped.

    The input frame is left untouched; all work happens on a copy.
    """
    aligned = df.copy()

    # Pass 1: add every feature the input does not provide.
    for absent in (name for name in feature_cols if name not in df.columns):
        aligned[absent] = 0.0

    # Pass 2: coerce each feature to a numeric dtype.
    for name in feature_cols:
        aligned[name] = pd.to_numeric(aligned[name], errors="coerce")

    return aligned[feature_cols]


def predict_topn(
    model,
    classes: List[str],
    X: pd.DataFrame,
    client_codes: "np.ndarray | None",
    topk: int = 3
) -> pd.DataFrame:
    """
    Compute per-class probabilities mapped into the GLOBAL class space,
    then produce Top-N recommendations per row.

    Parameters:
      model        — fitted classifier exposing predict_proba(X) and classes_;
                     classes_ must hold integer indices into `classes`.
      classes      — global, ordered list of product names.
      X            — feature matrix, one row per client.
      client_codes — one identifier per row of X, or None when the input has no
                     identifiers (the "client_code" output is then None).
      topk         — number of products to report; clamped to [1, len(classes)].

    Returns a DataFrame with one row per row of X and the columns
    client_code, top1_product, top1_confidence, topN_products,
    topN_confidences and one "proba::<name>" column per global class.

    This function depends only on its arguments; it reads no module-level state.
    """
    n_rows = X.shape[0]

    # Local probas → global map. Classes the model never saw stay at 0.0.
    proba_local = model.predict_proba(X)
    local_classes_idx = model.classes_.astype(int)     # indices into GLOBAL classes
    proba_full = np.zeros((n_rows, len(classes)), dtype=float)
    proba_full[:, local_classes_idx] = proba_local

    # Top-N. A stable sort makes ties (e.g. several unseen classes at 0.0)
    # resolve deterministically in global class order.
    topk = int(max(1, min(topk, len(classes))))
    order = np.argsort(-proba_full, axis=1, kind="stable")  # descending
    top_indices = order[:, :topk]
    top_scores = np.take_along_axis(proba_full, top_indices, axis=1)

    # Build rows
    rows = []
    for i, (idx_row, score_row) in enumerate(zip(top_indices, top_scores)):
        top_products = [classes[j] for j in idx_row]
        top_scores_i = [float(s) for s in score_row]
        row = {
            "client_code": client_codes[i] if client_codes is not None else None,
            "top1_product": top_products[0],
            "top1_confidence": top_scores_i[0],
            "topN_products": "; ".join(top_products),
            "topN_confidences": "; ".join(f"{s:.4f}" for s in top_scores_i),
        }
        # Add per-class probability columns
        for j, name in enumerate(classes):
            row[f"proba::{name}"] = float(proba_full[i, j])
        rows.append(row)

    return pd.DataFrame(rows)


def run(
    aggregated_path: str,
    out_dir: str = ".",
    model_path: str = "reco_model.pkl",
    topk: int = 3,
) -> Path:
    """
    Run end-to-end inference and write reco_inference.csv.

    Parameters:
      aggregated_path — CSV path, or '-' to read JSON from stdin
                        (see ensure_dataframe).
      out_dir         — directory for the output file; created if missing.
      model_path      — path to the model bundle written by step 4.
      topk            — number of top products to report per client.

    Returns the path of the written CSV. When the input has no "client_code"
    column, the output's client_code column is left empty.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    model, classes, feature_cols, meta = load_model(model_path)

    df_in = ensure_dataframe(aggregated_path)
    X = align_features(df_in, feature_cols)

    # Hand identifiers over explicitly (None when absent) so predict_topn needs
    # no shared global state to know whether client codes exist.
    client_codes = df_in["client_code"].values if "client_code" in df_in.columns else None

    pred_df = predict_topn(model, classes, X, client_codes, topk=topk)

    out_path = out_dir / "reco_inference.csv"
    pred_df.to_csv(out_path, index=False)

    # Console summary
    print("=== STEP 5: Inference Summary ===")
    print(f"Rows inferred: {len(pred_df)} | topk={topk}")
    print(f"Saved: {out_path.name}")
    if "cv_summary" in meta:
        s = meta["cv_summary"]
        print(f"Model CV — top1: {s.get('top1_acc_mean'):.3f}, top3: {s.get('top3_acc_mean'):.3f}, "
              f"EV-regret: {s.get('mean_ev_regret'):.1f}, splits: {s.get('n_splits')}")
    return out_path


if __name__ == "__main__":
    # Command-line entry point:
    #   python step5_infer.py <aggregated_clients.csv|-> <out_dir> [--model reco_model.pkl] [--topk 3]
    import argparse

    ap = argparse.ArgumentParser(
        description="STEP 5 — Inference stub & Top-N API",
    )
    # Positional arguments: where to read from and where to write to.
    ap.add_argument(
        "input",
        help="Aggregated CSV from step 2, or '-' to read one JSON row from stdin",
    )
    ap.add_argument(
        "out_dir",
        help="Output directory for reco_inference.csv",
    )
    # Optional switches with the same defaults as run().
    ap.add_argument(
        "--model",
        default="reco_model.pkl",
        help="Path to trained model .pkl",
    )
    ap.add_argument(
        "--topk",
        type=int,
        default=3,
        help="How many top products to output",
    )
    args = ap.parse_args()

    run(
        args.input,
        out_dir=args.out_dir,
        model_path=args.model,
        topk=args.topk,
    )
