In [5]:
from pathlib import Path
import sys, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import (
    precision_recall_curve,
    average_precision_score,
    confusion_matrix,
)

PROJECT_ROOT = Path("/Users/jy/project_2nd/SKN23-2nd-3Team")
sys.path.insert(0, str(PROJECT_ROOT))

from app.utils.save import save_model_and_artifacts
from app.utils.paths import PATHS

# -------------------------
# (선택) 한글 폰트
# -------------------------
try:
    from app.utils.plotting import configure_matplotlib_korean
    configure_matplotlib_korean()
except Exception:
    pass



# JSON 저장 안전 변환 (np.int64 등 -> python int/float/bool)

def to_py(obj):
    if isinstance(obj, (np.integer,)):
        return int(obj)
    if isinstance(obj, (np.floating,)):
        return float(obj)
    if isinstance(obj, (np.bool_,)):
        return bool(obj)
    return obj

def dict_to_py(d: dict) -> dict:
    return {str(k): to_py(v) for k, v in d.items()}



# Streamlit percentile 파일 규칙 (app 코드 기준)
# models/metrics/{model_name}_score_percentiles.json

def write_streamlit_score_percentiles(model_name: str, payload: dict) -> str:
    out = PROJECT_ROOT / "models" / "metrics" / f"{model_name}_score_percentiles.json"
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
    return str(out)

def score_percentiles_payload(model_id: str, split: str, y_prob, pcts=(1, 5, 10, 20, 30, 50)):
    y_prob = np.asarray(y_prob, dtype=float).reshape(-1)
    percentiles = [{"pct": int(p), "score": float(np.quantile(y_prob, 1.0 - p / 100.0))} for p in pcts]
    return {"model_id": model_id, "split": split, "percentiles": percentiles}



# Ranking metrics + selection score (빠짐없이)

def build_ranking_metrics(y_true, y_prob, k_list=(5, 10, 15, 30)):
    y_true = np.asarray(y_true).astype(int).reshape(-1)
    y_prob = np.asarray(y_prob).astype(float).reshape(-1)

    pr_auc = float(average_precision_score(y_true, y_prob))

    df_rank = pd.DataFrame({"y": y_true, "score": y_prob}).sort_values("score", ascending=False)
    base_rate = float(df_rank["y"].mean())
    total_pos = float(df_rank["y"].sum())
    n_total = int(len(df_rank))

    ranking = []
    for k in k_list:
        n_sel = max(int(np.floor(n_total * k / 100)), 1)
        selected = df_rank.iloc[:n_sel]

        precision_k = float(selected["y"].mean())
        recall_k = float(selected["y"].sum() / (total_pos + 1e-12))
        lift_k = float(precision_k / base_rate) if base_rate > 0 else 0.0

        ranking.append({
            "Top_K": f"{k}%",
            "n_selected": int(n_sel),
            "Precision": precision_k,
            "Recall": recall_k,
            "Lift": lift_k,
        })

    # selection score (네가 쓰던 가중치 그대로)
    def _get(k_pct: int):
        target = f"{k_pct}%"
        for row in ranking:
            if row["Top_K"] == target:
                return row
        return {"Recall": 0.0, "Lift": 0.0}

    r5  = _get(5)
    r10 = _get(10)
    r30 = _get(30)

    selection_score = (
        0.55 * pr_auc
        + 0.20 * float(r10["Recall"])
        + 0.15 * float(r30["Recall"])
        + 0.05 * float(r5["Recall"])
        + 0.03 * float(r10["Lift"])
        + 0.02 * float(r5["Lift"])
    )

    return {
        "PR-AUC (Average Precision)": pr_auc,
        "base_rate": base_rate,
        "n_total": n_total,
        "ranking": ranking,
        "score_for_selection": float(selection_score),
    }



# Figures (PR curve + confusion matrices by TopK)

def threshold_topk(y_prob, k_pct: int) -> float:
    y_prob = np.asarray(y_prob, dtype=float).reshape(-1)
    order = np.argsort(-y_prob)
    n_sel = max(int(np.floor(len(y_prob) * k_pct / 100)), 1)
    return float(y_prob[order[n_sel - 1]])
def plot_confusion_matrix(
    y_true,
    y_pred,
    title,
    labels=("비이탈(m1)", "이탈(m2)"),
    cmap="Blues",  # ✅ 기본값 Blues로 고정
):
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).astype(int)
    cm = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, cmap=cmap, interpolation="nearest", aspect="equal")  # ✅ cmap 적용
    fig.colorbar(im, ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Predicted (예측값)")
    ax.set_ylabel("Actual (실제값)")
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    thresh = cm.max() / 2.0 if cm.size else 0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j, i, f"{cm[i, j]}",
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black"
            )

    fig.tight_layout()
    return fig


def make_figures(test_true, test_prob, k_list=(5, 10, 15, 30), cm_cmap="Blues"):
    precision, recall, _ = precision_recall_curve(test_true, test_prob)
    pr_auc_val = float(average_precision_score(test_true, test_prob))

    fig_pr, ax_pr = plt.subplots(figsize=(6, 5))
    ax_pr.plot(recall, precision, lw=2, label=f"PR-AUC = {pr_auc_val:.5f}")
    ax_pr.set_xlabel("Recall")
    ax_pr.set_ylabel("Precision")
    ax_pr.set_title("Precision-Recall Curve")
    ax_pr.legend()
    ax_pr.grid(alpha=0.3)
    fig_pr.tight_layout()

    figures = {"pr_curve": fig_pr}
    for k in k_list:
        thr = threshold_topk(test_prob, k)
        y_pred_k = (np.asarray(test_prob) >= thr).astype(int)

        figures[f"confusion_matrix_top{k}"] = plot_confusion_matrix(
            test_true,
            y_pred_k,
            title=f"Confusion Matrix (Top {k}%, thr={thr:.5f})",
            cmap=cm_cmap,  # ✅ 여기서도 Blues 고정
        )

    return figures



# 데이터 로드/병합 (팀 규칙: labels.parquet split 그대로)

DATA_DIR = PROJECT_ROOT / "data" / "processed"
features = pd.read_parquet(DATA_DIR / "features_ml_clean.parquet")
labels   = pd.read_parquet(DATA_DIR / "labels.parquet")

features["user_id"] = features["user_id"].astype(str)
labels["user_id"]   = labels["user_id"].astype(str)

df = features.merge(
    labels[["user_id", "anchor_time", "label", "split"]],
    on=["user_id", "anchor_time"],
    how="inner",
    validate="one_to_one",
)
df["y"] = (df["label"] == "m2").astype(int)

feature_cols = [c for c in df.columns if c not in ["user_id", "anchor_time", "label", "split", "y"]]

train_df = df[df["split"] == "train"]
val_df   = df[df["split"] == "val"]
test_df  = df[df["split"] == "test"]

X_train, y_train = train_df[feature_cols], train_df["y"].to_numpy()
X_val,   y_val   = val_df[feature_cols],   val_df["y"].to_numpy()
X_test,  y_test  = test_df[feature_cols],  test_df["y"].to_numpy()

X_tv = pd.concat([X_train, X_val], axis=0)
y_tv = np.concatenate([y_train, y_val])

neg = float((y_train == 0).sum())
pos = float((y_train == 1).sum())
ratio = neg / (pos + 1e-12)



# LightGBM 튜닝 (VAL PR-AUC) + early stopping

from lightgbm import LGBMClassifier, early_stopping, log_evaluation

def tune_lgbm(n_trials=30, seed=42):
    rng = np.random.default_rng(seed)

    space = {
        "learning_rate":      [0.01, 0.02, 0.03, 0.05, 0.07],
        "num_leaves":         [15, 31, 63, 127, 255],
        "min_child_samples":  [10, 20, 50, 100],
        "subsample":          [0.7, 0.8, 0.9, 1.0],
        "colsample_bytree":   [0.7, 0.8, 0.9, 1.0],
        "reg_alpha":          [0.0, 1e-4, 1e-3, 1e-2],
        "reg_lambda":         [0.0, 1e-4, 1e-3, 1e-2],
        "scale_pos_weight":   [1.0, 2.0, 5.0, 10.0, ratio],  # 불균형 보정
    }

    best = {"val_pr_auc": -1.0, "params": None, "best_iter": None}

    for t in range(n_trials):
        params = {k: rng.choice(v) for k, v in space.items()}

        model = LGBMClassifier(
            objective="binary",
            boosting_type="gbdt",
            n_estimators=5000,     # 크게 두고 early stopping으로 best_iter 결정
            random_state=42,
            n_jobs=-1,
            **params,
        )

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="aucpr",
            callbacks=[
                early_stopping(stopping_rounds=100, first_metric_only=True),
                log_evaluation(period=0),
            ],
        )

        val_prob = model.predict_proba(X_val)[:, 1]
        val_pr_auc = float(average_precision_score(y_val, val_prob))

        if val_pr_auc > best["val_pr_auc"]:
            best["val_pr_auc"] = val_pr_auc
            best["params"] = dict(params)
            best["best_iter"] = int(getattr(model, "best_iteration_", model.n_estimators))

        print(f"[trial {t+1:02d}] val_pr_auc={val_pr_auc:.6f} | best={best['val_pr_auc']:.6f}")

    return best

BEST = tune_lgbm(n_trials=30, seed=42)
print("BEST:", {**BEST, "params": dict_to_py(BEST["params"])})



# 최종 학습(TRAIN+VAL) -> TEST 1회 평가 + 저장(공통 함수)

MODEL_NAME = "lgbm"
MODEL_ID   = "ml__lgbm"
VERSION    = "v1_tuned"   # 폴더명/버전명

final = LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=int(BEST["best_iter"]),
    random_state=42,
    n_jobs=-1,
    **BEST["params"],
)
final.fit(X_tv, y_tv)

test_prob = final.predict_proba(X_test)[:, 1]
test_true = np.asarray(y_test).astype(int)

metrics_payload = build_ranking_metrics(test_true, test_prob)
figures = make_figures(test_true, test_prob, k_list=(5, 10, 15, 30))

saved = save_model_and_artifacts(
    model=final,
    model_name=MODEL_NAME,
    model_type="ml",
    model_id=MODEL_ID,
    split="test",
    metrics=metrics_payload,
    y_true=test_true,
    y_prob=np.asarray(test_prob).astype(float),
    version=VERSION,
    scaler=None,
    figures=figures,
    # feature_cols 없어도 된다 했으니 최소 config만(필요하면 여기에 더 추가)
    config={
        "model_name": MODEL_NAME,
        "model_type": "ml",
        "version": VERSION,
        "feature_source": "features_ml_clean.parquet",
        "n_features": int(len(feature_cols)),
        "best_iter": int(BEST["best_iter"]),
    },
)

# figures close
plt.close(figures["pr_curve"])
for k in (5, 10, 15, 30):
    plt.close(figures[f"confusion_matrix_top{k}"])

# Streamlit percentile 저장 (app이 읽는 규칙)
sp = score_percentiles_payload(MODEL_ID, "test", test_prob)
p_streamlit = write_streamlit_score_percentiles(MODEL_NAME, sp)

# 튜닝 결과 저장 (PATHS 기준 폴더)
tuning_payload = {
    "model_id": MODEL_ID,
    "version": VERSION,
    "best_val_pr_auc": float(BEST["val_pr_auc"]),
    "best_params": dict_to_py(BEST["params"]),
    "best_iter": int(BEST["best_iter"]),
}
tuning_path = Path(PATHS["models_metrics"]) / MODEL_NAME / VERSION / "tuning.json"
tuning_path.parent.mkdir(parents=True, exist_ok=True)
tuning_path.write_text(json.dumps(tuning_payload, ensure_ascii=False, indent=2), encoding="utf-8")

print("\n=== metrics ===")
print(json.dumps(metrics_payload, ensure_ascii=False, indent=2))

print("\n=== saved paths ===")
for k, v in saved.items():
    print(f"{k}: {v}")

print("\nstreamlit percentiles:", p_streamlit)
print("tuning:", str(tuning_path))

[LightGBM] [Info] Number of positive: 461642, number of negative: 112450
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2495
[LightGBM] [Info] Number of data points in the train set: 574092, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.804125 -> initscore=1.412281
[LightGBM] [Info] Start training from score 1.412281
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40]	valid_0's binary_logloss: 0.430834
Evaluated only: binary_logloss
[trial 01] val_pr_auc=0.914389 | best=0.914389
[LightGBM] [Info] Number of positive: 461642, number of negative: 112450
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013519 seconds.
You can set `force_row_wise=true` to remove the o