In [None]:
import sys
from pathlib import Path

# Starting from the current working directory, walk upward until a directory
# containing both "app" and "models" is found, then put it on sys.path.
p = Path.cwd().resolve()
_levels_left = 6  # the working directory itself plus at most five ancestors
while _levels_left > 0:
    if (p / "app").exists() and (p / "models").exists():
        break
    p = p.parent
    _levels_left -= 1
else:
    # Only reached when no candidate matched within the allowed levels.
    raise RuntimeError("❌ 프로젝트 루트를 찾지 못했어요. app/ 와 models/ 가 있는 폴더에서 실행해야 합니다.")

sys.path.insert(0, str(p))
print("✅ project root added to sys.path:", p)

In [1]:
from pathlib import Path
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import (
    precision_recall_curve,
    average_precision_score,
    confusion_matrix,
)

# Resolve the project root. Prefer discovering it from the working directory
# (a directory holding both app/ and models/, checking the cwd and up to five
# ancestors) so the notebook also runs on machines other than the original
# author's. The original absolute path is kept as a fallback so existing
# setups behave as before.
_FALLBACK_PROJECT_ROOT = Path("/Users/jy/project_2nd/SKN23-2nd-3Team")
_cwd = Path.cwd().resolve()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)[:6]
        if (candidate / "app").exists() and (candidate / "models").exists()
    ),
    _FALLBACK_PROJECT_ROOT,
)
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from app.utils.save import save_model_and_artifacts

# Korean font configuration is best-effort: plots still render without it,
# so a failure must not stop the run, but it is reported instead of being
# silently discarded.
try:
    from app.utils.plotting import configure_matplotlib_korean
    configure_matplotlib_korean()
except Exception as exc:
    print(f"warning: Korean matplotlib font setup skipped: {exc!r}")


def recall_at_topk(y_true, y_prob, k_pct: int) -> float:
    """Return the share of all positives found in the top ``k_pct`` percent of scores.

    Args:
        y_true: Binary labels (array-like); flattened and cast to int.
        y_prob: Scores (array-like); flattened and cast to float. Higher
            scores rank first.
        k_pct: Percentage of rows to select, e.g. ``10`` for the top 10%.
            At least one row is always selected.

    Returns:
        Positives inside the selection divided by all positives. When there
        are no positives the denominator is clamped to 1, so the result is 0.0.
    """
    y_true = np.asarray(y_true).astype(int).reshape(-1)
    y_prob = np.asarray(y_prob).astype(float).reshape(-1)
    order = np.argsort(-y_prob)
    # Multiply before dividing. ``len * (k_pct / 100.0)`` can land just below
    # an integer (100 * 0.29 == 28.999999999999996), and floor() would then
    # select one row too few.
    n = max(int(np.floor(len(y_true) * k_pct / 100)), 1)
    top_idx = order[:n]
    return float(y_true[top_idx].sum() / max(y_true.sum(), 1))


def precision_at_topk(y_true, y_prob, k_pct: int) -> float:
    """Return the positive rate among the top ``k_pct`` percent of scores.

    Args:
        y_true: Binary labels (array-like); flattened and cast to int.
        y_prob: Scores (array-like); flattened and cast to float. Higher
            scores rank first.
        k_pct: Percentage of rows to select, e.g. ``10`` for the top 10%.
            At least one row is always selected.

    Returns:
        Mean of the labels inside the selection.
    """
    y_true = np.asarray(y_true).astype(int).reshape(-1)
    y_prob = np.asarray(y_prob).astype(float).reshape(-1)
    order = np.argsort(-y_prob)
    # Multiply before dividing. ``len * (k_pct / 100.0)`` can land just below
    # an integer (100 * 0.29 == 28.999999999999996), and floor() would then
    # select one row too few.
    n = max(int(np.floor(len(y_true) * k_pct / 100)), 1)
    top_idx = order[:n]
    return float(y_true[top_idx].mean())


def lift_at_topk(y_true, y_prob, k_pct: int) -> float:
    """Return precision in the top ``k_pct`` percent divided by the overall positive rate.

    The result is 0.0 when the overall positive rate is not greater than zero.
    """
    labels = np.asarray(y_true).astype(int).reshape(-1)
    base_rate = float(labels.mean())
    precision_k = precision_at_topk(labels, y_prob, k_pct)
    if base_rate > 0:
        return float(precision_k / base_rate)
    return 0.0


def build_ranking_metrics(y_true, y_prob, k_list=(5, 10, 15, 20, 25, 30)):
    """Build PR-AUC plus precision / recall / lift for each top-K% cut.

    Args:
        y_true: Binary labels (array-like); flattened and cast to int.
        y_prob: Scores (array-like); flattened and cast to float.
        k_list: Percentages to evaluate; each selects at least one row.

    Returns:
        A dict holding "PR-AUC (Average Precision)" and "ranking" (one row
        per entry of ``k_list``). If ``k_list`` produces a "5%" row, its
        three values are also copied to dedicated top-level keys.
    """
    labels = np.asarray(y_true).astype(int).reshape(-1)
    scores = np.asarray(y_prob).astype(float).reshape(-1)

    average_precision = float(average_precision_score(labels, scores))

    # Rank rows from the highest score to the lowest.
    ranked = pd.DataFrame({"y": labels, "score": scores}).sort_values("score", ascending=False)
    ranked_y = ranked["y"]
    prevalence = float(ranked_y.mean())
    positives = float(ranked_y.sum())
    n_rows = len(ranked)

    def _row(k):
        head = ranked_y.iloc[: max(int(np.floor(n_rows * k / 100)), 1)]
        precision_k = float(head.mean())
        return {
            "Top_K": f"{k}%",
            "Precision": precision_k,
            # The small epsilon keeps the division defined when there are no positives.
            "Recall": float(head.sum() / (positives + 1e-12)),
            "Lift": float(precision_k / prevalence) if prevalence > 0 else 0.0,
        }

    ranking = [_row(k) for k in k_list]
    out = {"PR-AUC (Average Precision)": average_precision, "ranking": ranking}

    # Surface the first 5% row (if any) under its own summary keys.
    top5 = next((row for row in ranking if row["Top_K"] == "5%"), None)
    if top5 is not None:
        out["상위 5% 정밀도 (Precision)"] = top5["Precision"]
        out["상위 5% 재현율 (Recall)"] = top5["Recall"]
        out["상위 5% 리프트 (Lift)"] = top5["Lift"]
    return out


def score_percentiles_payload(model_id: str, split: str, y_prob, pcts=(1, 5, 10, 20, 30, 50)):
    """Describe the score cut-off for each "top pct percent" of ``y_prob``.

    For every ``pct`` the reported score is the ``1 - pct/100`` quantile of
    the flattened scores.

    Returns:
        ``{"model_id": ..., "split": ..., "percentiles": [{"pct", "score"}, ...]}``
    """
    scores = np.asarray(y_prob, dtype=float).reshape(-1)
    percentiles = []
    for pct in pcts:
        cutoff = np.quantile(scores, 1.0 - pct / 100.0)
        percentiles.append({"pct": int(pct), "score": float(cutoff)})
    return {"model_id": model_id, "split": split, "percentiles": percentiles}


def _stem_from_model_id(model_id: str) -> str:
    if model_id.startswith("dl__mlp_"):
        return "mlp_" + model_id.split("dl__mlp_", 1)[1]
    if model_id.startswith("ml__"):
        return model_id.split("ml__", 1)[1]
    if model_id.startswith("dl__"):
        return model_id.split("dl__", 1)[1]
    return model_id


def save_and_print_score_percentiles(PROJECT_ROOT: Path, model_id: str, split: str, y_prob):
    """Write the score-percentile payload to ``models/metrics`` and echo it.

    The file is ``<stem>_score_percentiles.json`` under
    ``PROJECT_ROOT / "models" / "metrics"`` (created if missing), where
    ``<stem>`` comes from ``_stem_from_model_id(model_id)``. The JSON text
    and then the output path are printed.

    Returns:
        ``(payload, out_path)``.
    """
    target_dir = PROJECT_ROOT / "models" / "metrics"
    target_dir.mkdir(parents=True, exist_ok=True)

    payload = score_percentiles_payload(model_id, split, y_prob, pcts=(1, 5, 10, 20, 30, 50))
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)

    file_name = f"{_stem_from_model_id(model_id)}_score_percentiles.json"
    out_path = target_dir / file_name
    out_path.write_text(rendered, encoding="utf-8")

    print(rendered)
    print(str(out_path))
    return payload, out_path


def plot_confusion_matrix(y_true, y_pred, title, labels=("비이탈(m1)", "이탈(m2)"), cmap="Blues"):
    """Draw a confusion matrix as an annotated heat map and return the figure.

    Args:
        y_true: True class ids (array-like), cast to int.
        y_pred: Predicted class ids (array-like), cast to int.
        title: Axes title.
        labels: Tick labels, one per class. Class ids are taken to be
            ``0 .. len(labels) - 1`` in that order.
        cmap: Matplotlib colormap name for the heat map.

    Returns:
        The created matplotlib figure. The caller is responsible for closing it.
    """
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).astype(int)
    # Pin the class set to match the tick labels. Without an explicit
    # ``labels=`` the matrix shrinks to 1x1 when only one class occurs in
    # y_true and y_pred, which no longer lines up with the ticks below.
    class_ids = list(range(len(labels)))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, cmap=cmap, interpolation="nearest", aspect="equal")
    fig.colorbar(im, ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    ax.set_xticks(class_ids)
    ax.set_yticks(class_ids)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    # Cells above half of the largest count get white text, the rest black.
    thresh = cm.max() / 2.0 if cm.size else 0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j, i, f"{cm[i, j]}",
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black",
                fontsize=12,
            )

    # Fix the limits explicitly; the y limits run from the last row down to
    # -0.5 so that row 0 is drawn at the top.
    ax.set_xlim(-0.5, cm.shape[1] - 0.5)
    ax.set_ylim(cm.shape[0] - 0.5, -0.5)
    fig.tight_layout()
    return fig


def threshold_topk(y_prob, k_pct: int) -> float:
    """Return the lowest score that still belongs to the top ``k_pct`` percent.

    The scores are flattened; at least one score is always selected, so the
    smallest possible selection yields the maximum score.
    """
    scores = np.asarray(y_prob, dtype=float).reshape(-1)
    rank = max(int(np.floor(len(scores) * k_pct / 100)), 1)
    # Sorting the negated scores ascending gives the scores in descending order.
    descending = -np.sort(-scores)
    return float(descending[rank - 1])


def make_sample_weight(y: np.ndarray) -> np.ndarray:
    """Return per-sample weights that balance positives against negatives.

    Samples with ``y == 1`` get weight ``n_negative / n_positive``; every
    other sample gets weight 1.0.

    Args:
        y: Binary labels (array-like); flattened and cast to int.

    Returns:
        A float array with one weight per sample. If either class is absent
        there is nothing to balance and all weights are 1.0.
    """
    y = np.asarray(y).astype(int).reshape(-1)
    pos = float((y == 1).sum())
    neg = float((y == 0).sum())
    # No positives would divide by zero. No negatives would give the
    # positives weight 0 and zero out every sample, so both cases fall back
    # to uniform weights.
    if pos == 0 or neg == 0:
        return np.ones_like(y, dtype=float)
    w = np.ones_like(y, dtype=float)
    w[y == 1] = neg / pos
    return w


# Read the preprocessed feature and label tables.
DATA_DIR = PROJECT_ROOT / "data" / "processed"
features = pd.read_parquet(DATA_DIR / "features_ml_clean.parquet")
labels = pd.read_parquet(DATA_DIR / "labels.parquet")

# user_id is cast to str on both tables before they are joined on it.
for _table in (features, labels):
    _table["user_id"] = _table["user_id"].astype(str)

# Attach label and split to every feature row. validate="one_to_one" makes
# pandas raise if (user_id, anchor_time) is duplicated on either side.
_JOIN_KEYS = ["user_id", "anchor_time"]
df = features.merge(
    labels[_JOIN_KEYS + ["label", "split"]],
    on=_JOIN_KEYS,
    how="inner",
    validate="one_to_one",
)

# Binary target: 1 where the label is "m2", otherwise 0.
df["y"] = (df["label"] == "m2").astype(int)

# Every remaining column is treated as a model feature.
_NON_FEATURE_COLS = ("user_id", "anchor_time", "label", "split", "y")
feature_cols = [col for col in df.columns if col not in _NON_FEATURE_COLS]

train_df, val_df, test_df = (df[df["split"] == _name] for _name in ("train", "val", "test"))

X_train, y_train = train_df[feature_cols], train_df["y"].to_numpy(dtype=int)
X_val, y_val = val_df[feature_cols], val_df["y"].to_numpy(dtype=int)
X_test, y_test = test_df[feature_cols], test_df["y"].to_numpy(dtype=int)


# Tuning configuration: top-K% used for model selection, number of random
# search trials, and the random seed.
BEST_K_PCT = 10
N_TRIALS = 30
SEED = 42


def tune_hgb_on_val(X_train, y_train, X_val, y_val, n_trials=30, seed=42, best_k_pct=10):
    """Random-search HistGradientBoostingClassifier settings against the validation split.

    Each trial draws one value per hyperparameter from a fixed list, fits on
    the training data with class-balancing sample weights, and scores the
    validation data. The winner has the highest validation recall in the top
    ``best_k_pct`` percent; validation PR-AUC breaks ties (recalls within
    1e-12 of each other count as tied).

    Args:
        X_train, y_train: Training features and binary labels.
        X_val, y_val: Validation features and binary labels.
        n_trials: Number of random candidates to evaluate.
        seed: Seed for the candidate sampler; also passed to every model as
            ``random_state``.
        best_k_pct: Percentage used for the recall-at-top-K selection metric.

    Returns:
        A dict with keys "val_recall_k", "val_pr_auc" and "params" describing
        the best trial. If no trial runs (``n_trials < 1``) the initial
        values are returned unchanged, i.e. "params" is None.
    """
    # A single generator drives every draw below, so the candidate sequence
    # for a given seed depends on the order of the rng.choice calls.
    rng = np.random.default_rng(seed)

    # Scores start at -1.0 so the first evaluated trial always becomes the best.
    best = {
        "val_recall_k": -1.0,
        "val_pr_auc": -1.0,
        "params": None,
    }

    # The training weights do not depend on the candidate; compute them once.
    sw_train = make_sample_weight(y_train)

    for t in range(int(n_trials)):
        # None is one of the options and is passed through to the model as-is.
        max_depth_choice = rng.choice([3, 4, 5, 6, None])

        # Values are cast to built-in int/float, which also keeps the dict
        # serialisable by json.dumps in the log line below.
        params = {
            "learning_rate": float(rng.choice([0.01, 0.02, 0.03, 0.05, 0.07])),
            "max_depth": None if max_depth_choice is None else int(max_depth_choice),
            "max_iter": int(rng.choice([300, 500, 800, 1200])),
            "min_samples_leaf": int(rng.choice([5, 10, 20, 30, 50, 100])),
            "l2_regularization": float(rng.choice([0.0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1])),
            "max_bins": int(rng.choice([64, 128, 255])),
            "max_leaf_nodes": int(rng.choice([15, 31, 63, 127, 255])),
            "max_features": float(rng.choice([0.6, 0.8, 1.0])),
            "random_state": int(seed),
            "early_stopping": False,
        }

        model = HistGradientBoostingClassifier(**params)
        model.fit(X_train, y_train, sample_weight=sw_train)

        # Column 1 of predict_proba is used as the positive-class score.
        val_prob = model.predict_proba(X_val)[:, 1]
        val_pr_auc = float(average_precision_score(y_val, val_prob))
        val_recall_k = recall_at_topk(y_val, val_prob, best_k_pct)

        print(
            f"trial {t+1}/{n_trials} "
            f"val_recall_at_{best_k_pct}pct={val_recall_k:.5f} "
            f"val_pr_auc={val_pr_auc:.5f} "
            f"params={json.dumps(params, ensure_ascii=False)}"
        )

        # Primary criterion: strictly higher recall (beyond a 1e-12 tolerance).
        # Secondary criterion: equal recall within that tolerance and a
        # higher PR-AUC.
        better = (val_recall_k > best["val_recall_k"] + 1e-12) or (
            abs(val_recall_k - best["val_recall_k"]) <= 1e-12 and val_pr_auc > best["val_pr_auc"]
        )
        if better:
            best["val_recall_k"] = val_recall_k
            best["val_pr_auc"] = val_pr_auc
            best["params"] = params

    return best


# --- 1. Hyperparameter search on train/val -------------------------------
tune_result = tune_hgb_on_val(
    X_train, y_train, X_val, y_val,
    n_trials=N_TRIALS,
    seed=SEED,
    best_k_pct=BEST_K_PCT,
)

best_params = tune_result["params"]
best_val_recall_k = tune_result["val_recall_k"]
best_val_pr_auc = tune_result["val_pr_auc"]

print("best_val_recall_k:", best_val_recall_k)
print("best_val_pr_auc:", best_val_pr_auc)
print("best_params:", json.dumps(best_params, ensure_ascii=False, indent=2))


# --- 2. Refit the winning configuration on train + val --------------------
X_tv = pd.concat([X_train, X_val], axis=0)
y_tv = np.concatenate([y_train, y_val], axis=0)

# Weights are recomputed because the class ratio of train + val may differ
# from that of train alone.
sw_tv = make_sample_weight(y_tv)

hgb = HistGradientBoostingClassifier(**best_params)
hgb.fit(X_tv, y_tv, sample_weight=sw_tv)

# --- 3. Score the held-out test split -------------------------------------
# Column 1 of predict_proba is used as the positive-class score.
test_prob = hgb.predict_proba(X_test)[:, 1]
test_true = np.asarray(y_test).astype(int)

MODEL_ID = "ml__hgb"
SPLIT = "test"

# Writes the score-percentile JSON under models/metrics and prints it.
save_and_print_score_percentiles(PROJECT_ROOT, MODEL_ID, SPLIT, test_prob)

# Test-set ranking metrics, annotated with how the model was selected.
metrics_payload = build_ranking_metrics(test_true, test_prob, k_list=(5, 10, 15, 20, 25, 30))
metrics_payload["selection_rule"] = f"val Recall@{BEST_K_PCT}% then PR-AUC"
metrics_payload["best_val_recall_at_k"] = float(best_val_recall_k)
metrics_payload["best_val_pr_auc"] = float(best_val_pr_auc)

print(json.dumps(metrics_payload, ensure_ascii=False, indent=2))

# NOTE: despite the "_val" suffix this is the PR-AUC computed on the test
# split above ("val" here reads as "value").
pr_auc_val = float(metrics_payload["PR-AUC (Average Precision)"])

# --- 4. Figures: PR curve and one confusion matrix per top-K% cut ---------
precision, recall, _ = precision_recall_curve(test_true, test_prob)
fig_pr, ax_pr = plt.subplots(figsize=(6, 5))
ax_pr.plot(recall, precision, lw=2, label=f"PR-AUC = {pr_auc_val:.5f}")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision-Recall Curve")
ax_pr.legend()
ax_pr.grid(alpha=0.3)
fig_pr.tight_layout()

k_list = [5, 10, 15, 30]
figures = {"pr_curve": fig_pr}

for k in k_list:
    # Rows scoring at or above the top-k% cut-off are predicted positive;
    # scores tied with the cut-off are all included.
    thr = threshold_topk(test_prob, k)
    y_pred_k = (np.asarray(test_prob) >= thr).astype(int)
    figures[f"confusion_matrix_top{k}"] = plot_confusion_matrix(
        test_true,
        y_pred_k,
        title=f"Confusion Matrix (Top {k}%, thr={thr:.5f})",
        labels=("비이탈(m1)", "이탈(m2)"),
        cmap="Blues",
    )

# --- 5. Persist the model, metrics, predictions and figures ---------------
# save_model_and_artifacts is a project helper; the loop below only relies
# on its return value being a mapping.
saved = save_model_and_artifacts(
    model=hgb,
    model_name="hgb",
    model_type="ml",
    model_id=MODEL_ID,
    split=SPLIT,
    metrics=metrics_payload,
    y_true=test_true,
    y_prob=np.asarray(test_prob).astype(float),
    version="tuned_on_val",
    scaler=None,
    figures=figures,
    config={
        "model_name": "hgb",
        "model_type": "ml",
        "version": "tuned_on_val",
        "feature_source": "features_ml_clean.parquet",
        "best_selection": f"val Recall@{BEST_K_PCT}%",
        "use_sample_weight": True,
        "note": "hp tuned on train/val, retrained train+val, test evaluated once",
    },
)

print("saved paths:")
for k, v in saved.items():
    print(f"{k}: {v}")

# --- 6. Release the figures created above ---------------------------------
plt.close(fig_pr)
for k in k_list:
    plt.close(figures[f"confusion_matrix_top{k}"])

trial 1/30 val_recall_at_10pct=0.11272 val_pr_auc=0.91693 params={"learning_rate": 0.05, "max_depth": 3, "max_iter": 800, "min_samples_leaf": 20, "l2_regularization": 0.0001, "max_bins": 255, "max_leaf_nodes": 15, "max_features": 1.0, "random_state": 42, "early_stopping": false}
trial 2/30 val_recall_at_10pct=0.11276 val_pr_auc=0.91682 params={"learning_rate": 0.01, "max_depth": 4, "max_iter": 800, "min_samples_leaf": 100, "l2_regularization": 0.01, "max_bins": 255, "max_leaf_nodes": 127, "max_features": 1.0, "random_state": 42, "early_stopping": false}
trial 3/30 val_recall_at_10pct=0.11274 val_pr_auc=0.91688 params={"learning_rate": 0.01, "max_depth": 5, "max_iter": 1200, "min_samples_leaf": 20, "l2_regularization": 0.001, "max_bins": 128, "max_leaf_nodes": 15, "max_features": 1.0, "random_state": 42, "early_stopping": false}
trial 4/30 val_recall_at_10pct=0.11264 val_pr_auc=0.91647 params={"learning_rate": 0.05, "max_depth": 6, "max_iter": 500, "min_samples_leaf": 50, "l2_regulariza