In [None]:
import sys
from pathlib import Path

# Walk upward from the current working directory until a directory containing
# both "app" and "models" is found, and treat that directory as the project root.
p = Path.cwd().resolve()
for _ in range(6):  # examine at most 6 directories: the cwd and up to 5 ancestors
    if (p / "app").exists() and (p / "models").exists():
        # Put the root at the front of sys.path so it wins over other entries.
        sys.path.insert(0, str(p))
        print("✅ project root added to sys.path:", p)
        break
    p = p.parent
else:
    # for/else: reached only when the loop ended without `break` (no root found).
    raise RuntimeError("❌ 프로젝트 루트를 찾지 못했어요. app/ 와 models/ 가 있는 폴더에서 실행해야 합니다.")

In [1]:
import os
import sys
from pathlib import Path
import warnings
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import precision_recall_curve, confusion_matrix, average_precision_score
from sklearn.exceptions import ConvergenceWarning

# Locate the project root: the nearest directory (the cwd first, then each of its
# parents) that has app/utils/paths.py as a file plus models/ and data/ directories.
here = Path.cwd().resolve()
project_root = next(
    (
        p
        for p in [here, *here.parents]
        if (p / "app" / "utils" / "paths.py").is_file()
        and (p / "models").is_dir()
        and (p / "data").is_dir()
    ),
    None,  # default when no candidate matches; handled right below
)
if project_root is None:
    raise FileNotFoundError(f"프로젝트 루트를 못 찾았어. 현재 위치={here}")
# Make the root importable (without adding a duplicate entry) ...
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))
# ... and make it the working directory for everything that follows.
os.chdir(project_root)

import importlib

# Forget every already-imported `app` module (the package itself and anything
# under "app.") so the imports below load them afresh instead of reusing the
# entries cached in sys.modules.
importlib.invalidate_caches()
for k in list(sys.modules.keys()):  # iterate over a copy: entries are deleted in the loop
    if k == "app" or k.startswith("app."):
        del sys.modules[k]

from app.utils.paths import DEFAULT_PATHS as P, ensure_runtime_dirs
from app.utils.metrics import evaluate_churn_metrics
from app.utils.save import save_model_and_artifacts

# lightgbm is mandatory: any failure while importing it is re-raised as an
# ImportError carrying an install hint, with the original error chained as cause.
try:
    from lightgbm import LGBMClassifier, early_stopping, log_evaluation
except Exception as e:
    raise ImportError("lightgbm이 설치되어 있어야 해. 예: pip install lightgbm") from e

# Suppress FutureWarning and scikit-learn's ConvergenceWarning from here on.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Optional plotting setup (presumably Korean-capable fonts, judging by the
# helper's name -- confirm in app/utils/plotting.py). It stays best-effort: a
# failure must not stop the run, but it is reported instead of being silently
# swallowed so that broken glyphs in the figures can be traced back to it.
try:
    from app.utils.plotting import configure_matplotlib_korean

    configure_matplotlib_korean()
except Exception as exc:
    warnings.warn(
        f"configure_matplotlib_korean() was skipped: {exc!r}",
        RuntimeWarning,
        stacklevel=1,
    )

# Project helper imported from app.utils.paths; presumably creates the output
# directories used later -- TODO confirm in that module.
ensure_runtime_dirs()

# Identifiers handed to save_model_and_artifacts() at the end of the script.
MODEL_NAME = "lgbm"
MODEL_ID = "ml__lgbm"
VERSION = "tuned_on_val"
# Random-search budget and seed passed to tune_lgbm_on_val().
N_TRIALS = 30
RANDOM_SEED = 42

# Tuning objective: score = ALPHA * recall@K + BETA * lift@K + (1 - ALPHA - BETA) * PR-AUC,
# where K is the top TUNE_K percent of validation scores. With the values below
# the PR-AUC weight is 0, so PR-AUC is recorded but does not influence the choice.
TUNE_K = 5
ALPHA = 0.75
BETA = 0.25

# Seeds NumPy's legacy global RNG; tune_lgbm_on_val() draws from its own
# np.random.default_rng(seed) generator instead.
np.random.seed(RANDOM_SEED)


def plot_confusion_matrix_figure(y_true, y_pred, title: str, labels=("non_m2", "m2")):
    """Draw a confusion matrix as an annotated heatmap and return the figure.

    The matrix always has one row/column per entry of ``labels`` (class ids
    ``0 .. len(labels) - 1``), even when a class is absent from both ``y_true``
    and ``y_pred``; this keeps the cells aligned with the tick labels. Values
    outside that id range are not counted.

    Args:
        y_true: Ground-truth class ids; flattened and cast to ``int``.
        y_pred: Predicted class ids; flattened and cast to ``int``.
        title: Title placed above the axes.
        labels: Display names; position ``i`` names class id ``i``.

    Returns:
        The matplotlib ``Figure``. It is not shown or closed here; the caller
        owns it.
    """
    y_true_arr = np.asarray(y_true, dtype=int).reshape(-1)
    y_pred_arr = np.asarray(y_pred, dtype=int).reshape(-1)

    # Pin the class ids explicitly: without `labels=` scikit-learn sizes the
    # matrix from the classes that happen to occur, so a single-class input
    # would produce a 1x1 matrix that no longer matches the two tick labels.
    class_ids = list(range(len(labels)))
    cm = confusion_matrix(y_true_arr, y_pred_arr, labels=class_ids)

    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, interpolation="nearest", aspect="equal", cmap="Blues")
    fig.colorbar(im, ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    ax.set_xticks(class_ids)
    ax.set_yticks(class_ids)
    ax.set_xticklabels(list(labels))
    ax.set_yticklabels(list(labels))

    # Cells darker than half of the maximum count get white text for contrast.
    thresh = float(cm.max()) / 2.0 if cm.size else 0.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j,
                i,
                str(cm[i, j]),
                ha="center",
                va="center",
                color="white" if float(cm[i, j]) > thresh else "black",
                fontsize=12,
            )

    # Fix the view limits so every cell is fully visible; the y-axis is
    # inverted (first row on top) to follow the usual matrix layout.
    ax.set_xlim(-0.5, cm.shape[1] - 0.5)
    ax.set_ylim(cm.shape[0] - 0.5, -0.5)
    fig.tight_layout()
    return fig


def topk_threshold(y_prob: np.ndarray, k_pct: int) -> float:
    """Return the score cutoff that selects the top ``k_pct`` percent of ``y_prob``.

    The cutoff is the value of the ``n``-th highest score, where
    ``n = floor(len(y_prob) * k_pct / 100)`` clamped to ``[1, len(y_prob)]``.
    Selecting ``score >= cutoff`` can therefore return more than ``n`` items
    when scores are tied at the cutoff.

    Args:
        y_prob: Scores of any shape; flattened and cast to ``float``.
        k_pct: Percentage of items to select. Values that would select fewer
            than one item select one; values above 100 select everything.

    Returns:
        The cutoff score as a Python ``float``.

    Raises:
        ValueError: If ``y_prob`` is empty.
    """
    prob = np.asarray(y_prob, dtype=float).reshape(-1)
    if prob.size == 0:
        raise ValueError("y_prob must contain at least one score")

    order = np.argsort(-prob)  # indices from highest to lowest score
    n_sel = int(np.floor(len(prob) * (float(k_pct) / 100.0)))
    # Clamp on both sides: at least one item, and never past the last element
    # (k_pct > 100 used to index out of range).
    n_sel = min(max(n_sel, 1), len(prob))
    return float(prob[order[n_sel - 1]])


def plot_confusion_topk(y_true, y_prob, k_pct: int, labels=("non_m2", "m2")):
    """Plot the confusion matrix obtained by flagging the top ``k_pct`` percent of scores.

    Every item whose score is at or above the cutoff returned by
    ``topk_threshold`` is predicted as class 1, everything else as class 0.

    Args:
        y_true: Ground-truth class ids.
        y_prob: Scores; cast to ``float``.
        k_pct: Percentage of items to flag; cast to ``int``.
        labels: Display names forwarded to ``plot_confusion_matrix_figure``.

    Returns:
        The figure produced by ``plot_confusion_matrix_figure``; its title
        states the percentage and the cutoff.
    """
    scores = np.asarray(y_prob, dtype=float)
    thr = topk_threshold(scores, int(k_pct))
    flagged = (scores >= thr).astype(int)
    heading = f"Confusion Matrix (Top {int(k_pct)}%, thr={thr:.5f})"
    return plot_confusion_matrix_figure(y_true, flagged, title=heading, labels=labels)


def topk_stats(y_true: np.ndarray, y_prob: np.ndarray, k_pct: int) -> dict:
    """Summarise how well the top ``k_pct`` percent of scores capture the positives.

    Items with a score at or above the ``topk_threshold`` cutoff count as
    selected; an item is a positive when its label equals 1.

    Args:
        y_true: Labels; flattened and cast to ``int``.
        y_prob: Scores; flattened and cast to ``float``.
        k_pct: Percentage of items to select; cast to ``int``.

    Returns:
        A dict with the keys ``k_pct``, ``thr`` (cutoff), ``recall``,
        ``precision``, ``lift`` (precision divided by the base rate, or 0.0
        when the base rate is not positive), ``base_rate`` (mean of the
        labels) and ``n_sel`` (number of selected items).
    """
    eps = 1e-12  # added to each denominator so an empty group cannot divide by zero

    truth = np.asarray(y_true, dtype=int).ravel()
    scores = np.asarray(y_prob, dtype=float).ravel()

    cutoff = topk_threshold(scores, int(k_pct))
    picked = scores >= cutoff

    prevalence = float(truth.mean())
    n_positive = float(truth.sum())
    hits = float(np.count_nonzero(picked & (truth == 1)))
    n_picked = float(np.count_nonzero(picked))

    precision_at_k = hits / (n_picked + eps)
    if prevalence > 0:
        lift_at_k = precision_at_k / prevalence
    else:
        lift_at_k = 0.0

    return {
        "k_pct": int(k_pct),
        "thr": float(cutoff),
        "recall": hits / (n_positive + eps),
        "precision": precision_at_k,
        "lift": lift_at_k,
        "base_rate": prevalence,
        "n_sel": int(n_picked),
    }


def tune_lgbm_on_val(
    X_train,
    y_train,
    X_val,
    y_val,
    n_trials: int = 30,
    seed: int = 42,
    k_pct: int = 5,
    alpha: float = 0.75,
    beta: float = 0.25,
):
    """Random-search LightGBM hyperparameters, scoring each trial on the validation set.

    Each trial draws one value per hyperparameter from a fixed list, fits an
    ``LGBMClassifier`` on the training data with early stopping against the
    validation set, and is scored as::

        alpha * recall@k + beta * lift@k + (1 - alpha - beta) * PR-AUC

    where recall@k and lift@k come from ``topk_stats`` on the validation
    predictions. The trial with the highest score wins.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features (early stopping and scoring).
        y_val: Validation labels.
        n_trials: Number of random configurations to try.
        seed: Seed for the sampling generator and for LightGBM's ``random_state``.
        k_pct: Top-K percentage used for recall@k / lift@k.
        alpha: Weight of recall@k in the score.
        beta: Weight of lift@k in the score.

    Returns:
        A dict with the keys ``score``, ``params`` (full constructor kwargs of
        the best trial), ``best_iteration``, ``val_topk`` (the ``topk_stats``
        dict) and ``val_pr_auc``. When no trial runs (``n_trials`` <= 0) the
        entries keep their initial values (``score`` = -1e18, the rest ``None``).
    """
    # Dedicated generator: the sampled configurations depend only on `seed`
    # and on the order of the rng.choice() calls below.
    rng = np.random.default_rng(seed)

    best = {
        "score": -1e18,  # sentinel lower than any achievable score
        "params": None,
        "best_iteration": None,
        "val_topk": None,
        "val_pr_auc": None,
    }

    # Settings shared by every trial. n_estimators is only an upper bound;
    # the early-stopping callback decides how many rounds are actually used.
    base_params = {
        "objective": "binary",
        "random_state": seed,
        "n_estimators": 10000,
        "n_jobs": -1,
        "class_weight": "balanced",
        "verbose": -1,
        "force_row_wise": True,
    }

    for t in range(int(n_trials)):
        params = dict(base_params)

        # NOTE: keep the order of these draws unchanged; reordering them would
        # change which configurations a given seed produces.
        params["learning_rate"] = float(rng.choice([0.01, 0.02, 0.03, 0.05]))
        params["num_leaves"] = int(rng.choice([31, 63, 127, 255]))
        params["max_depth"] = int(rng.choice([-1, 4, 6, 8, 10, 12]))
        params["min_child_samples"] = int(rng.choice([5, 10, 20, 30, 50, 100]))
        params["subsample"] = float(rng.choice([0.7, 0.8, 0.9, 1.0]))
        params["subsample_freq"] = int(rng.choice([0, 1, 5]))
        params["colsample_bytree"] = float(rng.choice([0.7, 0.8, 0.9, 1.0]))
        params["reg_alpha"] = float(rng.choice([0.0, 1e-4, 1e-3, 1e-2, 1e-1, 1.0]))
        params["reg_lambda"] = float(rng.choice([0.0, 1e-4, 1e-3, 1e-2, 1e-1, 1.0, 10.0]))
        params["min_split_gain"] = float(rng.choice([0.0, 1e-4, 1e-3, 1e-2, 5e-2]))
        params["min_child_weight"] = float(rng.choice([1e-3, 1e-2, 1e-1, 1.0, 10.0]))
        params["max_bin"] = int(rng.choice([63, 127, 255]))

        params["min_data_in_bin"] = int(rng.choice([1, 3, 5, 10]))
        params["feature_fraction_bynode"] = float(rng.choice([0.6, 0.8, 1.0]))

        # Fit with the validation set as eval set; stop once 80 rounds pass
        # without improvement, and keep per-iteration logging off.
        m = LGBMClassifier(**params)
        m.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="average_precision",
            callbacks=[early_stopping(stopping_rounds=80, verbose=False), log_evaluation(0)],
        )

        # Score this trial on the validation predictions (probability of class 1).
        val_prob = m.predict_proba(X_val)[:, 1].astype(float).reshape(-1)
        topk = topk_stats(y_val, val_prob, k_pct=int(k_pct))
        pr_auc = float(average_precision_score(np.asarray(y_val, dtype=int), val_prob))

        score = float(alpha * topk["recall"] + beta * topk["lift"] + (1.0 - alpha - beta) * pr_auc)

        if score > best["score"]:
            best["score"] = score
            best["params"] = dict(params)
            best["val_topk"] = dict(topk)
            best["val_pr_auc"] = float(pr_auc)
            # best_iteration_ may be missing or None; either case is stored as 0.
            try:
                best["best_iteration"] = int(getattr(m, "best_iteration_", 0) or 0)
            except Exception:
                best["best_iteration"] = None

        # Progress report every n_trials // 5 trials (every trial when n_trials < 5).
        if (t + 1) % max(1, int(n_trials // 5)) == 0:
            print(
                "trial",
                t + 1,
                "/",
                n_trials,
                "best_score=",
                float(best["score"]),
                "best_val_topk_recall=",
                float(best["val_topk"]["recall"]) if best["val_topk"] else None,
                "best_val_topk_lift=",
                float(best["val_topk"]["lift"]) if best["val_topk"] else None,
                "best_val_pr_auc=",
                float(best["val_pr_auc"]) if best["val_pr_auc"] is not None else None,
            )

    return best


print("Load parquet files")

# Read the three input tables; P.parquet_path() maps a table name to its file.
anchors = pd.read_parquet(P.parquet_path("anchors"))
features = pd.read_parquet(P.parquet_path("features_ml_clean"))
labels = pd.read_parquet(P.parquet_path("labels"))

# Normalise the join key to str in every table before merging on it.
for df in (anchors, features, labels):
    df["user_id"] = df["user_id"].astype(str)

# The split assignment is taken from `labels` only; drop any copy on `anchors`
# so the merge below does not end up with two competing "split" columns.
if "split" in anchors.columns:
    anchors = anchors.drop(columns=["split"])

if "split" not in labels.columns:
    raise KeyError(f"labels.parquet에 split 컬럼이 없습니다. labels columns head: {list(labels.columns)[:50]}")

# Inner joins on (user_id, anchor_time): only rows present in all three tables survive.
data = anchors.merge(features, on=["user_id", "anchor_time"], how="inner")
data = data.merge(labels[["user_id", "anchor_time", "label", "split"]], on=["user_id", "anchor_time"], how="inner")

# Binary target: 1 where the label equals "m2", 0 otherwise.
data["target"] = data["label"].astype(str).eq("m2").astype(int)
split_col = data["split"].astype(str)

# Model inputs are every column of `features` except the two join keys.
feature_cols = [c for c in features.columns if c not in ("user_id", "anchor_time")]
X_all = data.loc[:, feature_cols].copy()
# Turn +/-inf into NaN, then replace every NaN with 0.0.
X_all = X_all.replace([np.inf, -np.inf], np.nan)
X_all = X_all.fillna(0.0)
y_all = data["target"].to_numpy(dtype=int)

# Boolean row masks for the predefined train / val / test partitions.
idx_train = split_col.eq("train").to_numpy()
idx_val = split_col.eq("val").to_numpy()
idx_test = split_col.eq("test").to_numpy()

X_train = X_all.loc[idx_train].to_numpy(dtype=float)
y_train = y_all[idx_train]
X_val = X_all.loc[idx_val].to_numpy(dtype=float)
y_val = y_all[idx_val]
X_test = X_all.loc[idx_test].to_numpy(dtype=float)
y_test = y_all[idx_test]

print("rows:", len(data), "train/val/test:", int(idx_train.sum()), int(idx_val.sum()), int(idx_test.sum()))
print("n_features:", int(X_train.shape[1]))
print("train positives/negatives:", int(y_train.sum()), int((1 - y_train).sum()))

# Random search over LightGBM hyperparameters, scored on the validation split.
tune = tune_lgbm_on_val(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    n_trials=int(N_TRIALS),
    seed=int(RANDOM_SEED),
    k_pct=int(TUNE_K),
    alpha=float(ALPHA),
    beta=float(BETA),
)

best_params = tune["params"]
best_iter = tune["best_iteration"]
best_topk = tune["val_topk"]
best_pr_auc = tune["val_pr_auc"]

print("best_val_score:", float(tune["score"]))
print("best_val_topk:", json.dumps(best_topk, ensure_ascii=False, indent=2) if best_topk else None)
print("best_val_pr_auc:", float(best_pr_auc) if best_pr_auc is not None else None)
print("best_iter:", int(best_iter) if best_iter else None)
print("best_params:", json.dumps(best_params, ensure_ascii=False, indent=2) if best_params else None)

# Freeze the number of trees at the early-stopped iteration of the best trial
# (when one was recorded), replacing the n_estimators upper bound used in tuning.
# NOTE(review): dict(best_params) raises TypeError if no trial ran (params is None).
final_params = dict(best_params)
if best_iter and int(best_iter) > 0:
    final_params["n_estimators"] = int(best_iter)

# Refit on train + validation combined; no eval set or early stopping here,
# so exactly n_estimators rounds are trained.
X_trval = np.vstack([X_train, X_val])
y_trval = np.concatenate([y_train, y_val])

final_model = LGBMClassifier(**final_params)
final_model.fit(X_trval, y_trval)

# Evaluate on the test split using the predicted probability of class 1.
y_prob = final_model.predict_proba(X_test)[:, 1].astype(float).reshape(-1)
y_true = np.asarray(y_test, dtype=int).reshape(-1)

metrics = evaluate_churn_metrics(y_true, y_prob)
print("Test PR-AUC:", float(metrics.get("PR-AUC (Average Precision)", 0.0)))

# Precision-recall curve on the test split. The PR-AUC shown in the legend is
# taken from `metrics` when present, otherwise recomputed with scikit-learn.
precision, recall, _ = precision_recall_curve(y_true, y_prob)
pr_auc_val = metrics.get("PR-AUC (Average Precision)")
pr_auc_val = float(average_precision_score(y_true, y_prob)) if pr_auc_val is None else float(pr_auc_val)

fig_pr, ax_pr = plt.subplots(figsize=(6, 5))
ax_pr.plot(recall, precision, lw=2, label=f"PR-AUC={pr_auc_val:.5f}")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision-Recall Curve")
ax_pr.grid(alpha=0.3)
ax_pr.legend()
fig_pr.tight_layout()

# Collect the figures to save: the PR curve plus one confusion matrix for each
# top-K% cutoff (5, 10, 15 and 30 percent of the test scores).
figures = {"pr_curve": fig_pr}
for k_pct in (5, 10, 15, 30):
    figures[f"confusion_matrix_top{k_pct}"] = plot_confusion_topk(
        y_true=y_true, y_prob=y_prob, k_pct=int(k_pct), labels=("non_m2", "m2")
    )

# Hand the model, test predictions, metrics, figures and run configuration to
# the project's save helper. Judging by the loop below it returns a mapping of
# artifact name -> saved location.
saved = save_model_and_artifacts(
    model=final_model,
    model_name=MODEL_NAME,
    model_type="ml",
    model_id=MODEL_ID,
    split="test",
    metrics=metrics,
    y_true=y_true,
    y_prob=y_prob,
    version=VERSION,
    scaler=None,
    figures=figures,
    config={
        "model_name": MODEL_NAME,
        "model_type": "ml",
        "version": VERSION,
        "feature_source": "features_ml_clean.parquet",
        "tuning": {
            "n_trials": int(N_TRIALS),
            "tune_k": int(TUNE_K),
            "alpha": float(ALPHA),
            "beta": float(BETA),
            "best_val_score": float(tune["score"]),
            "best_val_topk": best_topk,
            "best_val_pr_auc": float(best_pr_auc) if best_pr_auc is not None else None,
            "best_iteration": int(best_iter) if best_iter else None,
            # Final parameters (n_estimators already set to the best iteration),
            # sorted by name; n_jobs is left out of the saved config.
            "best_params": {k: final_params[k] for k in sorted(final_params.keys()) if k not in ["n_jobs"]},
        },
    },
)

print("saved keys:", list(saved.keys()))
for k, v in saved.items():
    print(k, "->", v)

# Close every figure created above now that they have been passed to the saver.
plt.close(fig_pr)
for k_pct in (5, 10, 15, 30):
    plt.close(figures[f"confusion_matrix_top{k_pct}"])

print("Done")

Load parquet files
rows: 813540 train/val/test: 574092 137615 101833
n_features: 14
train positives/negatives: 461642 112450




trial 6 / 30 best_score= 0.33197340491099986 best_val_topk_recall= 0.06499516507804945 best_val_topk_lift= 1.1329081244098511 best_val_pr_auc= 0.9168106443120612




trial 12 / 30 best_score= 0.332996643951732 best_val_topk_recall= 0.06633340240364691 best_val_topk_lift= 1.1329863685959873 best_val_pr_auc= 0.9165917967219137




trial 18 / 30 best_score= 0.332996643951732 best_val_topk_recall= 0.06633340240364691 best_val_topk_lift= 1.1329863685959873 best_val_pr_auc= 0.9165917967219137




trial 24 / 30 best_score= 0.332996643951732 best_val_topk_recall= 0.06633340240364691 best_val_topk_lift= 1.1329863685959873 best_val_pr_auc= 0.9165917967219137




trial 30 / 30 best_score= 0.332996643951732 best_val_topk_recall= 0.06633340240364691 best_val_topk_lift= 1.1329863685959873 best_val_pr_auc= 0.9165917967219137
best_val_score: 0.332996643951732
best_val_topk: {
  "k_pct": 5,
  "thr": 0.7833491720416923,
  "recall": 0.06633340240364691,
  "precision": 0.9535807372471142,
  "lift": 1.1329863685959873,
  "base_rate": 0.8416524361443156,
  "n_sel": 8057
}
best_val_pr_auc: 0.9165917967219137
best_iter: 163
best_params: {
  "objective": "binary",
  "random_state": 42,
  "n_estimators": 10000,
  "n_jobs": -1,
  "class_weight": "balanced",
  "verbose": -1,
  "force_row_wise": true,
  "learning_rate": 0.03,
  "num_leaves": 127,
  "max_depth": -1,
  "min_child_samples": 10,
  "subsample": 1.0,
  "subsample_freq": 5,
  "colsample_bytree": 0.8,
  "reg_alpha": 0.1,
  "reg_lambda": 1.0,
  "min_split_gain": 0.0001,
  "min_child_weight": 10.0,
  "max_bin": 63,
  "min_data_in_bin": 1,
  "feature_fraction_bynode": 1.0
}




Test PR-AUC: 0.9334303518442961
saved keys: ['model', 'config', 'eval_dir', 'version_dir', 'figure_pr_curve', 'figure_confusion_matrix_top5', 'figure_confusion_matrix_top10', 'figure_confusion_matrix_top15', 'figure_confusion_matrix_top30']
model -> /Users/jy/project_2nd/SKN23-2nd-3Team/models/ml/lgbm/tuned_on_val/model.pkl
config -> /Users/jy/project_2nd/SKN23-2nd-3Team/models/configs/lgbm/tuned_on_val/config.json
eval_dir -> /Users/jy/project_2nd/SKN23-2nd-3Team/models/eval/mllgbm
version_dir -> tuned_on_val
figure_pr_curve -> /Users/jy/project_2nd/SKN23-2nd-3Team/assets/training/lgbm/tuned_on_val/pr_curve.png
figure_confusion_matrix_top5 -> /Users/jy/project_2nd/SKN23-2nd-3Team/assets/training/lgbm/tuned_on_val/confusion_matrix_top5.png
figure_confusion_matrix_top10 -> /Users/jy/project_2nd/SKN23-2nd-3Team/assets/training/lgbm/tuned_on_val/confusion_matrix_top10.png
figure_confusion_matrix_top15 -> /Users/jy/project_2nd/SKN23-2nd-3Team/assets/training/lgbm/tuned_on_val/confusion_mat