In [None]:
import sys
from pathlib import Path

# Locate the project root: start at the current working directory and climb
# towards the filesystem root until a directory containing both "app" and
# "models" is found.
p = Path.cwd().resolve()
_levels_left = 6  # the starting directory plus at most five ancestors
while _levels_left > 0:
    if (p / "app").exists() and (p / "models").exists():
        break
    p = p.parent
    _levels_left -= 1
else:
    # Ran out of levels without hitting `break`: nothing matched.
    raise RuntimeError("❌ 프로젝트 루트를 찾지 못했어요. app/ 와 models/ 가 있는 폴더에서 실행해야 합니다.")

# Put the root first on sys.path so it wins over other entries.
sys.path.insert(0, str(p))
print("✅ project root added to sys.path:", p)

In [1]:
import sys
from pathlib import Path
import warnings
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, confusion_matrix, average_precision_score
from sklearn.exceptions import ConvergenceWarning

# Notebook bootstrap: find the project root, make the `app` package importable,
# import the project helpers, then configure warnings and the NumPy seed.

# The project root is the nearest directory (the cwd itself or any ancestor)
# that contains all three of app/, models/ and data/.
here = Path.cwd().resolve()
project_root = next(
    (
        p
        for p in [here, *here.parents]
        if (p / "app").is_dir() and (p / "models").is_dir() and (p / "data").is_dir()
    ),
    None,
)
if project_root is None:
    raise FileNotFoundError(f"프로젝트 루트를 못 찾았어. 현재 위치={here}")

# Only prepend once so re-running the cell does not keep growing sys.path.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Forget any already-imported `app` modules so the imports below execute the
# package again — presumably to pick up source edits when the cell is re-run
# (TODO confirm). Iterates over a copy of the keys because entries are deleted.
for k in list(sys.modules.keys()):
    if k == "app" or k.startswith("app."):
        del sys.modules[k]

# Project-local helpers; these must come after the sys.path / sys.modules setup.
from app.utils.paths import DEFAULT_PATHS as P, ensure_runtime_dirs
from app.utils.metrics import evaluate_churn_metrics
from app.utils.save import save_model_and_artifacts

# Silence FutureWarning and scikit-learn ConvergenceWarning process-wide.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Project helper; presumably creates the output directories used later —
# see app.utils.paths (not visible here).
ensure_runtime_dirs()

# Seed NumPy's global random generator.
np.random.seed(42)


def plot_confusion_matrix_figure(y_true, y_pred, title: str, labels=("non_m2", "m2")):
    """Draw a confusion matrix as an annotated heatmap and return the figure.

    Args:
        y_true: Ground-truth class ids; flattened and cast to int. Class id
            ``i`` is displayed with ``labels[i]``.
        y_pred: Predicted class ids, same encoding and length as ``y_true``.
        title: Title placed above the axes.
        labels: Display names, one per class id, in class-id order.

    Returns:
        The matplotlib ``Figure``. It is neither shown nor closed here.
    """
    y_true_arr = np.asarray(y_true, dtype=int).reshape(-1)
    y_pred_arr = np.asarray(y_pred, dtype=int).reshape(-1)

    tick_labels = list(labels)
    class_ids = list(range(len(tick_labels)))

    # Pin the class order explicitly. Without `labels=`, scikit-learn infers
    # the classes from the data, so a sample where only one class occurs gives
    # a 1x1 matrix that no longer lines up with the tick labels below.
    cm = confusion_matrix(y_true_arr, y_pred_arr, labels=class_ids)

    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, interpolation="nearest", aspect="equal", cmap="Blues")
    fig.colorbar(im, ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_xticks(class_ids)
    ax.set_yticks(class_ids)
    ax.set_xticklabels(tick_labels)
    ax.set_yticklabels(tick_labels)

    # Cells darker than half of the maximum count get white text for contrast.
    thresh = float(cm.max()) / 2.0 if cm.size else 0.0
    for i, j in np.ndindex(*cm.shape):
        ax.text(
            j,
            i,
            str(cm[i, j]),
            ha="center",
            va="center",
            color="white" if float(cm[i, j]) > thresh else "black",
            fontsize=12,
        )

    # Fix the limits so every cell is fully visible; the y-limits are given
    # top-down so row 0 is drawn at the top.
    ax.set_xlim(-0.5, cm.shape[1] - 0.5)
    ax.set_ylim(cm.shape[0] - 0.5, -0.5)
    fig.tight_layout()
    return fig


def topk_threshold(y_prob: np.ndarray, k_pct: int) -> float:
    """Return the score of the lowest-ranked item inside the top ``k_pct`` percent.

    Scores are ranked in descending order and the top
    ``floor(n * k_pct / 100)`` items are taken, clamped to between 1 and ``n``
    items. The returned value is the score of the last item taken, so
    ``y_prob >= threshold`` selects at least that many items (more when
    scores tie at the threshold).

    Args:
        y_prob: Scores; any array-like, flattened to 1-D floats.
        k_pct: Percentage of items to select. Values above 100 select
            everything; values that round down to zero items select one.

    Returns:
        The threshold score as a Python float.

    Raises:
        ValueError: If ``y_prob`` is empty, since no threshold exists.
    """
    prob = np.asarray(y_prob, dtype=float).reshape(-1)
    if prob.size == 0:
        raise ValueError("topk_threshold requires at least one score")

    order = np.argsort(-prob)  # indices by descending score
    n_sel = int(np.floor(prob.size * (float(k_pct) / 100.0)))
    # Clamp to a valid rank: at least one item, at most all of them.
    n_sel = min(max(n_sel, 1), prob.size)
    return float(prob[order[n_sel - 1]])


def plot_confusion_topk(y_true, y_prob, k_pct: int, labels=("non_m2", "m2")):
    """Plot the confusion matrix obtained by flagging the top ``k_pct`` percent of scores.

    The cut-off comes from ``topk_threshold``; every item whose score is at
    least the cut-off is predicted as class 1, the rest as class 0.

    Args:
        y_true: Ground-truth labels, passed through to the plotting helper.
        y_prob: Scores, converted to a float array.
        k_pct: Percentage of top-scored items to flag (cast to int).
        labels: Display names forwarded to the plotting helper.

    Returns:
        The figure produced by ``plot_confusion_matrix_figure``.
    """
    scores = np.asarray(y_prob, dtype=float)
    pct = int(k_pct)
    cutoff = topk_threshold(scores, pct)
    flagged = (scores >= cutoff).astype(int)
    heading = f"Confusion Matrix (Top {pct}%, thr={cutoff:.5f})"
    return plot_confusion_matrix_figure(y_true, flagged, title=heading, labels=labels)


def topk_precision_recall_lift(y_true: np.ndarray, y_prob: np.ndarray, k_pct: int = 5):
    """Compute precision, recall and lift among the top ``k_pct`` percent of scores.

    Items whose score is at least the ``topk_threshold`` cut-off are treated
    as selected. Because the comparison is ``>=``, ties at the cut-off are
    all selected.

    Args:
        y_true: Binary ground-truth labels (cast to int, flattened).
        y_prob: Scores aligned with ``y_true`` (cast to float, flattened).
        k_pct: Percentage of top-scored items to select.

    Returns:
        Tuple ``(precision_k, recall_k, lift_k, threshold)`` of floats.
        ``precision_k`` is the positive rate among selected items,
        ``recall_k`` the share of all positives that were selected, and
        ``lift_k`` is ``precision_k`` divided by the overall positive rate.
        Each ratio is 0.0 when its denominator is zero.
    """
    y_true = np.asarray(y_true, dtype=int).reshape(-1)
    y_prob = np.asarray(y_prob, dtype=float).reshape(-1)

    thr = topk_threshold(y_prob, int(k_pct))
    selected = y_prob >= thr

    if selected.any():
        hits = y_true[selected]
        precision_k = float(hits.mean())
        selected_pos = float(hits.sum())
    else:
        # Nothing passed the comparison (e.g. the threshold is NaN).
        precision_k = 0.0
        selected_pos = 0.0

    total_pos = float(y_true.sum())
    # Explicit zero guard instead of an epsilon in the denominator, so recall
    # is exact and is handled the same way as lift below.
    recall_k = float(selected_pos / total_pos) if total_pos > 0 else 0.0

    base_rate = float(y_true.mean())
    lift_k = float(precision_k / base_rate) if base_rate > 0 else 0.0

    return precision_k, recall_k, lift_k, float(thr)


def tune_lr_on_val_topk(
    X_train_s,
    y_train,
    X_val_s,
    y_val,
    seed=42,
    k_pct=10,
    alpha=0.85,
):
    """Grid-search LogisticRegression settings and keep the best one on validation.

    Every candidate is fitted on the training data and scored on the
    validation data with
    ``alpha * Recall@Top-k% + (1 - alpha) * Lift@Top-k%``.
    The first candidate with the strictly highest score wins.

    Args:
        X_train_s: Training feature matrix (the caller passes scaled features).
        y_train: Training labels.
        X_val_s: Validation feature matrix, prepared like ``X_train_s``.
        y_val: Validation labels (cast to int, flattened).
        seed: ``random_state`` passed to every model.
        k_pct: Top-k percentage used for the recall/lift metrics.
        alpha: Weight of recall in the selection score.

    Returns:
        Dict with keys ``score``, ``params`` (keyword arguments for
        ``LogisticRegression``), ``val_precision``, ``val_recall``,
        ``val_lift``, ``val_thr``, ``val_pr_auc``, ``k_pct`` and ``alpha``.
        ``params`` and the ``val_*`` entries remain ``None`` only if no
        candidate produced a comparable score (e.g. every score is NaN).
    """
    C_list = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0]
    l1_ratio_list = [0.15, 0.3, 0.5, 0.7, 0.85]

    # Order matters: ties on the score are resolved in favour of the earliest
    # candidate, so keep L2 (lbfgs) first, then L1, then elastic net.
    candidates = [
        {"solver": "lbfgs", "penalty": "l2", "C": float(C), "l1_ratio": None}
        for C in C_list
    ]
    candidates += [
        {"solver": "saga", "penalty": "l1", "C": float(C), "l1_ratio": None}
        for C in C_list
    ]
    candidates += [
        {"solver": "saga", "penalty": "elasticnet", "C": float(C), "l1_ratio": float(l1r)}
        for C in C_list
        for l1r in l1_ratio_list
    ]

    k_pct = int(k_pct)
    alpha = float(alpha)

    best = {
        # -inf rather than -1.0: with alpha outside [0, 1] a valid score can be
        # below -1, and a finite sentinel would then reject every candidate.
        "score": float("-inf"),
        "params": None,
        "val_precision": None,
        "val_recall": None,
        "val_lift": None,
        "val_thr": None,
        "val_pr_auc": None,
        "k_pct": k_pct,
        "alpha": alpha,
    }

    y_val_int = np.asarray(y_val, dtype=int).reshape(-1)

    for cand in candidates:
        params = dict(
            solver=cand["solver"],
            penalty=cand["penalty"],
            C=cand["C"],
            max_iter=8000,
            class_weight="balanced",
            random_state=seed,
        )
        # l1_ratio is only passed for the elastic-net penalty.
        if cand["penalty"] == "elasticnet":
            params["l1_ratio"] = cand["l1_ratio"]
        if cand["solver"] == "saga":
            params["n_jobs"] = -1

        model = LogisticRegression(**params)
        model.fit(X_train_s, y_train)

        # Probability of the positive class (column 1).
        val_prob = model.predict_proba(X_val_s)[:, 1].astype(float)
        prec_k, rec_k, lift_k, thr = topk_precision_recall_lift(y_val_int, val_prob, k_pct=k_pct)
        pr_auc = float(average_precision_score(y_val_int, val_prob))

        score = float(alpha * rec_k + (1.0 - alpha) * lift_k)

        if score > best["score"]:
            best.update(
                {
                    "score": score,
                    "params": params,
                    "val_precision": prec_k,
                    "val_recall": rec_k,
                    "val_lift": lift_k,
                    "val_thr": thr,
                    "val_pr_auc": pr_auc,
                }
            )

    return best


# --- Load the input tables, join them, and build the train/val/test arrays ---
print("Load parquet files")

# Paths are resolved by the project's path helper (app.utils.paths).
anchors = pd.read_parquet(P.parquet_path("anchors"))
features = pd.read_parquet(P.parquet_path("features_ml_clean"))
labels = pd.read_parquet(P.parquet_path("labels"))

# Give the join key the same dtype (str) in all three tables.
for df in (anchors, features, labels):
    df["user_id"] = df["user_id"].astype(str)

# The split assignment is taken from `labels`; dropping a `split` column from
# `anchors` keeps the merge below from producing suffixed duplicates of it.
if "split" in anchors.columns:
    anchors = anchors.drop(columns=["split"])

if "split" not in labels.columns:
    raise KeyError(f"labels.parquet에 split 컬럼이 없습니다. labels columns head: {list(labels.columns)[:50]}")

# Inner joins on (user_id, anchor_time): rows missing from any table are dropped.
data = anchors.merge(features, on=["user_id", "anchor_time"], how="inner")
data = data.merge(labels[["user_id", "anchor_time", "label", "split"]], on=["user_id", "anchor_time"], how="inner")

# Binary target: 1 when the label is "m2", otherwise 0.
data["target"] = data["label"].astype(str).eq("m2").astype(int)
split_col = data["split"].astype(str)

# Model inputs are every column of `features` except the join keys;
# missing values are replaced with 0.0.
feature_cols = [c for c in features.columns if c not in ("user_id", "anchor_time")]
X_all = data.loc[:, feature_cols].fillna(0.0)
y_all = data["target"].to_numpy(dtype=int)

# Boolean row masks per split. Rows whose split value is none of
# "train" / "val" / "test" end up in no subset.
idx_train = split_col.eq("train").to_numpy()
idx_val = split_col.eq("val").to_numpy()
idx_test = split_col.eq("test").to_numpy()

X_train = X_all.loc[idx_train].to_numpy(dtype=float)
y_train = y_all[idx_train]
X_val = X_all.loc[idx_val].to_numpy(dtype=float)
y_val = y_all[idx_val]
X_test = X_all.loc[idx_test].to_numpy(dtype=float)
y_test = y_all[idx_test]

print("rows:", len(data), "train/val/test:", int(idx_train.sum()), int(idx_val.sum()), int(idx_test.sum()))
print("n_features:", int(X_train.shape[1]))

# --- Standardize features and pick hyper-parameters on the validation split ---
# The scaler is fitted on the training rows only and then applied unchanged
# to the validation and test rows.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

# Selection score = alpha * Recall@Top-k% + (1 - alpha) * Lift@Top-k%.
k_pct_for_selection = 10
alpha_for_selection = 0.85

tuned = tune_lr_on_val_topk(
    X_train_s,
    y_train,
    X_val_s,
    y_val,
    seed=42,
    k_pct=k_pct_for_selection,
    alpha=alpha_for_selection,
)

# Report the winning candidate's validation metrics and its parameters.
print("best_val_score:", float(tuned["score"]))
print("best_val_pr_auc:", float(tuned["val_pr_auc"]))
print("best_val_precision_k:", float(tuned["val_precision"]))
print("best_val_recall_k:", float(tuned["val_recall"]))
print("best_val_lift_k:", float(tuned["val_lift"]))
print("best_lr_params:", json.dumps(tuned["params"], ensure_ascii=False, indent=2))

# --- Refit the selected configuration on train + val and evaluate on test ---
# The stacked rows were standardized with the scaler fitted on train only.
X_tv_s = np.vstack([X_train_s, X_val_s])
y_tv = np.concatenate([np.asarray(y_train, dtype=int), np.asarray(y_val, dtype=int)])

# Copy so the dict stored in `tuned` is not shared with the final model config.
final_params = tuned["params"].copy()
model = LogisticRegression(**final_params)
model.fit(X_tv_s, y_tv)

# Positive-class probability (column 1) for the held-out test rows.
y_prob = model.predict_proba(X_test_s)[:, 1].astype(float).reshape(-1)
y_true = np.asarray(y_test, dtype=int).reshape(-1)

# Test metrics come from the project helper; the validation-side selection
# results are added to the same mapping so they are saved together.
metrics = evaluate_churn_metrics(y_true, y_prob)
metrics["val_selection_k_pct"] = int(k_pct_for_selection)
metrics["val_selection_alpha"] = float(alpha_for_selection)
metrics["best_val_score"] = float(tuned["score"])
metrics["best_val_precision_k"] = float(tuned["val_precision"])
metrics["best_val_recall_k"] = float(tuned["val_recall"])
metrics["best_val_lift_k"] = float(tuned["val_lift"])
metrics["best_val_pr_auc"] = float(tuned["val_pr_auc"])

# --- Build the evaluation figures (PR curve + top-k confusion matrices) ---
# NOTE(review): this print falls back to 0.0 when the metric key is missing,
# whereas the plot label below falls back to a freshly computed value.
print("PR-AUC:", float(metrics.get("PR-AUC (Average Precision)", 0.0)))

precision, recall, _ = precision_recall_curve(y_true, y_prob)
# Prefer the value reported by the metrics helper; compute it only if absent.
pr_auc_val = metrics.get("PR-AUC (Average Precision)")
pr_auc_val = float(average_precision_score(y_true, y_prob)) if pr_auc_val is None else float(pr_auc_val)

fig_pr, ax_pr = plt.subplots(figsize=(6, 5))
ax_pr.plot(recall, precision, lw=2, label=f"PR-AUC={pr_auc_val:.5f}")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision-Recall Curve")
ax_pr.grid(alpha=0.3)
ax_pr.legend()
fig_pr.tight_layout()

# Figures are collected by name and handed to the save helper afterwards.
figures = {"pr_curve": fig_pr}
# One confusion matrix per top-k cut-off on the test scores.
for k_pct in (5, 10, 15, 30):
    figures[f"confusion_matrix_top{k_pct}"] = plot_confusion_topk(
        y_true=y_true,
        y_prob=y_prob,
        k_pct=int(k_pct),
        labels=("non_m2", "m2"),
    )

# --- Persist the model, scaler, config, metrics and figures ---
# `save_model_and_artifacts` is a project helper (app.utils.save); the code
# below treats its return value as a mapping (`.keys()` / `.items()`).
saved = save_model_and_artifacts(
    model=model,
    model_name="mllg",
    model_type="ml",
    model_id="ml__mllg",
    split="test",
    metrics=metrics,
    y_true=y_true,
    y_prob=y_prob,
    version="tuned_on_val",
    scaler=scaler,
    figures=figures,
    config={
        "model_name": "mllg",
        "model_type": "ml",
        "version": "tuned_on_val",
        "feature_source": "features_ml_clean.parquet",
        # NOTE(review): `1.0-alpha_for_selection` is rendered with full float
        # repr, so alpha=0.85 shows up as 0.15000000000000002 in this string.
        "selection_rule": f"val score = {alpha_for_selection}*Recall@Top{k_pct_for_selection}% + {1.0-alpha_for_selection}*Lift@Top{k_pct_for_selection}%",
        "best_lr_params": final_params,
    },
)

print("saved keys:", list(saved.keys()))
for k, v in saved.items():
    print(k, "->", v)

# Close every figure created above to release matplotlib resources.
plt.close(fig_pr)
for k_pct in (5, 10, 15, 30):
    plt.close(figures[f"confusion_matrix_top{k_pct}"])

print("Done")

Load parquet files
rows: 813540 train/val/test: 574092 137615 101833
n_features: 14




best_val_score: 0.2624901150924944
best_val_pr_auc: 0.911759992868709
best_val_precision_k: 0.9401206307681128
best_val_recall_k: 0.11169533084680204
best_val_lift_k: 1.1169938924847513
best_lr_params: {
  "solver": "lbfgs",
  "penalty": "l2",
  "C": 0.003,
  "max_iter": 8000,
  "class_weight": "balanced",
  "random_state": 42
}
PR-AUC: 0.9302892269175418
saved keys: ['model', 'scaler', 'config', 'eval_dir', 'version_dir', 'figure_pr_curve', 'figure_confusion_matrix_top5', 'figure_confusion_matrix_top10', 'figure_confusion_matrix_top15', 'figure_confusion_matrix_top30']
model -> /Users/jy/project_2nd/SKN23-2nd-3Team/models/ml/mllg/tuned_on_val/model.pkl
scaler -> /Users/jy/project_2nd/SKN23-2nd-3Team/models/preprocessing/mllg/tuned_on_val/scaler.pkl
config -> /Users/jy/project_2nd/SKN23-2nd-3Team/models/configs/mllg/tuned_on_val/config.json
eval_dir -> /Users/jy/project_2nd/SKN23-2nd-3Team/models/eval/mlmllg
version_dir -> tuned_on_val
figure_pr_curve -> /Users/jy/project_2nd/SKN23-2nd