In [3]:
import sys
from pathlib import Path

# Walk upward from the current working directory until a directory that
# contains BOTH "app/" and "models/" is found, then put that directory at the
# front of sys.path so `app.*` and `models.*` can be imported below.
p = Path.cwd().resolve()
for _ in range(6):  # checks the cwd itself plus up to 5 parent directories
    # is_dir() rather than exists(): a stray *file* named "app" or "models"
    # must not be mistaken for the package folders.
    if (p / "app").is_dir() and (p / "models").is_dir():
        # Remove any earlier copy first so re-running this cell is idempotent
        # (no duplicate entries) and the root still ends up at position 0.
        sys.path[:] = [entry for entry in sys.path if entry != str(p)]
        sys.path.insert(0, str(p))
        print("✅ project root added to sys.path:", p)
        break
    p = p.parent
else:
    # for/else: reached only when the loop finished without `break`,
    # i.e. no candidate directory matched.
    raise RuntimeError("❌ 프로젝트 루트를 찾지 못했어요. app/ 와 models/ 가 있는 폴더에서 실행해야 합니다.")

✅ project root added to sys.path: /Users/jy/project_2nd/SKN23-2nd-3Team


In [4]:
from __future__ import annotations

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, confusion_matrix, average_precision_score

from app.utils.paths import DEFAULT_PATHS as P, ensure_runtime_dirs
from app.utils.metrics import evaluate_churn_metrics
from app.utils.save import save_model_and_artifacts
from models.model_definitions import MLP_advanced

# Optional: configure a Korean-capable matplotlib font. This is best-effort,
# so a failure must not stop the notebook, but the reason is reported instead
# of being swallowed silently (otherwise broken glyphs in figures are hard to
# trace back to this step).
try:
    from app.utils.plotting import configure_matplotlib_korean
    configure_matplotlib_korean()
except Exception as exc:
    print(f"[warn] configure_matplotlib_korean skipped: {exc!r}")



# 0) Config

# Runs at import/cell-execution time, before any training or saving.
# NOTE(review): presumably creates the runtime output directories used by the
# project helpers — confirm in app.utils.paths.
ensure_runtime_dirs()

SEED = 42            # passed to seed_everything() in the main section
BATCH_SIZE = 512     # batch size for the train / val / test DataLoaders
LR = 1e-3            # Adam learning rate
WEIGHT_DECAY = 1e-5  # Adam weight decay
N_EPOCHS = 30        # number of training epochs

# K (in percent) used for operations / campaign targeting; also the criterion
# for choosing the best epoch (validation Recall@K).
BEST_K_PCT = 10
# K values (in percent) for which a top-K confusion matrix is produced on test.
REPORT_K_LIST = (5, 10, 15, 30)

# Class-imbalance correction via BCEWithLogitsLoss(pos_weight=...); set to True when needed.
USE_POS_WEIGHT = True


def seed_everything(seed: int = 42) -> None:
    """Seed every random number generator this notebook can touch.

    Seeds Python's built-in ``random``, NumPy's legacy global RNG, and
    PyTorch (CPU generator plus all CUDA devices).

    Args:
        seed: Seed value applied to all generators.
    """
    # Local import: the notebook's import cell does not load `random`.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)



# 1) Utility: Plot / TopK

def plot_confusion_matrix_figure(y_true, y_pred, title: str, labels=("non_m2", "m2")):
    """Draw a binary (0/1) confusion matrix as a heat map and return the figure.

    Args:
        y_true: Ground-truth labels; flattened and cast to int. Expected to
            hold 0/1 values only.
        y_pred: Predicted labels; flattened and cast to int. Expected to hold
            0/1 values only.
        title: Figure title.
        labels: Display names for class 0 and class 1, in that order.

    Returns:
        The matplotlib ``Figure``. The caller is responsible for closing it.
    """
    y_true_arr = np.asarray(y_true, dtype=int).reshape(-1)
    y_pred_arr = np.asarray(y_pred, dtype=int).reshape(-1)
    # Pin the class order so the matrix is always 2x2. Without `labels`,
    # sklearn infers the classes from the data, and a sample where only one
    # class occurs would give a 1x1 matrix that no longer matches the two
    # ticks / tick labels drawn below.
    cm = confusion_matrix(y_true_arr, y_pred_arr, labels=[0, 1])

    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, interpolation="nearest", aspect="equal", cmap="Blues")
    fig.colorbar(im, ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(list(labels))
    ax.set_yticklabels(list(labels))

    # Cells darker than half of the maximum count get white text for contrast.
    thresh = float(cm.max()) / 2.0 if cm.size else 0.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j, i, str(cm[i, j]),
                ha="center", va="center",
                color="white" if float(cm[i, j]) > thresh else "black",
                fontsize=12,
            )

    # Explicit limits with an inverted y-axis keep row 0 at the top and
    # prevent the outer cells from being clipped.
    ax.set_xlim(-0.5, cm.shape[1] - 0.5)
    ax.set_ylim(cm.shape[0] - 0.5, -0.5)
    fig.tight_layout()
    return fig


def topk_threshold(y_prob: np.ndarray, k_pct: int) -> float:
    """Return the score of the last item inside the top ``k_pct`` percent.

    Using ``y_prob >= threshold`` as the decision rule selects the top-K
    items (ties at the threshold can select a few more).

    Args:
        y_prob: Scores / probabilities; flattened to 1-D.
        k_pct: Percentage of items to select. At least one item and at most
            all items are selected, so values <= 0 or > 100 are clamped.

    Returns:
        The score of the n-th highest item, n = floor(len * k_pct / 100).

    Raises:
        ValueError: If ``y_prob`` is empty.
    """
    prob = np.asarray(y_prob, dtype=float).reshape(-1)
    if prob.size == 0:
        raise ValueError("topk_threshold requires at least one score")

    # Multiply first, then floor-divide by 100. `len * (k / 100.0)` loses a
    # row to float rounding (100 * (29 / 100.0) == 28.999999999999996 -> 28).
    n_sel = int((prob.size * float(k_pct)) // 100)
    # Clamp to [1, len] so k_pct > 100 cannot index past the end.
    n_sel = min(max(n_sel, 1), int(prob.size))

    order = np.argsort(-prob)  # indices sorted by descending score
    return float(prob[order[n_sel - 1]])


def plot_confusion_topk(y_true, y_prob, k_pct: int, labels=("non_m2", "m2")):
    """Plot the confusion matrix obtained by flagging the top ``k_pct`` percent.

    The cut-off comes from ``topk_threshold``; every item whose score is
    greater than or equal to it is predicted positive.

    Args:
        y_true: Ground-truth labels.
        y_prob: Scores / probabilities.
        k_pct: Percentage of items to flag as positive.
        labels: Display names for class 0 and class 1.

    Returns:
        The figure produced by ``plot_confusion_matrix_figure``.
    """
    k = int(k_pct)
    scores = np.asarray(y_prob, dtype=float)
    cutoff = topk_threshold(scores, k)
    flagged = (scores >= cutoff).astype(int)
    heading = f"Confusion Matrix (Top {k}%, thr={cutoff:.5f})"
    return plot_confusion_matrix_figure(y_true, flagged, title=heading, labels=labels)


def recall_at_topk(y_true: np.ndarray, y_prob: np.ndarray, k_pct: int) -> float:
    """Share of all positives that fall inside the top ``k_pct`` percent by score.

    Args:
        y_true: Binary labels (cast to int, flattened).
        y_prob: Scores / probabilities (cast to float, flattened).
        k_pct: Percentage of items to select; at least one item is selected.

    Returns:
        positives in the top-K / total positives. 0.0 when there are no
        positives (the denominator is floored at 1).

    Raises:
        ValueError: If ``y_true`` and ``y_prob`` differ in length.
    """
    y_true = np.asarray(y_true).astype(int).reshape(-1)
    y_prob = np.asarray(y_prob).astype(float).reshape(-1)
    if y_true.shape[0] != y_prob.shape[0]:
        raise ValueError(
            f"y_true and y_prob must have the same length, got {y_true.shape[0]} and {y_prob.shape[0]}"
        )

    # Multiply first, then floor-divide by 100. `len * (k / 100.0)` loses a
    # row to float rounding (100 * (29 / 100.0) == 28.999999999999996 -> 28).
    n = max(int((len(y_true) * k_pct) // 100), 1)
    top_idx = np.argsort(-y_prob)[:n]  # indices of the n highest scores

    return float(y_true[top_idx].sum() / max(y_true.sum(), 1))


@torch.no_grad()
def predict_probs(model: nn.Module, loader: DataLoader, device: str):
    """Run ``model`` over ``loader`` and collect labels and sigmoid scores.

    The model is switched to eval mode and gradients are disabled by the
    decorator. Model outputs are treated as logits and flattened to 1-D.

    Args:
        model: Network whose forward pass returns one logit per row.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        device: Device the input batches are moved to before the forward pass.

    Returns:
        ``(y_true, y_prob)``: 1-D NumPy arrays of int labels and float scores,
        in the order the loader yielded them.
    """
    model.eval()
    prob_chunks = []
    label_chunks = []
    for inputs, targets in loader:
        scores = torch.sigmoid(model(inputs.to(device)).view(-1))
        prob_chunks.append(scores.detach().cpu().numpy().reshape(-1))
        label_chunks.append(targets.detach().cpu().numpy().reshape(-1))

    y_true = np.concatenate(label_chunks).astype(int).reshape(-1)
    y_prob = np.concatenate(prob_chunks).astype(float).reshape(-1)
    return y_true, y_prob



# 2) Load + Merge (anchors ⨝ features_ml_clean ⨝ labels)

def load_merged_dataset():
    """Load anchors / features / labels, join them, and split into arrays.

    The three parquet files are inner-joined on ``(user_id, anchor_time)``.
    The binary target is 1 where ``label == "m2"`` and 0 otherwise. Missing
    feature values are filled with 0.0. Rows are assigned to train / val /
    test according to the ``split`` column of the labels file.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test, feature_cols)``:
        float feature matrices, int target vectors, and the list of feature
        column names in matrix column order.

    Raises:
        KeyError: If the labels file lacks a ``split`` or ``label`` column.
        ValueError: If the features file contains a column named ``label``,
            ``split`` or ``target``.
    """
    print("Load parquet files")

    key_cols = ["user_id", "anchor_time"]

    anchors = pd.read_parquet(P.parquet_path("anchors"))
    features = pd.read_parquet(P.parquet_path("features_ml_clean"))  # team convention
    labels = pd.read_parquet(P.parquet_path("labels"))

    # Normalise the key dtype so the joins match on identical strings.
    for df in (anchors, features, labels):
        df["user_id"] = df["user_id"].astype(str)

    # Fail early, before the joins, with a message naming the missing column.
    if "split" not in labels.columns:
        raise KeyError(f"labels.parquet에 split 컬럼이 없습니다. labels columns head: {list(labels.columns)[:50]}")
    if "label" not in labels.columns:
        raise KeyError(f"labels.parquet has no 'label' column. labels columns head: {list(labels.columns)[:50]}")

    # A feature column with one of these names would either collide with the
    # label join (pandas adds _x/_y suffixes) or, for "target", be overwritten
    # by the label and leak it into X.
    reserved = sorted({"label", "split", "target"} & set(features.columns))
    if reserved:
        raise ValueError(f"features_ml_clean contains reserved column(s): {reserved}")

    # Only the join keys of `anchors` are used downstream. Dropping its other
    # columns (including any `split`) avoids _x/_y suffixing when a name also
    # appears in features or labels.
    data = anchors[key_cols].merge(features, on=key_cols, how="inner")
    data = data.merge(labels[key_cols + ["label", "split"]], on=key_cols, how="inner")

    data["target"] = data["label"].astype(str).eq("m2").astype(int)
    split_col = data["split"].astype(str)

    feature_cols = [c for c in features.columns if c not in key_cols]
    X_all = data.loc[:, feature_cols].fillna(0.0)
    y_all = data["target"].to_numpy(dtype=int)

    idx_train = split_col.eq("train").to_numpy()
    idx_val = split_col.eq("val").to_numpy()
    idx_test = split_col.eq("test").to_numpy()

    X_train = X_all.loc[idx_train].to_numpy(dtype=float)
    y_train = y_all[idx_train]
    X_val = X_all.loc[idx_val].to_numpy(dtype=float)
    y_val = y_all[idx_val]
    X_test = X_all.loc[idx_test].to_numpy(dtype=float)
    y_test = y_all[idx_test]

    print("rows:", len(data), "train/val/test:", int(idx_train.sum()), int(idx_val.sum()), int(idx_test.sum()))
    print("n_features:", int(X_train.shape[1]))

    return (X_train, y_train, X_val, y_val, X_test, y_test, feature_cols)



# 3) Train (best by val Recall@K, tie-break PR-AUC)

def train_with_best_selection(
    X_train: np.ndarray, y_train: np.ndarray,
    X_val: np.ndarray, y_val: np.ndarray,
    input_dim: int,
):
    """Train ``MLP_advanced`` and keep the weights of the best validation epoch.

    Features are standardised with a ``StandardScaler`` fitted on the training
    set only. The model is trained for ``N_EPOCHS`` epochs with Adam and
    ``BCEWithLogitsLoss`` (optionally with ``pos_weight``). After every epoch
    the validation Recall@``BEST_K_PCT``% and PR-AUC are computed; the epoch
    with the highest recall wins, ties (within 1e-12) are broken by PR-AUC.

    Reads the module-level settings ``BATCH_SIZE``, ``LR``, ``WEIGHT_DECAY``,
    ``N_EPOCHS``, ``BEST_K_PCT`` and ``USE_POS_WEIGHT``.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets (compared against 0 and 1 for pos_weight).
        X_val: Validation feature matrix.
        y_val: Validation targets.
        input_dim: Passed to the ``MLP_advanced`` constructor.

    Returns:
        ``(model, scaler, device)``: the model with the best-epoch weights
        loaded, the fitted scaler, and the device string that was used.
    """
    def get_device() -> str:
        """Pick the compute device: CUDA first, then Apple MPS, else CPU."""
        if torch.cuda.is_available():
            return "cuda"
        # hasattr guard: torch.backends may not expose `mps` on every build.
        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            return "mps"
        return "cpu"

    device = get_device()
    print("device:", device)

    # Fit the scaler on train only; validation is transformed with the
    # training statistics.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_val_s = scaler.transform(X_val)

    train_ds = TensorDataset(torch.tensor(X_train_s, dtype=torch.float32),
                             torch.tensor(y_train, dtype=torch.float32))
    val_ds = TensorDataset(torch.tensor(X_val_s, dtype=torch.float32),
                           torch.tensor(y_val, dtype=torch.float32))

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    model = MLP_advanced(input_dim)
    model.to(device)

    # --- loss (optional pos_weight) ---
    if USE_POS_WEIGHT:
        pos = int((y_train == 1).sum())
        neg = int((y_train == 0).sum())
        # pos_weight = neg / pos; max(pos, 1) avoids division by zero when
        # the training set has no positives.
        pw = torch.tensor([neg / max(pos, 1)], dtype=torch.float32).to(device)
        criterion = nn.BCEWithLogitsLoss(pos_weight=pw)
        print(f"pos_weight enabled: pos={pos} neg={neg} pos_weight={float(pw.item()):.4f}")
    else:
        criterion = nn.BCEWithLogitsLoss()

    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    def run_epoch(loader, train_mode: bool) -> float:
        """Run one pass over ``loader`` and return the sample-weighted mean loss.

        With ``train_mode=True`` the model is put in train mode and the
        optimizer steps on every batch; otherwise the model is in eval mode
        and gradients are disabled.
        """
        model.train() if train_mode else model.eval()
        loss_sum, n_sum = 0.0, 0
        with torch.set_grad_enabled(train_mode):
            for xb, yb in loader:
                xb = xb.to(device)
                yb = yb.to(device).view(-1)

                if train_mode:
                    optimizer.zero_grad()

                logits = model(xb).view(-1)
                loss = criterion(logits, yb)

                if train_mode:
                    loss.backward()
                    optimizer.step()

                # Weight each batch loss by its size so a smaller final batch
                # does not skew the epoch average.
                bs = int(xb.shape[0])
                loss_sum += float(loss.item()) * bs
                n_sum += bs

        return float(loss_sum / max(n_sum, 1))

    # Best-epoch bookkeeping; the -1.0 sentinels guarantee the first epoch
    # with non-negative metrics is accepted.
    best = {
        "epoch": 0,
        "recall_at_k": -1.0,
        "pr_auc": -1.0,
        "state_dict": None,
    }

    print("Train MLP_advanced (best by val Recall@{}%)".format(BEST_K_PCT))

    for epoch in range(1, N_EPOCHS + 1):
        tr_loss = run_epoch(train_loader, True)
        va_loss = run_epoch(val_loader, False)

        # val metric (prob)
        # NOTE(review): this is a second forward pass over the validation set
        # in the same epoch (run_epoch above already ran one for the loss).
        yv_true, yv_prob = predict_probs(model, val_loader, device)
        val_pr_auc = float(average_precision_score(yv_true, yv_prob))
        val_recall_k = recall_at_topk(yv_true, yv_prob, BEST_K_PCT)

        print(
            f"epoch {epoch}/{N_EPOCHS} "
            f"train_loss={tr_loss:.5f} val_loss={va_loss:.5f} "
            f"val_PR-AUC={val_pr_auc:.5f} val_Recall@{BEST_K_PCT}%={val_recall_k:.5f}"
        )

        # best update: Recall@K, tie-break PR-AUC
        # (recalls within 1e-12 of each other count as a tie)
        better = (val_recall_k > best["recall_at_k"] + 1e-12) or (
            abs(val_recall_k - best["recall_at_k"]) <= 1e-12 and val_pr_auc > best["pr_auc"]
        )
        if better:
            best["epoch"] = epoch
            best["recall_at_k"] = val_recall_k
            best["pr_auc"] = val_pr_auc
            # Clone to CPU so later optimizer steps cannot modify the snapshot.
            best["state_dict"] = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    # restore best
    if best["state_dict"] is not None:
        model.load_state_dict(best["state_dict"])
    print(f"BEST epoch={best['epoch']} val_Recall@{BEST_K_PCT}%={best['recall_at_k']:.5f} val_PR-AUC={best['pr_auc']:.5f}")

    return model, scaler, device



# 4) Test (single run) + Save artifacts

def evaluate_and_save(
    model: nn.Module,
    scaler: StandardScaler,
    device: str,
    X_test: np.ndarray,
    y_test: np.ndarray,
):
    """Score the test set once, build the report figures, and save everything.

    The test features are transformed with the already-fitted ``scaler``,
    scored with ``predict_probs``, and evaluated with
    ``evaluate_churn_metrics``. A precision-recall curve and one top-K
    confusion matrix per entry of ``REPORT_K_LIST`` are drawn and handed to
    ``save_model_and_artifacts`` together with the model, scaler, metrics and
    predictions. All figures are closed afterwards, even if plotting or
    saving raises.

    Reads the module-level settings ``BATCH_SIZE``, ``REPORT_K_LIST``,
    ``BEST_K_PCT`` and ``USE_POS_WEIGHT``.

    Args:
        model: Trained model to evaluate.
        scaler: Scaler fitted on the training data.
        device: Device string used for inference.
        X_test: Test feature matrix (unscaled).
        y_test: Test targets.

    Returns:
        None. Progress and the saved artifact mapping are printed.
    """
    print("Evaluate on test (single run)")

    X_test_s = scaler.transform(X_test)
    test_ds = TensorDataset(torch.tensor(X_test_s, dtype=torch.float32),
                            torch.tensor(y_test, dtype=torch.float32))
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    y_true, y_prob = predict_probs(model, test_loader, device)

    metrics = evaluate_churn_metrics(y_true, y_prob)

    # Resolve PR-AUC once so the printed value and the plot legend agree.
    # If the metrics mapping lacks the key, compute it directly instead of
    # reporting a misleading 0.0.
    pr_auc_val = metrics.get("PR-AUC (Average Precision)")
    pr_auc_val = float(average_precision_score(y_true, y_prob)) if pr_auc_val is None else float(pr_auc_val)
    print("PR-AUC:", pr_auc_val)

    figures = {}
    try:
        # PR curve fig
        precision, recall, _ = precision_recall_curve(y_true, y_prob)
        fig_pr, ax_pr = plt.subplots(figsize=(6, 5))
        # Register immediately so the figure is closed even if a later
        # plotting call fails.
        figures["pr_curve"] = fig_pr
        ax_pr.plot(recall, precision, lw=2, label=f"PR-AUC={pr_auc_val:.5f}")
        ax_pr.set_xlabel("Recall")
        ax_pr.set_ylabel("Precision")
        ax_pr.set_title("Precision-Recall Curve")
        ax_pr.grid(alpha=0.3)
        ax_pr.legend()
        fig_pr.tight_layout()

        for k_pct in REPORT_K_LIST:
            figures[f"confusion_matrix_top{k_pct}"] = plot_confusion_topk(
                y_true=y_true, y_prob=y_prob, k_pct=int(k_pct), labels=("non_m2", "m2")
            )

        print("Save artifacts")

        saved = save_model_and_artifacts(
            model=model,
            model_name="mlp_advanced",
            model_type="dl",
            model_id="dl__mlp_advanced",
            split="test",
            metrics=metrics,
            y_true=y_true,
            y_prob=y_prob,
            version="baseline",
            scaler=scaler,
            figures=figures,
            config={
                "model_name": "mlp_advanced",
                "model_type": "dl",
                "version": "baseline",
                "feature_source": "features_ml_clean.parquet",
                "best_selection": f"val Recall@{BEST_K_PCT}%",
                "use_pos_weight": bool(USE_POS_WEIGHT),
            },
        )
    finally:
        # close figs: always release them, including on failure, so repeated
        # runs in the same kernel do not accumulate open figures.
        for fig in figures.values():
            plt.close(fig)

    print("saved keys:", list(saved.keys()))
    for k, v in saved.items():
        print(k, "->", v)

    print("Done")


# 5) Main

# End-to-end run: seed -> load -> train (best epoch chosen on validation) ->
# single test evaluation + artifact save. The names bound here
# (X_train, ..., feature_cols, model, scaler, device) become module-level
# variables.
if __name__ == "__main__":
    seed_everything(SEED)

    # feature_cols is returned for reference; it is not used again in this block.
    X_train, y_train, X_val, y_val, X_test, y_test, feature_cols = load_merged_dataset()
    input_dim = int(X_train.shape[1])  # number of feature columns

    model, scaler, device = train_with_best_selection(
        X_train=X_train, y_train=y_train,
        X_val=X_val, y_val=y_val,
        input_dim=input_dim,
    )

    # The test split is scored exactly once, after model selection is finished.
    evaluate_and_save(
        model=model,
        scaler=scaler,
        device=device,
        X_test=X_test,
        y_test=y_test,
    )

Load parquet files
rows: 813540 train/val/test: 574092 137615 101833
n_features: 14
device: mps
pos_weight enabled: pos=461642 neg=112450 pos_weight=0.2436
Train MLP_advanced (best by val Recall@10%)
epoch 1/30 train_loss=0.24806 val_loss=0.22607 val_PR-AUC=0.91440 val_Recall@10%=0.11215
epoch 2/30 train_loss=0.24720 val_loss=0.22655 val_PR-AUC=0.91464 val_Recall@10%=0.11221
epoch 3/30 train_loss=0.24711 val_loss=0.22512 val_PR-AUC=0.91486 val_Recall@10%=0.11224
epoch 4/30 train_loss=0.24695 val_loss=0.22445 val_PR-AUC=0.91533 val_Recall@10%=0.11231
epoch 5/30 train_loss=0.24685 val_loss=0.22543 val_PR-AUC=0.91480 val_Recall@10%=0.11223
epoch 6/30 train_loss=0.24676 val_loss=0.22582 val_PR-AUC=0.91473 val_Recall@10%=0.11225
epoch 7/30 train_loss=0.24677 val_loss=0.22693 val_PR-AUC=0.91545 val_Recall@10%=0.11218
epoch 8/30 train_loss=0.24667 val_loss=0.22447 val_PR-AUC=0.91595 val_Recall@10%=0.11251
epoch 9/30 train_loss=0.24667 val_loss=0.22619 val_PR-AUC=0.91531 val_Recall@10%=0.11226