In [15]:
from pathlib import Path
import sys
import json
import math
import random

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import (
    average_precision_score,
    precision_recall_curve,
    confusion_matrix,
)

import matplotlib.pyplot as plt

In [None]:
PROJECT_ROOT = Path.cwd().parents[1]
sys.path.insert(0, str(PROJECT_ROOT))

def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(42)

def configure_matplotlib_korean():
    import platform
    import matplotlib as mpl
    from matplotlib import font_manager as fm

    system = platform.system()
    if system == "Darwin":
        candidates = ["AppleGothic", "Pretendard", "Noto Sans CJK KR", "NanumGothic"]
    elif system == "Windows":
        candidates = ["Malgun Gothic", "Pretendard", "Noto Sans CJK KR", "NanumGothic"]
    else:
        candidates = ["NanumGothic", "Noto Sans CJK KR", "Pretendard", "DejaVu Sans"]

    chosen = None
    for name in candidates:
        try:
            fm.findfont(name, fallback_to_default=False)
            chosen = name
            break
        except Exception:
            continue

    if chosen:
        mpl.rcParams["font.family"] = chosen
    mpl.rcParams["axes.unicode_minus"] = False
    return chosen

_ = configure_matplotlib_korean()

from app.utils.save import save_model_and_artifacts

In [17]:
SAMPLES_DIR = PROJECT_ROOT / "outputs" / "samples"
if not SAMPLES_DIR.exists():
    SAMPLES_DIR = PROJECT_ROOT / "data" / "processed"

anchors_path  = SAMPLES_DIR / "anchors.parquet"
features_path = SAMPLES_DIR / "features_ml_clean.parquet"
labels_path   = SAMPLES_DIR / "labels.parquet"

anchors  = pd.read_parquet(anchors_path)
features = pd.read_parquet(features_path)
labels   = pd.read_parquet(labels_path)

anchors["user_id"] = anchors["user_id"].astype(str)
features["user_id"] = features["user_id"].astype(str)
labels["user_id"] = labels["user_id"].astype(str)

if "split" not in labels.columns:
    raise ValueError("labels.parquet에 split 컬럼이 없습니다.")
labels = labels.rename(columns={"split": "split_label"})

data = anchors.merge(features, on=["user_id", "anchor_time"], how="inner")
data = data.merge(labels, on=["user_id", "anchor_time"], how="inner")

data["target"] = (data["label"] == "m2").astype(int)

feature_cols = [c for c in features.columns if c not in ["user_id", "anchor_time"]]
X_all = data[feature_cols].to_numpy()
y_all = data["target"].to_numpy().astype(int)

train_df = data[data["split_label"] == "train"].copy()
val_df   = data[data["split_label"] == "val"].copy()
test_df  = data[data["split_label"] == "test"].copy()

X_train = train_df[feature_cols].to_numpy()
y_train = train_df["target"].to_numpy().astype(int)

X_val = val_df[feature_cols].to_numpy()
y_val = val_df["target"].to_numpy().astype(int)

X_test = test_df[feature_cols].to_numpy()
y_test = test_df["target"].to_numpy().astype(int)

print("loaded shapes:", X_train.shape, X_val.shape, X_test.shape)
print("train pos rate:", y_train.mean(), "val:", y_val.mean(), "test:", y_test.mean())

loaded shapes: (574092, 14) (137615, 14) (101833, 14)
train pos rate: 0.8041254711788355 val: 0.8416524361443156 test: 0.8671255879724648


In [18]:
# SMOTE 제거: X_train_res, y_train_res 대신 원본 X_train, y_train 사용
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

train_y = torch.tensor(y_train, dtype=torch.float32)
val_y   = torch.tensor(y_val, dtype=torch.float32)
test_y  = torch.tensor(y_test, dtype=torch.float32)

train_loader = DataLoader(
    TensorDataset(torch.tensor(X_train_scaled, dtype=torch.float32), train_y),
    batch_size=256, shuffle=True
)
val_loader = DataLoader(
    TensorDataset(torch.tensor(X_val_scaled, dtype=torch.float32), val_y),
    batch_size=256, shuffle=False
)
test_loader = DataLoader(
    TensorDataset(torch.tensor(X_test_scaled, dtype=torch.float32), test_y),
    batch_size=256, shuffle=False
)

# loss 쪽에서 pos_weight 적용 (학습 코드에 넣기)
n_pos = float((y_train == 1).sum())
n_neg = float((y_train == 0).sum())
pos_weight = torch.tensor([n_neg / max(n_pos, 1.0)], dtype=torch.float32, device=device)

criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)


In [19]:
MODEL_NAME = "mlp_advanced"
MODEL_ID = "dl__mlp_advanced"
VERSION = "baseline"
SPLIT = "test"

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

try:
    from models.model_definitions import MLP_advanced
    MODEL_IMPORT_OK = True
except Exception:
    MODEL_IMPORT_OK = False

if not MODEL_IMPORT_OK:
    class ResidualBlock(nn.Module):
        def __init__(self, hidden_dim: int, dropout: float = 0.1):
            super().__init__()
            self.block = nn.Sequential(
                nn.Linear(hidden_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
            )
            self.relu = nn.ReLU()

        def forward(self, x):
            return self.relu(self.block(x) + x)

    class MLP_advanced(nn.Module):
        def __init__(self, input_dim: int, hidden_dim: int = 256, num_blocks: int = 2):
            super().__init__()
            self.input_layer = nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
            )
            self.blocks = nn.ModuleList([ResidualBlock(hidden_dim) for _ in range(num_blocks)])
            self.output_layer = nn.Sequential(
                nn.Linear(hidden_dim, 32),
                nn.ReLU(),
                nn.Linear(32, 1),
            )

        def forward(self, x):
            out = self.input_layer(x)
            for blk in self.blocks:
                out = blk(out)
            return self.output_layer(out)

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets):
        bce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
        pt = torch.exp(-bce)
        loss = self.alpha * (1 - pt) ** self.gamma * bce
        return loss.mean()

input_dim = X_train_scaled.shape[1]

model = MLP_advanced(input_dim=input_dim, hidden_dim=256, num_blocks=2).to(device)
criterion = FocalLoss(alpha=0.25, gamma=2.0)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)
n_epochs = 15

device: cpu


In [20]:
def predict_proba_from_loader(model, loader, device):
    model.eval()
    probs = []
    trues = []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            logits = model(xb).view(-1)
            prob = torch.sigmoid(logits).detach().cpu().numpy()
            probs.append(prob)
            trues.append(yb.detach().cpu().numpy())
    y_prob = np.concatenate(probs)
    y_true = np.concatenate(trues).astype(int)
    return y_true, y_prob

best_val = -1.0
best_state = None

for epoch in range(1, n_epochs + 1):
    model.train()
    losses = []
    for xb, yb in train_loader:
        xb = xb.to(device)
        yb = yb.to(device).view(-1)

        optimizer.zero_grad()
        logits = model(xb).view(-1)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    val_true, val_prob = predict_proba_from_loader(model, val_loader, device)
    val_pr_auc = float(average_precision_score(val_true, val_prob))

    if val_pr_auc > best_val:
        best_val = val_pr_auc
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    print(f"epoch {epoch:02d} | train_loss={np.mean(losses):.5f} | val_pr_auc={val_pr_auc:.5f} | best={best_val:.5f}")

if best_state is not None:
    model.load_state_dict(best_state)

epoch 01 | train_loss=0.02969 | val_pr_auc=0.91337 | best=0.91337
epoch 02 | train_loss=0.02952 | val_pr_auc=0.91496 | best=0.91496
epoch 03 | train_loss=0.02948 | val_pr_auc=0.91532 | best=0.91532
epoch 04 | train_loss=0.02945 | val_pr_auc=0.91509 | best=0.91532
epoch 05 | train_loss=0.02944 | val_pr_auc=0.91534 | best=0.91534
epoch 06 | train_loss=0.02942 | val_pr_auc=0.91526 | best=0.91534
epoch 07 | train_loss=0.02941 | val_pr_auc=0.91574 | best=0.91574
epoch 08 | train_loss=0.02940 | val_pr_auc=0.91564 | best=0.91574
epoch 09 | train_loss=0.02939 | val_pr_auc=0.91565 | best=0.91574
epoch 10 | train_loss=0.02938 | val_pr_auc=0.91625 | best=0.91625
epoch 11 | train_loss=0.02938 | val_pr_auc=0.91568 | best=0.91625
epoch 12 | train_loss=0.02937 | val_pr_auc=0.91565 | best=0.91625
epoch 13 | train_loss=0.02937 | val_pr_auc=0.91589 | best=0.91625
epoch 14 | train_loss=0.02936 | val_pr_auc=0.91601 | best=0.91625
epoch 15 | train_loss=0.02935 | val_pr_auc=0.91607 | best=0.91625


In [21]:
def evaluate_churn_metrics(y_true, y_prob):
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob).astype(float)

    pr_auc = float(average_precision_score(y_true, y_prob))
    base_rate = float(y_true.mean())

    df_res = pd.DataFrame({"label": y_true, "prob": y_prob}).sort_values("prob", ascending=False)

    def at_k(k):
        n = len(df_res)
        n_sel = max(int(np.floor(n * k / 100)), 1)
        top = df_res.iloc[:n_sel]
        prec = float(top["label"].mean())
        total_pos = float(df_res["label"].sum())
        cap_pos = float(top["label"].sum())
        rec = float(cap_pos / total_pos) if total_pos > 0 else 0.0
        lift = float(prec / base_rate) if base_rate > 0 else 0.0
        return prec, rec, lift

    ranking_list = []
    for k in [5, 10, 15, 20, 25, 30]:
        prec, rec, lift = at_k(k)
        ranking_list.append({"Top_K": f"{k}%", "Precision": prec, "Recall": rec, "Lift": lift})

    p5, r5, l5 = at_k(5)

    return {
        "PR-AUC (Average Precision)": pr_auc,
        "상위 5% 정밀도 (Precision)": p5,
        "상위 5% 재현율 (Recall)": r5,
        "상위 5% 리프트 (Lift)": l5,
        "ranking": ranking_list,
    }

def plot_confusion_matrix(y_true, y_pred, title, labels=("비이탈(m1)", "이탈(m2)"), cmap="Blues"):
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).astype(int)
    cm = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, cmap=cmap, interpolation="nearest", aspect="equal")
    fig.colorbar(im, ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Predicted (예측값)")
    ax.set_ylabel("Actual (실제값)")

    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    thresh = cm.max() / 2.0 if cm.size else 0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j, i, f"{cm[i, j]}",
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black",
                fontsize=12,
            )

    ax.set_xlim(-0.5, cm.shape[1] - 0.5)
    ax.set_ylim(cm.shape[0] - 0.5, -0.5)

    fig.tight_layout()
    return fig

def threshold_topk(y_prob, k_pct):
    y_prob = np.asarray(y_prob).astype(float)
    scores = np.sort(y_prob)[::-1]
    n = len(scores)
    n_sel = max(int(np.floor(n * k_pct / 100)), 1)
    return float(scores[n_sel - 1])

# test_true, test_prob
test_true, test_prob = predict_proba_from_loader(model, test_loader, device)

# metrics (json 저장용)
metrics = evaluate_churn_metrics(test_true, test_prob)
pr_auc_val = float(metrics["PR-AUC (Average Precision)"])

# PR curve figure (저장용)
precision, recall, _ = precision_recall_curve(test_true, test_prob)
fig_pr, ax_pr = plt.subplots(figsize=(6, 5))
ax_pr.plot(recall, precision, lw=2, label=f"PR-AUC = {pr_auc_val:.5f}")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision-Recall Curve")
ax_pr.legend()
ax_pr.grid(alpha=0.3)
fig_pr.tight_layout()

# Confusion matrices (저장용)
k_list = [5, 10, 15, 30]
figures = {"pr_curve": fig_pr}

for k in k_list:
    thr = threshold_topk(test_prob, k)
    y_pred_k = (test_prob >= thr).astype(int)
    fig_cm = plot_confusion_matrix(
        test_true,
        y_pred_k,
        title=f"Confusion Matrix (Top {k}%, thr={thr:.5f})",
        labels=("비이탈(m1)", "이탈(m2)"),
        cmap="Blues",
    )
    figures[f"confusion_matrix_top{k}"] = fig_cm

# 저장만 (출력 X)
_ = save_model_and_artifacts(
    model=model,
    model_name="mlp_enhance",     # 너 모델명으로
    model_type="dl",
    model_id="dl__mlp_enhance",   # 너 model_id로
    split="test",
    metrics=metrics,
    y_true=test_true,
    y_prob=test_prob,
    version="baseline",
    scaler=scaler,
    figures=figures,
)

# 메모리 정리 (노트북 출력도 안 뜨게 + 메모리 누수 방지)
plt.close(fig_pr)
for k in k_list:
    plt.close(figures[f"confusion_matrix_top{k}"])


In [22]:
saved = save_model_and_artifacts(
    model=model,
    model_name=MODEL_NAME,
    model_type="dl",
    model_id=MODEL_ID,
    split=SPLIT,
    metrics=metrics,
    y_true=test_true,
    y_prob=test_prob,
    version=VERSION,
    scaler=scaler,
    figures=figures,
)

print(json.dumps(metrics, indent=2, ensure_ascii=False))

print("saved paths:")
for k, v in saved.items():
    print(f"{k}: {v}")

plt.close(fig_pr)
for k in k_list:
    plt.close(figures[f"confusion_matrix_top{k}"])

{
  "PR-AUC (Average Precision)": 0.9338137050393234,
  "상위 5% 정밀도 (Precision)": 0.9693576900412493,
  "상위 5% 재현율 (Recall)": 0.05588774886186043,
  "상위 5% 리프트 (Lift)": 1.1178976880475022,
  "ranking": [
    {
      "Top_K": "5%",
      "Precision": 0.9693576900412493,
      "Recall": 0.05588774886186043,
      "Lift": 1.1178976880475022
    },
    {
      "Top_K": "10%",
      "Precision": 0.9637631346361583,
      "Recall": 0.11114131050259338,
      "Lift": 1.1114458482186576
    },
    {
      "Top_K": "15%",
      "Precision": 0.9593426738248003,
      "Recall": 0.16594188127109238,
      "Lift": 1.1063480159407588
    },
    {
      "Top_K": "20%",
      "Precision": 0.9557104978886379,
      "Recall": 0.22042535842902766,
      "Lift": 1.1021592617550413
    },
    {
      "Top_K": "25%",
      "Precision": 0.9510959226962055,
      "Recall": 0.27420669973500034,
      "Lift": 1.0968375698842914
    },
    {
      "Top_K": "30%",
      "Precision": 0.9482143441683852,
      "Reca

In [23]:
MODEL_ID = "dl__mlp_advanced"
SPLIT = "test"
PCTS = [1, 5, 10, 20, 30, 50]

scores = np.asarray(test_prob, dtype=float).reshape(-1)

percentiles = []
for pct in PCTS:
    cutoff = float(np.quantile(scores, 1.0 - pct / 100.0))
    percentiles.append({"pct": int(pct), "score": cutoff})

payload = {"model_id": MODEL_ID, "split": SPLIT, "percentiles": percentiles}

out_path = PROJECT_ROOT / "models" / "metrics" / "mlp_advanced_score_percentiles.json"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

print(json.dumps(payload, ensure_ascii=False, indent=2))
print(out_path)

{
  "model_id": "dl__mlp_advanced",
  "split": "test",
  "percentiles": [
    {
      "pct": 1,
      "score": 0.7368711423873902
    },
    {
      "pct": 5,
      "score": 0.7281568169593811
    },
    {
      "pct": 10,
      "score": 0.7191614866256714
    },
    {
      "pct": 20,
      "score": 0.7029853701591492
    },
    {
      "pct": 30,
      "score": 0.6850807428359985
    },
    {
      "pct": 50,
      "score": 0.6561292409896851
    }
  ]
}
/Users/jy/project_2nd/SKN23-2nd-3Team/models/metrics/mlp_advanced_score_percentiles.json
