In [15]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parents[1]  # SKN23-2nd-3Team
sys.path.insert(0, str(PROJECT_ROOT))

print("PROJECT_ROOT:", PROJECT_ROOT)

PROJECT_ROOT: /Users/jy/project_2nd/SKN23-2nd-3Team


In [16]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import precision_recall_curve
from app.utils.plotting import configure_matplotlib_korean
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, confusion_matrix
from imblearn.over_sampling import SMOTE
from IPython.display import display, Markdown

from models.model_definitions import MLP_enhance
from app.utils.metrics import evaluate_churn_metrics
from app.utils.paths import PATHS


chosen = configure_matplotlib_korean()

print("‚úÖ matplotlib font:", chosen)

‚úÖ matplotlib font: AppleGothic


In [17]:
print(">>> [MLP_enhance] Loading Data...")

base_path = PATHS["data_processed"]

anchors = pd.read_parquet(base_path / "anchors.parquet")
features = pd.read_parquet(base_path / "features_ml_clean.parquet")
labels = pd.read_parquet(base_path / "labels.parquet")

for df in [anchors, features, labels]:
    df["user_id"] = df["user_id"].astype(str)

data = anchors.merge(features, on=["user_id", "anchor_time"])
data = data.merge(labels, on=["user_id", "anchor_time"])
data["target"] = (data["label"] == "m2").astype(int)

feature_cols = [c for c in features.columns if c not in ["user_id", "anchor_time"]]
X = data[feature_cols].fillna(0)
y = data["target"].values

>>> [MLP_enhance] Loading Data...


In [18]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

train_loader = DataLoader(
    TensorDataset(torch.FloatTensor(X_train_scaled), torch.FloatTensor(y_train_res)),
    batch_size=256, shuffle=True
)
val_loader = DataLoader(
    TensorDataset(torch.FloatTensor(X_val_scaled), torch.FloatTensor(y_val)),
    batch_size=256
)
test_loader = DataLoader(
    TensorDataset(torch.FloatTensor(X_test_scaled), torch.FloatTensor(y_test)),
    batch_size=256
)

In [19]:
model = MLP_enhance(X.shape[1])
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

print(">>> Training start...")
for epoch in range(15):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        out = model(xb).squeeze()
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1:02d} | Loss: {total_loss/len(train_loader):.4f}")


>>> Training start...
Epoch 01 | Loss: 0.6312
Epoch 02 | Loss: 0.6254
Epoch 03 | Loss: 0.6227
Epoch 04 | Loss: 0.6210
Epoch 05 | Loss: 0.6193
Epoch 06 | Loss: 0.6182
Epoch 07 | Loss: 0.6174
Epoch 08 | Loss: 0.6164
Epoch 09 | Loss: 0.6156
Epoch 10 | Loss: 0.6148
Epoch 11 | Loss: 0.6144
Epoch 12 | Loss: 0.6138
Epoch 13 | Loss: 0.6136
Epoch 14 | Loss: 0.6131
Epoch 15 | Loss: 0.6129


In [20]:
model.eval()
all_targets, all_probs = [], []

with torch.no_grad():
    for xb, yb in test_loader:
        logits = model(xb).squeeze()
        probs = torch.sigmoid(logits)
        all_targets.extend(yb.numpy())
        all_probs.extend(probs.numpy())

y_test_np = np.array(all_targets)
y_prob_np = np.array(all_probs)

metrics = evaluate_churn_metrics(y_test_np, y_prob_np)
y_test_np = np.array(all_targets).reshape(-1)   # shape ÏïàÏ†Ñ
y_prob_np = np.array(all_probs).reshape(-1)

base_rate_test = float(y_test_np.mean())
print("base_rate_test:", base_rate_test)

base_rate_test: 0.8183555603027344


In [21]:
summary = {k: v for k, v in metrics.items() if k != "ranking"}
ranking_df = pd.DataFrame(metrics.get("ranking", []))

display(Markdown("### üìä Ï£ºÏöî ÏÑ±Îä• ÏßÄÌëú"))
display(pd.DataFrame(summary.items(), columns=["KPI", "Value"]))

display(Markdown("### üìà Top-K Îû≠ÌÇπ"))
display(ranking_df)

### üìä Ï£ºÏöî ÏÑ±Îä• ÏßÄÌëú

Unnamed: 0,KPI,Value
0,PR-AUC (Average Precision),0.898837
1,ÏÉÅÏúÑ 5% Ï†ïÎ∞ÄÎèÑ (Precision),0.942225
2,ÏÉÅÏúÑ 5% Ïû¨ÌòÑÏú® (Recall),0.057565
3,ÏÉÅÏúÑ 5% Î¶¨ÌîÑÌä∏ (Lift),1.151364


### üìà Top-K Îû≠ÌÇπ

Unnamed: 0,Top_K,Precision,Recall,Lift
0,5%,0.942225,0.057565,1.151364
1,10%,0.935464,0.114305,1.143102
2,15%,0.931615,0.170758,1.138399
3,20%,0.926032,0.226311,1.131576
4,25%,0.921946,0.281646,1.126584
5,30%,0.916414,0.335944,1.119824


In [22]:
MODEL_ID = "dl__mlp_enhance"
MODEL_NAME = "mlp_enhance"
SPLIT = "test"

EVAL_DIR = PATHS["models_eval"] / "dlmlp_enhance"
METRICS_DIR = PATHS["models_metrics"]
ASSETS_DIR = PATHS["assets_training"]

for d in [EVAL_DIR, METRICS_DIR, ASSETS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

In [23]:
import joblib

In [24]:
torch.save(model.state_dict(), PATHS["models_dl"] / f"{MODEL_NAME}.pt")
joblib.dump(scaler, PATHS["models_preprocessing"] / f"{MODEL_NAME}_scaler.pkl")

['/Users/jy/project_2nd/SKN23-2nd-3Team/models/preprocessing/mlp_enhance_scaler.pkl']

In [25]:
precision, recall, _ = precision_recall_curve(y_test_np, y_prob_np)

# (ÏÑ†ÌÉù) ÎÑàÍ∞Ä metricsÏóê PR-AUC ÎÑ£Ïñ¥Îëî Í≤ΩÏö∞ Í∑∏ Í∞í Ïì∞Í∏∞
pr_auc_val = float(metrics.get("PR-AUC (Average Precision)", 0.0)) if "metrics" in globals() else None

fig = plt.figure(figsize=(6, 5))
if pr_auc_val is not None:
    plt.plot(recall, precision, lw=2, label=f"PR-AUC = {pr_auc_val:.4f}")
    plt.legend(loc="lower left")
else:
    plt.plot(recall, precision, lw=2)

plt.title("PR Í≥°ÏÑ† (Precision-Recall Curve)")
plt.xlabel("Ïû¨ÌòÑÏú® (Recall)")
plt.ylabel("Ï†ïÎ∞ÄÎèÑ (Precision)")
plt.grid(alpha=0.3)
plt.tight_layout()


plt.close(fig)

In [26]:
percentiles = [1, 5, 10, 20, 30, 50]
scores = np.percentile(y_prob_np, 100 - np.array(percentiles))

score_percentiles = {
    "model_id": MODEL_ID,
    "split": SPLIT,                 # "test"
    "base_rate": base_rate_test,    # ‚úÖ Î¨¥Ï°∞Í±¥ testÏóêÏÑúÎßå
    "percentiles": [
        {"pct": p, "score": float(s)}
        for p, s in zip(percentiles, scores)
    ],
}

with open(
    METRICS_DIR / f"{MODEL_NAME}_score_percentiles.json",
    "w",
    encoding="utf-8"
) as f:
    json.dump(score_percentiles, f, indent=2, ensure_ascii=False)

print("score_percentiles.json Ï†ÄÏû• ÏôÑÎ£å")

score_percentiles.json Ï†ÄÏû• ÏôÑÎ£å


In [27]:
from pathlib import Path
import sys
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import (
    precision_recall_curve,
    confusion_matrix,
    average_precision_score,
)

PROJECT_ROOT = Path.cwd().parents[1]
sys.path.insert(0, str(PROJECT_ROOT))

from app.utils.save import save_model_and_artifacts

try:
    from app.utils.plotting import configure_matplotlib_korean
    configure_matplotlib_korean()
except Exception:
    pass


def plot_confusion_matrix(
    y_true,
    y_pred,
    title="Confusion Matrix",
    labels=("ÎπÑÏù¥ÌÉà(m1)", "Ïù¥ÌÉà(m2)"),
    cmap="Blues",
):
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).astype(int)
    cm = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(6, 5))

    im = ax.imshow(cm, cmap=cmap, interpolation="nearest", aspect="equal")
    fig.colorbar(im, ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Predicted (ÏòàÏ∏°Í∞í)")
    ax.set_ylabel("Actual (Ïã§Ï†úÍ∞í)")

    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    thresh = cm.max() / 2.0 if cm.size else 0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j, i, f"{cm[i, j]}",
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black",
                fontsize=12,
            )

    ax.set_xlim(-0.5, cm.shape[1] - 0.5)
    ax.set_ylim(cm.shape[0] - 0.5, -0.5)

    fig.tight_layout()
    return fig


def topk_threshold(y_prob: np.ndarray, k_pct: int) -> float:
    y_prob = np.asarray(y_prob, dtype=float)
    order = np.argsort(-y_prob)
    n_sel = max(int(np.floor(len(y_prob) * k_pct / 100)), 1)
    thr = float(y_prob[order[n_sel - 1]])
    return thr


def plot_confusion_matrix_topk(
    y_true,
    y_prob,
    k_pct: int,
    labels=("ÎπÑÏù¥ÌÉà(m1)", "Ïù¥ÌÉà(m2)"),
    cmap="Blues",
):
    y_true = np.asarray(y_true).astype(int)
    y_prob = np.asarray(y_prob).astype(float)

    thr = topk_threshold(y_prob, k_pct)
    y_pred = (y_prob >= thr).astype(int)

    fig = plot_confusion_matrix(
        y_true=y_true,
        y_pred=y_pred,
        title=f"Confusion Matrix (Top {k_pct}%, thr={thr:.5f})",
        labels=labels,
        cmap=cmap,
    )
    return fig


y_true_arr = np.asarray(y_test_np).astype(int)
y_prob_arr = np.asarray(y_prob_np).astype(float)

precision, recall, _ = precision_recall_curve(y_true_arr, y_prob_arr)

pr_auc_val = metrics.get("PR-AUC (Average Precision)")
if pr_auc_val is None:
    pr_auc_val = float(average_precision_score(y_true_arr, y_prob_arr))
else:
    pr_auc_val = float(pr_auc_val)

fig_pr, ax_pr = plt.subplots(figsize=(6, 5))
ax_pr.plot(recall, precision, lw=2, label=f"PR-AUC = {pr_auc_val:.5f}")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision-Recall Curve")
ax_pr.legend()
ax_pr.grid(alpha=0.3)
fig_pr.tight_layout()

k_list = [5, 10, 15, 30]

figures = {
    "pr_curve": fig_pr,
}

for k in k_list:
    figures[f"confusion_matrix_top{k}"] = plot_confusion_matrix_topk(
        y_true_arr,
        y_prob_arr,
        k_pct=k,
        labels=("ÎπÑÏù¥ÌÉà(m1)", "Ïù¥ÌÉà(m2)"),
        cmap="Blues",
    )

save_model_and_artifacts(
    model=model,
    model_name="mlp_enhance",
    model_type="dl",
    model_id="dl__mlp_enhance",
    split="test",
    metrics=metrics,
    y_true=y_true_arr,
    y_prob=y_prob_arr,
    version="baseline",
    scaler=scaler,
    figures=figures,
)

plt.close(fig_pr)
for k in k_list:
    plt.close(figures[f"confusion_matrix_top{k}"])

In [28]:
import numpy as np

def br(name, a):
    a = np.asarray(a).reshape(-1)
    print(f"{name:9s} base_rate={a.mean():.6f} (pos={a.sum():.0f}/{len(a)})")

br("overall", y)
br("train", y_train)
br("val", y_val)
br("test", y_test)
br("train_res", y_train_res)  # SMOTE Ïº∞ÏùÑ ÎïåÎßå

overall   base_rate=0.818359 (pos=665768/813540)
train     base_rate=0.818360 (pos=399461/488124)
val       base_rate=0.818362 (pos=133154/162708)
test      base_rate=0.818356 (pos=133153/162708)
train_res base_rate=0.500000 (pos=399461/798922)
