In [5]:
from pathlib import Path
import sys, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, average_precision_score, confusion_matrix
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence warning categories that would otherwise flood the cell output.
# NOTE(review): ConvergenceWarning is suppressed here, yet the captured cell
# output still reports lbfgs stopping at max_iter=2000 and suggests scaling
# the data -- confirm that ignoring non-convergence is intentional.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Repository root. The original absolute path remains the default so existing
# runs are unchanged; SKN23_PROJECT_ROOT lets the notebook run from another
# checkout location without editing the code.
import os

PROJECT_ROOT = Path(
    os.environ.get("SKN23_PROJECT_ROOT", "/Users/jy/project_2nd/SKN23-2nd-3Team")
)
# Make the project's `app` package importable from the notebook.
sys.path.insert(0, str(PROJECT_ROOT))

from app.utils.save import save_model_and_artifacts
from app.utils.paths import PATHS

# Korean font configuration is best-effort: plotting still works without it,
# so a failure must not abort the run. Report the reason instead of silently
# discarding it, so broken glyphs in the figures can be traced back.
try:
    from app.utils.plotting import configure_matplotlib_korean
    configure_matplotlib_korean()
except Exception as exc:
    warnings.warn(
        f"Korean matplotlib font configuration skipped: {exc!r}",
        RuntimeWarning,
    )


def to_py(obj):
    """Convert a NumPy scalar to the matching built-in Python scalar.

    NumPy integer, floating and boolean scalars become ``int``, ``float`` and
    ``bool`` respectively. Any other object is returned unchanged.
    """
    conversions = (
        (np.integer, int),
        (np.floating, float),
        (np.bool_, bool),
    )
    for numpy_type, builtin_type in conversions:
        if isinstance(obj, numpy_type):
            return builtin_type(obj)
    return obj

def dict_to_py(d: dict) -> dict:
    """Return a new dict with string keys and ``to_py``-converted values.

    Each key is passed through ``str`` and each value through ``to_py``.
    Only the top level is converted; nested containers are not descended into.
    """
    converted = {}
    for key, value in d.items():
        converted[str(key)] = to_py(value)
    return converted

def write_streamlit_score_percentiles(model_name: str, payload: dict) -> str:
    """Write *payload* as JSON under ``PROJECT_ROOT/models/metrics``.

    The file is named ``<model_name>_score_percentiles.json``. Missing parent
    directories are created, and the file is written as UTF-8 with non-ASCII
    characters kept as-is.

    Returns:
        The path of the written file, as a string.
    """
    target = PROJECT_ROOT.joinpath(
        "models", "metrics", f"{model_name}_score_percentiles.json"
    )
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    target.write_text(serialized, encoding="utf-8")
    return str(target)

def score_percentiles_payload(model_id: str, split: str, y_prob, pcts=(1, 5, 10, 20, 30, 50)):
    """Build the score-cutoff table for the top ``pcts`` percent of scores.

    For each ``p`` in *pcts* the cutoff is the ``1 - p/100`` quantile of the
    flattened scores, i.e. the score separating the top ``p`` percent.

    Args:
        model_id: Identifier copied into the payload.
        split: Dataset split name copied into the payload.
        y_prob: Array-like of scores; flattened to 1-D floats.
        pcts: Percentages for which a cutoff is computed.

    Returns:
        ``{"model_id", "split", "percentiles"}`` where ``percentiles`` is a
        list of ``{"pct": int, "score": float}`` rows in the order of *pcts*.
    """
    scores = np.asarray(y_prob, dtype=float).reshape(-1)
    rows = []
    for pct in pcts:
        cutoff = np.quantile(scores, 1.0 - pct / 100.0)
        rows.append({"pct": int(pct), "score": float(cutoff)})
    return {"model_id": model_id, "split": split, "percentiles": rows}

def build_ranking_metrics(y_true, y_prob, k_list=(5, 10, 15, 30)):
    """Compute PR-AUC, Top-K ranking metrics and a weighted selection score.

    Rows are ranked by score in descending order. For each ``k`` in *k_list*
    the top ``floor(n * k / 100)`` rows (at least one) are selected, and
    precision, recall and lift against the overall positive rate are reported.

    Args:
        y_true: Array-like of binary labels; cast to int and flattened.
        y_prob: Array-like of scores; cast to float and flattened.
        k_list: Top-K percentages to evaluate.

    Returns:
        Dict with ``"PR-AUC (Average Precision)"``, ``"base_rate"``,
        ``"n_total"``, ``"ranking"`` (one row per k) and
        ``"score_for_selection"``. The selection score uses the 5%, 10% and
        30% rows; a row that is absent from *k_list* contributes 0.
    """
    truth = np.asarray(y_true).astype(int).reshape(-1)
    scores = np.asarray(y_prob).astype(float).reshape(-1)

    pr_auc = float(average_precision_score(truth, scores))

    ranked = pd.DataFrame({"y": truth, "score": scores}).sort_values("score", ascending=False)
    ranked_y = ranked["y"]
    n_total = int(len(ranked))
    base_rate = float(ranked_y.mean())
    total_pos = float(ranked_y.sum())

    def _row(k):
        # Always keep at least one row so tiny datasets still yield a metric.
        n_sel = max(int(np.floor(n_total * k / 100)), 1)
        top = ranked_y.iloc[:n_sel]

        precision_k = float(top.mean())
        # The epsilon guards the division when there are no positives at all.
        recall_k = float(top.sum() / (total_pos + 1e-12))
        lift_k = float(precision_k / base_rate) if base_rate > 0 else 0.0

        return {
            "Top_K": f"{k}%",
            "n_selected": int(n_sel),
            "Precision": precision_k,
            "Recall": recall_k,
            "Lift": lift_k,
        }

    ranking = [_row(k) for k in k_list]

    absent = {"Recall": 0.0, "Lift": 0.0}

    def _at(k_pct: int):
        wanted = f"{k_pct}%"
        return next((row for row in ranking if row["Top_K"] == wanted), absent)

    top5, top10, top30 = _at(5), _at(10), _at(30)
    weighted_terms = (
        (0.55, pr_auc),
        (0.20, float(top10["Recall"])),
        (0.15, float(top30["Recall"])),
        (0.05, float(top5["Recall"])),
        (0.03, float(top10["Lift"])),
        (0.02, float(top5["Lift"])),
    )
    # Accumulate left to right (not via sum()) so the float addition order,
    # and therefore the exact result, is fixed.
    selection_score = 0.0
    for weight, value in weighted_terms:
        selection_score += weight * value

    return {
        "PR-AUC (Average Precision)": pr_auc,
        "base_rate": base_rate,
        "n_total": n_total,
        "ranking": ranking,
        "score_for_selection": float(selection_score),
    }

def threshold_topk(y_prob, k_pct: int) -> float:
    """Return the score of the last row inside the top ``k_pct`` percent.

    The number of selected rows is ``floor(n * k_pct / 100)``, raised to 1
    when that would be smaller. The returned value is the smallest score
    among those rows, usable as a ``>=`` cutoff.

    Args:
        y_prob: Array-like of scores; flattened to 1-D floats.
        k_pct: Percentage of rows to keep.
    """
    scores = np.asarray(y_prob, dtype=float).reshape(-1)
    n_keep = int(np.floor(len(scores) * k_pct / 100))
    if n_keep < 1:
        n_keep = 1
    # Negating the scores turns the ascending argsort into a descending rank.
    ranked = scores[np.argsort(-scores)]
    return float(ranked[n_keep - 1])

def plot_confusion_matrix(y_true, y_pred, title, labels=("비이탈(m1)", "이탈(m2)"), cmap="Blues"):
    """Draw a binary confusion matrix as an annotated heatmap.

    Args:
        y_true: Array-like of true labels; cast to int.
        y_pred: Array-like of predicted labels; cast to int.
        title: Figure title.
        labels: Tick labels for class 0 and class 1, in that order.
        cmap: Matplotlib colormap name for the heatmap.

    Returns:
        The matplotlib ``Figure``. The caller is responsible for closing it.
    """
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).astype(int)
    # Pin the class order to (0, 1). Without this, scikit-learn infers the
    # classes from the data and returns a 1x1 matrix when only one class
    # occurs, which no longer matches the two ticks/labels set below.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, cmap=cmap, interpolation="nearest", aspect="equal")
    fig.colorbar(im, ax=ax)

    ax.set_title(title)
    ax.set_xlabel("Predicted (예측값)")
    ax.set_ylabel("Actual (실제값)")

    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    # Cells darker than half the maximum count get white text for contrast.
    thresh = cm.max() / 2.0 if cm.size else 0.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(
                j, i, f"{cm[i, j]}",
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black",
            )

    # Fix the axis limits explicitly; the inverted y-limits put row 0 on top.
    ax.set_xlim(-0.5, cm.shape[1] - 0.5)
    ax.set_ylim(cm.shape[0] - 0.5, -0.5)

    fig.tight_layout()
    return fig

def make_figures(test_true, test_prob, k_list=(5, 10, 15, 30), cmap="Blues"):
    """Create the PR-curve figure and one confusion-matrix figure per Top-K cut.

    Args:
        test_true: Array-like of binary labels; cast to int and flattened.
        test_prob: Array-like of scores; cast to float and flattened.
        k_list: Top-K percentages; each yields a confusion matrix where rows
            scoring at or above the Top-K threshold are predicted positive.
        cmap: Colormap name forwarded to ``plot_confusion_matrix``.

    Returns:
        Dict mapping ``"pr_curve"`` and ``"confusion_matrix_top<k>"`` to
        matplotlib figures. The caller is responsible for closing them.
    """
    truth = np.asarray(test_true).astype(int).reshape(-1)
    scores = np.asarray(test_prob).astype(float).reshape(-1)

    prec, rec, _thresholds = precision_recall_curve(truth, scores)
    avg_precision = float(average_precision_score(truth, scores))

    # Drop the final curve point when its recall is exactly 0.
    ends_at_zero_recall = len(rec) > 0 and rec[-1] == 0
    if ends_at_zero_recall:
        prec = prec[:-1]
        rec = rec[:-1]

    pr_fig, pr_ax = plt.subplots(figsize=(6, 5))
    pr_ax.plot(rec, prec, lw=2, label=f"PR-AUC = {avg_precision:.5f}")

    # The positive rate is the precision of a classifier that ranks at random.
    positive_rate = float(np.mean(truth))
    pr_ax.hlines(positive_rate, 0, 1, linestyles="--", label=f"baseline={positive_rate:.3f}")

    pr_ax.set_xlabel("Recall")
    pr_ax.set_ylabel("Precision")
    pr_ax.set_title("Precision-Recall Curve")
    pr_ax.legend()
    pr_ax.grid(alpha=0.3)
    pr_fig.tight_layout()

    def _confusion_figure(k):
        cutoff = threshold_topk(scores, k)
        predicted = (scores >= cutoff).astype(int)
        return plot_confusion_matrix(
            truth, predicted,
            title=f"Confusion Matrix (Top {k}%, thr={cutoff:.5f})",
            cmap=cmap,
        )

    figures = {"pr_curve": pr_fig}
    figures.update({f"confusion_matrix_top{k}": _confusion_figure(k) for k in k_list})
    return figures

def tune_lg(X_train, y_train, X_val, y_val, n_trials=30, seed=42):
    """Random-search LogisticRegression settings, scored by validation PR-AUC.

    Each trial draws one value per hyperparameter from a small discrete grid,
    fits on the training split and scores average precision on the validation
    split. The grid has fewer distinct combinations than the default number of
    trials, so the score of each combination is cached: a repeated draw reuses
    it instead of refitting an identical model. The sequence of draws, the
    history and the selected parameters are the same as refitting every time.

    Args:
        X_train, y_train: Training features and binary labels.
        X_val, y_val: Validation features and binary labels.
        n_trials: Number of random draws.
        seed: Seed for the draw sequence.

    Returns:
        Dict with ``"val_pr_auc"`` (best validation score), ``"params"`` (the
        first parameter set reaching it) and ``"history"`` (one entry per
        trial, in draw order). With ``n_trials < 1`` no model is fitted:
        ``"params"`` is ``None`` and ``"val_pr_auc"`` is ``-1.0``.
    """
    rng = np.random.default_rng(seed)

    space = {
        "C": [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0],
        "penalty": ["l2"],
        "class_weight": [None, "balanced"],
    }

    best_val = -1.0
    best_params = None
    history = []
    # Validation score per already-evaluated parameter combination.
    scores_by_config = {}

    for _ in range(n_trials):
        drawn = {name: rng.choice(choices) for name, choices in space.items()}
        # rng.choice returns NumPy scalars; convert to plain Python values so
        # the params are JSON-serializable and valid estimator arguments.
        params = {
            "C": float(drawn["C"]),
            "penalty": str(drawn["penalty"]),
            "class_weight": (None if drawn["class_weight"] is None else str(drawn["class_weight"])),
        }

        config_key = (params["C"], params["penalty"], params["class_weight"])
        if config_key not in scores_by_config:
            model = LogisticRegression(
                solver="lbfgs",
                max_iter=2000,
                random_state=42,
                **params,
            )
            model.fit(X_train, y_train)
            val_prob = model.predict_proba(X_val)[:, 1]
            scores_by_config[config_key] = float(average_precision_score(y_val, val_prob))
        val_pr_auc = scores_by_config[config_key]

        history.append({"val_pr_auc": val_pr_auc, "params": dict(params)})
        # Strict '>' keeps the earliest configuration on ties.
        if val_pr_auc > best_val:
            best_val = val_pr_auc
            best_params = dict(params)

    return {"val_pr_auc": float(best_val), "params": best_params, "history": history}


# ---------------------------------------------------------------------------
# Load features and labels, then join them into one modelling frame.
# ---------------------------------------------------------------------------
DATA_DIR = PROJECT_ROOT / "data" / "processed"
features = pd.read_parquet(DATA_DIR / "features_ml_clean.parquet")
labels = pd.read_parquet(DATA_DIR / "labels.parquet")

# Cast the join key to str on both sides so the merge does not depend on the
# dtype each parquet file stored.
features["user_id"] = features["user_id"].astype(str)
labels["user_id"] = labels["user_id"].astype(str)

# validate="one_to_one" makes pandas raise if either side has duplicate keys.
df = features.merge(
    labels[["user_id", "anchor_time", "label", "split"]],
    on=["user_id", "anchor_time"],
    how="inner",
    validate="one_to_one",
)

# Binary target: 1 when the label is "m2", otherwise 0.
df["y"] = (df["label"] == "m2").astype(int)

# Every column that is not a key, label, split marker or target is a feature.
drop_cols = ["user_id", "anchor_time", "label", "split", "y"]
feature_cols = [c for c in df.columns if c not in drop_cols]

train_df = df[df["split"] == "train"]
val_df = df[df["split"] == "val"]
test_df = df[df["split"] == "test"]

X_train, y_train = train_df[feature_cols], train_df["y"].to_numpy()
X_val, y_val = val_df[feature_cols], val_df["y"].to_numpy()
X_test, y_test = test_df[feature_cols], test_df["y"].to_numpy()

# Train+validation, used to refit the final model after tuning.
X_tv = pd.concat([X_train, X_val], axis=0)
y_tv = np.concatenate([y_train, y_val])

# ---------------------------------------------------------------------------
# Tune on train/val, then refit the best configuration on train+val.
# ---------------------------------------------------------------------------
BEST = tune_lg(X_train, y_train, X_val, y_val, n_trials=30, seed=42)

MODEL_NAME = "lg"
MODEL_ID = "ml__lg"
VERSION = "v1_tuned"
# Single source of truth for the Top-K cuts used by the metrics and figures.
TOP_K_LIST = (5, 10, 15, 30)

# NOTE(review): the captured cell output reports lbfgs stopping at
# max_iter=2000 and suggests scaling the data; the features are fitted
# unscaled here -- confirm whether a scaler should be added.
final = LogisticRegression(
    solver="lbfgs",
    max_iter=2000,
    random_state=42,
    **BEST["params"],
)
final.fit(X_tv, y_tv)

test_prob = final.predict_proba(X_test)[:, 1]
test_true = np.asarray(y_test).astype(int)

# ---------------------------------------------------------------------------
# Evaluate on the test split and persist model, metrics and figures.
# ---------------------------------------------------------------------------
metrics_payload = build_ranking_metrics(test_true, test_prob, k_list=TOP_K_LIST)
figures = make_figures(test_true, test_prob, k_list=TOP_K_LIST, cmap="Blues")

saved = save_model_and_artifacts(
    model=final,
    model_name=MODEL_NAME,
    model_type="ml",
    model_id=MODEL_ID,
    split="test",
    metrics=metrics_payload,
    y_true=test_true,
    y_prob=np.asarray(test_prob).astype(float),
    version=VERSION,
    scaler=None,
    figures=figures,
    config={
        "model_name": MODEL_NAME,
        "model_type": "ml",
        "version": VERSION,
        "feature_source": "features_ml_clean.parquet",
        "n_features": int(len(feature_cols)),
        "best_params": dict_to_py(BEST["params"]),
        "best_val_pr_auc": float(BEST["val_pr_auc"]),
    },
)

# Close every figure that was created. Iterating the dict keeps this in sync
# with make_figures instead of repeating the Top-K list by hand.
for _fig in figures.values():
    plt.close(_fig)

# Score cutoffs per top-percentile, written via
# write_streamlit_score_percentiles.
sp = score_percentiles_payload(MODEL_ID, "test", test_prob)
p_streamlit = write_streamlit_score_percentiles(MODEL_NAME, sp)

# Record the full tuning trace next to the model's metrics.
tuning_payload = {
    "model_id": MODEL_ID,
    "version": VERSION,
    "best_val_pr_auc": float(BEST["val_pr_auc"]),
    "best_params": dict_to_py(BEST["params"]),
    "history": BEST["history"],
}
tuning_path = Path(PATHS["models_metrics"]) / MODEL_NAME / VERSION / "tuning.json"
tuning_path.parent.mkdir(parents=True, exist_ok=True)
tuning_path.write_text(json.dumps(tuning_payload, ensure_ascii=False, indent=2), encoding="utf-8")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=2000).
You might also want to 

4842