In [2]:
import json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score
import joblib

# =========================
# 경로 (너가 말한 고정 경로)
# =========================
PROJECT_ROOT = Path(r"C:\SKNfolder\SKN23-2nd-3Team")

DATA_DIR = PROJECT_ROOT / "data" / "processed"
APP_READY = PROJECT_ROOT / "models" / "eval" / "lg"
APP_READY.mkdir(parents=True, exist_ok=True)

paths = {
    "labels":   DATA_DIR / "labels.parquet",
    "anchors":  DATA_DIR / "anchors.parquet",
    "features": DATA_DIR / "features_ml_clean.parquet",  # 필요 시 features_ml.parquet로 변경
}

for k, p in paths.items():
    print(k, "->", p, "exists?", p.exists())


labels -> C:\SKNfolder\SKN23-2nd-3Team\data\processed\labels.parquet exists? True
anchors -> C:\SKNfolder\SKN23-2nd-3Team\data\processed\anchors.parquet exists? True
features -> C:\SKNfolder\SKN23-2nd-3Team\data\processed\features_ml_clean.parquet exists? True


In [3]:
labels = pd.read_parquet(paths["labels"])
features = pd.read_parquet(paths["features"])

# 타입 정리
labels["user_id"] = labels["user_id"].astype(str)
features["user_id"] = features["user_id"].astype(str)

labels["anchor_time"] = pd.to_datetime(labels["anchor_time"])
features["anchor_time"] = pd.to_datetime(features["anchor_time"])

df = features.merge(labels, on=["user_id", "anchor_time"], how="inner")
print("merged shape:", df.shape)
print(df['n_events_30d'].values)
print(df.head(1))


merged shape: (813540, 20)
[17  6  2 ...  0  0 14]
     user_id anchor_time  n_events_30d  active_days_30d  n_purchase_30d  \
0  113868975  2020-01-31            17                8               0   

   purchase_ratio  days_since_last_event  days_since_last_purchase  \
0             0.0                0.44691                      31.0   

   brand_concentration_ratio  brand_switch_count_30d  total_spend_30d  \
0                   0.588235                       9              0.0   

   activity_ratio_15d  price_volatility  n_events_7d  visit_regularity  \
0            0.705882               0.0          8.0          1.494799   

   activity_trend home_brand future_brand label  split  
0        0.470588      apple        apple    m0  train  


In [4]:
# 라벨 컬럼 후보
label_candidates = ["y", "label", "target", "is_dormant", "dormant", "churn"]
label_col = next((c for c in label_candidates if c in df.columns), None)

assert label_col is not None, f"라벨 컬럼을 못 찾았습니다. candidates={label_candidates} / df.columns 확인 필요"
print("label_col =", label_col)

POS_LABEL = "m2"  # 휴면 정의

y_raw = df[label_col].astype(str).str.strip().str.lower()
y = (y_raw == POS_LABEL).astype(int).values

print("base_rate(휴면 비율) =", y.mean())


label_col = label
base_rate(휴면 비율) = 0.8183592693659808


In [15]:
exclude_cols = {"user_id", "anchor_time", label_col}
if "split" in df.columns:
    exclude_cols.add("split")

feature_cols = [c for c in df.columns if c not in exclude_cols]

X = df[feature_cols].copy()

# 비수치형 처리
for c in X.columns:
    if not pd.api.types.is_numeric_dtype(X[c]):
        X[c] = pd.to_numeric(X[c], errors="coerce")
X = X.fillna(0)

print("X shape:", X.shape, "n_features:", len(feature_cols))


X shape: (813540, 16) n_features: 16


In [16]:
# split 사용
assert "split" in df.columns, "labels에 split이 없으면 train/val/test를 먼저 만들거나 셀을 수정해야 합니다."

split = df["split"].astype(str).str.lower().values
train_mask = split == "train"
val_mask = np.isin(split, ["val", "valid", "validation"])
test_mask = split == "test"

X_train, y_train = X.loc[train_mask], y[train_mask]
X_val, y_val     = X.loc[val_mask],   y[val_mask]
X_test, y_test   = X.loc[test_mask],  y[test_mask]

print("train/val/test:", len(X_train), len(X_val), len(X_test))


train/val/test: 574092 137615 101833


In [17]:
# 로지스틱 회귀(분류) - CPU 모델
clf = LogisticRegression(
    solver="liblinear",        # 안정적인 이진분류 솔버
    max_iter=2000,
    class_weight=None,         # 필요하면 "balanced"로 바꿔 비교
    n_jobs=1                   # liblinear는 n_jobs 영향 적음
)

clf.fit(X_train, y_train)
print("trained.")




trained.


In [18]:
proba_test = clf.predict_proba(X_test)[:, 1]   # 휴면(1) 확률(점수)
ap = average_precision_score(y_test, proba_test)
print("PR-AUC(AP) =", ap)

df_test = df.loc[X_test.index, ["user_id", "anchor_time"]].copy()
df_test["risk_score"] = proba_test
df_test["y_true"] = y_test

df_test.head()


PR-AUC(AP) = 0.9297679675811055


Unnamed: 0,user_id,anchor_time,risk_score,y_true
2,113868975,2020-03-31,0.902448,0
22,273568916,2020-03-31,0.940549,1
43,299358698,2020-03-31,0.793866,1
75,322630864,2020-03-31,0.839036,1
79,327023582,2020-03-31,0.807038,1


In [19]:
def topk_metrics(df_scored: pd.DataFrame, k_percent: int) -> dict:
    df_sorted = df_scored.sort_values("risk_score", ascending=False).reset_index(drop=True)
    n = len(df_sorted)
    k = max(int(np.ceil(n * (k_percent / 100))), 1)
    topk = df_sorted.head(k)

    precision_k = float(topk["y_true"].mean())
    total_pos = int(df_sorted["y_true"].sum())
    captured_pos = int(topk["y_true"].sum())
    recall_k = float(captured_pos / total_pos) if total_pos > 0 else 0.0

    base_rate = float(df_sorted["y_true"].mean())
    lift_k = float(precision_k / base_rate) if base_rate > 0 else float("nan")

    cutoff = float(topk["risk_score"].min())

    return {
        "k_percent": k_percent,
        "n_scored": n,
        "n_topk": k,
        "base_rate": base_rate,
        "precision": precision_k,
        "recall": recall_k,
        "lift": lift_k,
        "cutoff": cutoff,
        "tp_in_topk": captured_pos,
        "pos_total": total_pos,
    }

K_LIST = [5, 10, 15, 20, 30]
topk_rows = [topk_metrics(df_test, k) for k in K_LIST]
topk_df = pd.DataFrame(topk_rows)
topk_df


Unnamed: 0,k_percent,n_scored,n_topk,base_rate,precision,recall,lift,cutoff,tp_in_topk,pos_total
0,5,101833,5092,0.867126,0.959741,0.055344,1.106807,0.933527,4887,88302
1,10,101833,10184,0.867126,0.953849,0.110009,1.100013,0.92466,9714,88302
2,15,101833,15275,0.867126,0.95234,0.164741,1.098273,0.91659,14547,88302
3,20,101833,20367,0.867126,0.949821,0.219078,1.095367,0.908601,19345,88302
4,30,101833,30550,0.867126,0.944517,0.326776,1.089251,0.88942,28855,88302


In [20]:
# Top 5% 대상자(쿠폰 발송 대상)
k_target = 5
k_n = max(int(np.ceil(len(df_test) * (k_target / 100))), 1)

top5 = df_test.sort_values("risk_score", ascending=False).head(k_n).copy()
print("Top5 rows:", len(top5), "unique users:", top5["user_id"].nunique())

top5.head(10)


Top5 rows: 5092 unique users: 5092


Unnamed: 0,user_id,anchor_time,risk_score,y_true
612601,573096859,2020-03-31,0.944746,0
797045,622358971,2020-03-31,0.944746,1
664063,584301560,2020-03-31,0.944737,1
666205,584797434,2020-03-31,0.944734,1
797055,622366290,2020-03-31,0.94472,1
797056,622366296,2020-03-31,0.94472,1
786513,616913666,2020-03-31,0.944718,1
660686,583522566,2020-03-31,0.944703,1
679266,587848687,2020-03-31,0.944702,1
797060,622369718,2020-03-31,0.944702,1


In [1]:
# 1) scoring 결과(필수): Streamlit에서 TopK 추출/다운로드하려면 필요
(df_test[["user_id","anchor_time","risk_score","y_true"]]
 .to_parquet(APP_READY / "scoring.parquet", index=False))

# 2) Top5만 별도 저장(선택이지만 쿠폰 발송에 편함)
(top5[["user_id","anchor_time","risk_score","y_true"]]
 .to_parquet(APP_READY / "topk_users_5pct.parquet", index=False))

# 3) metrics.json (너희가 말한 4개 지표)
row5 = topk_df[topk_df["k_percent"] == 5].iloc[0].to_dict()
metrics = {
    "PR-AUC (Average Precision)": float(ap),
    "상위 5% 정밀도 (Precision)": float(row5["precision"]),
    "상위 5% 재현율 (Recall)": float(row5["recall"]),
    "상위 5% 리프트 (Lift)": float(row5["lift"]),
    # 해석 안정성을 위한 메타(권장)
    "base_rate": float(row5["base_rate"]),
    "n_scored": int(row5["n_scored"]),
    "unit": "user",
    "lift_definition": "precision@5 / base_rate",
}
(APP_READY / "metrics.json").write_text(json.dumps(metrics, ensure_ascii=False, indent=2), encoding="utf-8")

# 4) topk_metrics.json (K별)
topk_metrics_out = {"unit": "user", "table": topk_rows}
(APP_READY / "topk_metrics.json").write_text(json.dumps(topk_metrics_out, ensure_ascii=False, indent=2), encoding="utf-8")

# 5) topk_cutoffs.json (K별 cutoff만)
cutoffs_out = {str(r["k_percent"]): {"cutoff": r["cutoff"], "n_topk": r["n_topk"]} for r in topk_rows}
(APP_READY / "topk_cutoffs.json").write_text(json.dumps(cutoffs_out, ensure_ascii=False, indent=2), encoding="utf-8")

# 6) model_card.json (간단 메타)
model_card = {
    "model_type": "LogisticRegression(sklearn)",
    "task": "binary classification (dormant=1 vs non-dormant=0)",
    "n_features": len(feature_cols),
    "data_paths": {k: str(v) for k,v in paths.items()},
    "eval_split": "test",
    "k_list": K_LIST,
}
(APP_READY / "model_card.json").write_text(json.dumps(model_card, ensure_ascii=False, indent=2), encoding="utf-8")

# 7) (선택) 모델 저장(joblib) - 앱에서 재사용/재현을 위해 저장 권장
joblib.dump(clf, APP_READY / "model.joblib")
(APP_READY / "feature_list.json").write_text(json.dumps({"features": feature_cols}, ensure_ascii=False, indent=2), encoding="utf-8")

print("Saved outputs to:", APP_READY)


NameError: name 'df_test' is not defined

In [24]:
import pandas as pd
from pathlib import Path

scoring_path = Path(r"C:\SKNfolder\SKN23-2nd-3Team\models\eval\lg\scoring.parquet")
df_score = pd.read_parquet(scoring_path)

print(df_score.shape)
print(df_score.columns)

# 유저 중복 체크(유저-앵커 단위인지 확인)
print("n_rows:", len(df_score))
print("n_unique_users:", df_score["user_id"].nunique())

# split이 있으면 바로 확인
if "split" in df_score.columns:
    print(df_score["split"].value_counts())


(101833, 4)
Index(['user_id', 'anchor_time', 'risk_score', 'y_true'], dtype='object')
n_rows: 101833
n_unique_users: 101833
