In [1]:
# Train on encoded categorical data + Stratified K-Fold cross-validation

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score, f1_score, average_precision_score

In [14]:
# Load the preprocessed OkCupid table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/okcupid_preprocessed.csv")

In [15]:
# Parse the last_online column into timestamps and keep them on the frame.
last_seen = pd.to_datetime(df["last_online"])
df["last_online_dt"] = last_seen
ref_date = last_seen.max()  # latest observed login; serves as the reference "now"

# Churn definition: inactive for at least THRESHOLD_DAYS whole days before ref_date.
# NOTE(review): a missing last_online value would presumably become NaT, give a NaN
# day count and therefore be labelled churn=0 -- confirm the column has no gaps.
THRESHOLD_DAYS = 30
days_idle = (ref_date - last_seen).dt.days
df["inactive_days"] = days_idle
df["churn"] = days_idle.ge(THRESHOLD_DAYS).astype(int)

print("ref_date:", ref_date)
print("churn rate:", df["churn"].mean())

ref_date: 2012-07-01 00:00:00
churn rate: 0.259235158674542


In [16]:
# Build the feature matrix X.
# The drop list includes the label ("churn") and the columns it is computed from
# ("last_online", "last_online_dt", "inactive_days") so the target cannot leak into
# the features, plus several other columns excluded from modelling.
drop_cols = [
    "Unnamed: 0", "job", "last_online", "last_online_dt", "inactive_days", "churn",
    "status", "diet", "location", "pets", "city",
]

# Keep only the names actually present in df, so an absent column is not an error.
drop_list = [name for name in drop_cols if name in df.columns]

X = df.drop(columns=drop_list)

# Label vector
y = df["churn"]

print("X shape:", X.shape)

X shape: (59934, 35)


In [17]:
# Model: CatBoost classifier; hyperparameters are gathered in one mapping.
cb_params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "random_seed": 0,  # fixed seed for repeatable training
    "verbose": 0,      # silence per-iteration training logs
}
cb = CatBoostClassifier(**cb_params)

In [18]:
# Cross-validation: 5-fold stratified split, shuffled with a fixed seed.
#
# For every fold the model is fitted on the training part, scored with ROC-AUC on
# the validation part, and the decision threshold that maximises F1 for the
# positive class (churn = 1) is searched on that same validation part.
#
# NOTE: because the threshold is chosen and evaluated on the same validation fold,
# the reported best-F1 values are optimistic; treat them as an upper bound rather
# than an unbiased estimate.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold results, consumed by the summary cell.
auc_list = []
best_f1_list = []
best_thres_list = []

# Candidate thresholds: 0.05, 0.06, ..., 0.95 (91 evenly spaced points).
thresholds = np.linspace(0.05, 0.95, 91)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), start=1):
    X_train = X.iloc[tr_idx]
    X_val = X.iloc[va_idx]
    y_train = y.iloc[tr_idx]
    y_val = y.iloc[va_idx]

    # The same estimator object is fitted again on every fold.
    # NOTE(review): presumably each fit() call retrains from scratch -- confirm
    # against the CatBoost documentation.
    cb.fit(X_train, y_train)

    # Predicted probability of the positive class (second column).
    proba = cb.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, proba)
    auc_list.append(auc)

    best_thres = None
    best_f1 = -1

    # Linear scan over thresholds; the strict ">" keeps the lowest threshold on ties.
    for t in thresholds:
        pred = (proba >= t).astype(int)
        f1 = f1_score(y_val, pred, pos_label=1, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thres = t

    best_f1_list.append(best_f1)
    best_thres_list.append(best_thres)

    # Label matches the "BestF1(1)" wording of the summary cell and the recorded output.
    print(f"[Fold {fold}] AUC={auc:.4f}, bestF1(1)={best_f1:.4f}, best_threshold={best_thres:.2f}")

[Fold 1] AUC=0.6291, bestF1(1)=0.4339, best_threshold=0.22
[Fold 2] AUC=0.6184, bestF1(1)=0.4284, best_threshold=0.20
[Fold 3] AUC=0.6216, bestF1(1)=0.4340, best_threshold=0.21
[Fold 4] AUC=0.6256, bestF1(1)=0.4321, best_threshold=0.19
[Fold 5] AUC=0.6278, bestF1(1)=0.4361, best_threshold=0.22


In [None]:
# Aggregate the per-fold results (np.std uses its default, i.e. population std).
auc_mean, auc_std = np.mean(auc_list), np.std(auc_list)
f1_mean, f1_std = np.mean(best_f1_list), np.std(best_f1_list)
thres_mean = np.mean(best_thres_list)

print("ROC-AUC mean/std:", auc_mean, auc_std)
print("BestF1(1) mean/std:", f1_mean, f1_std)
print("Best threshold avg:", thres_mean)


ROC-AUC mean/std: 0.6244826487398095 0.00398154285651089
BestF1(1) mean/std: 0.43290194002721893 0.0026118154541043873
Best threshold avg: 0.20799999999999996
