In [None]:
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, precision_recall_curve

# Data preparation: every column except 'churn' is a feature; 'churn' is the label.
X = df.drop(columns=['churn'])
y = df['churn']

# Hold out 30% of the rows as a test set, stratified on the label, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Hyperparameter optimization (Optuna)
def objective(trial):
    """Return the mean cross-validated ROC AUC for one sampled configuration.

    The score is computed with stratified 5-fold cross-validation on the
    module-level ``X_train`` / ``y_train`` only. The held-out test set is
    deliberately not touched here: scoring trials on it would tune the
    hyperparameters on the very data used for the final evaluation and make
    the reported test metrics optimistic.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over the validation folds (higher is better).
    """
    params = {
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # Up-weight the positive (churn) class.
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 3),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        # NOTE(review): kept for older XGBoost releases; newer releases are
        # believed to ignore or warn about it - confirm against the
        # installed version.
        'use_label_encoder': False,
        'random_state': 42,
    }

    # Fixed seed so every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_aucs = []
    for fit_idx, val_idx in cv.split(X_train, y_train):
        X_fit, X_val = X_train.iloc[fit_idx], X_train.iloc[val_idx]
        y_fit, y_val = y_train.iloc[fit_idx], y_train.iloc[val_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(X_fit, y_fit)

        # Probability of the positive class on the validation fold.
        val_probs = model.predict_proba(X_val)[:, 1]
        fold_aucs.append(roc_auc_score(y_val, val_probs))

    return float(np.mean(fold_aucs))

# Run the Optuna search. The sampler is seeded so the sequence of sampled
# hyperparameters is repeatable from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# study.best_params contains only the values sampled through the trial, not
# the fixed settings used inside objective(). Add them back so the final
# model is the same configuration that was tuned (in particular, seeded).
best_params = study.best_params
final_params = {
    **best_params,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'random_state': 42,
}
best_xgb_model = xgb.XGBClassifier(**final_params)
best_xgb_model.fit(X_train, y_train)

# Predicted probability of the positive class on the test set
y_probs = best_xgb_model.predict_proba(X_test)[:, 1]

# ROC AUC on the test set
auc_score = roc_auc_score(y_test, y_probs)

# Find the decision threshold that maximizes F1.
# NOTE(review): the threshold is selected on the same test set it is then
# evaluated on, so the accuracy/report below are optimistic; consider
# choosing it on a validation split instead.
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
# The small epsilon avoids division by zero when precision + recall == 0.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)
# precision/recall have one more entry than thresholds (a final point with
# no associated threshold); exclude it so the argmax index is always valid.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]
# precision_recall_curve evaluates point i with `score >= thresholds[i]`, so
# the same inclusive comparison is needed to reproduce the selected F1.
y_pred_optimal = (y_probs >= best_threshold).astype(int)

# Evaluate the predictions made with the selected threshold
accuracy_optimal = accuracy_score(y_test, y_pred_optimal)
report_optimal = classification_report(y_test, y_pred_optimal, digits=4)

# Plot precision and recall against the decision threshold
plt.figure(figsize=(8, 6))
# The last precision/recall element is dropped so both series have the same
# length as `thresholds`.
plt.plot(thresholds, precision[:-1], linestyle="--", label="Precision")
plt.plot(thresholds, recall[:-1], label="Recall")
# Mark the selected threshold with a vertical line.
plt.axvline(
    x=best_threshold,
    color="red",
    linestyle="dashed",
    label=f"Best Threshold ({best_threshold:.4f})",
)
plt.title("Precision-Recall Tradeoff")
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.legend()
plt.show()

# Print the results: one summary line per metric, then the full report
print(
    f'Best Hyperparameters: {best_params}',
    f'Best Threshold: {best_threshold:.4f}',
    f'Adjusted Accuracy: {accuracy_optimal:.4f}',
    f'AUC Score: {auc_score:.4f}',
    sep='\n',
)
print('Classification Report:\n', report_optimal)
