# In [None]:
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, precision_recall_curve

# Data preparation: separate the feature columns from the 'churn' target.
# `df` is expected to be defined before this point (not shown here).
X = df.drop(columns=['churn'])
y = df['churn']

# Train/test split: 30% held out, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Hyperparameter optimisation (Optuna)
def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of an XGBoost classifier.

    Each candidate configuration is scored with stratified 5-fold
    cross-validation on the training split only. The held-out test split
    (``X_test`` / ``y_test``) is deliberately not used here: scoring trials
    on it would tune the hyperparameters to the test data and make every
    test metric reported after the search optimistically biased.

    Args:
        trial: The Optuna trial used to sample hyperparameter values.

    Returns:
        Mean ROC AUC over the validation folds (the study maximises it).
    """
    params = {
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # Up-weight the positive (churn) class.
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 3),
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        # NOTE(review): presumably deprecated/ignored on recent XGBoost
        # releases -- confirm against the installed version.
        'use_label_encoder': False,
        'random_state': 42
    }

    # Fixed seed so every trial is evaluated on the same folds.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_aucs = []
    for fit_idx, val_idx in cv.split(X_train, y_train):
        # Train on the fitting folds only.
        model = xgb.XGBClassifier(**params)
        model.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])

        # Predicted probability of the positive class on the validation fold.
        val_probs = model.predict_proba(X_train.iloc[val_idx])[:, 1]
        fold_aucs.append(roc_auc_score(y_train.iloc[val_idx], val_probs))

    # AUC is the optimisation criterion.
    return float(np.mean(fold_aucs))

# Run the Optuna search.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# Apply the best hyperparameters.
best_params = study.best_params
# study.best_params only contains the values sampled through trial.suggest_*;
# the fixed settings used inside `objective` must be added back, otherwise the
# final model falls back to library defaults (e.g. a different random seed)
# and is not the configuration that was actually scored during tuning.
final_params = {
    **best_params,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'random_state': 42,
}
best_xgb_model = xgb.XGBClassifier(**final_params)
best_xgb_model.fit(X_train, y_train)

# Predicted probability of the positive (churn) class on the test split.
y_probs = best_xgb_model.predict_proba(X_test)[:, 1]

# AUC score.
auc_score = roc_auc_score(y_test, y_probs)

# Find the decision threshold that maximises F1.
# NOTE(review): the threshold is selected and then evaluated on the same test
# split, so the thresholded metrics below are optimistic -- consider choosing
# it on a separate validation split.
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
# The 1e-6 term avoids division by zero when precision + recall == 0.
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-6)
# precision/recall have one trailing entry with no matching threshold;
# exclude it so the argmax is always a valid index into `thresholds`.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]
# precision_recall_curve counts a sample as positive when score >= threshold,
# so use >= here; a strict > would drop the samples sitting exactly on the
# chosen threshold and evaluate a different operating point.
y_pred_optimal = (y_probs >= best_threshold).astype(int)

# Evaluate with the selected threshold applied.
accuracy_optimal = accuracy_score(y_test, y_pred_optimal)
report_optimal = classification_report(y_test, y_pred_optimal, digits=4)

# Plot precision and recall against the decision threshold.
plt.figure(figsize=(8, 6))
# The [:-1] slices trim precision/recall to the same length as `thresholds`.
curves = (
    ("Precision", precision[:-1], {"linestyle": "--"}),
    ("Recall", recall[:-1], {}),
)
for curve_label, curve_values, line_kwargs in curves:
    plt.plot(thresholds, curve_values, label=curve_label, **line_kwargs)

# Mark the selected threshold with a vertical line.
threshold_label = f"Best Threshold ({best_threshold:.4f})"
plt.axvline(
    x=best_threshold,
    color="red",
    linestyle="dashed",
    label=threshold_label,
)

plt.xlabel("Threshold")
plt.ylabel("Score")
plt.title("Precision-Recall Tradeoff")
plt.legend()
plt.show()

# Print the results: one summary line per metric, then the full report.
summary_lines = [
    f'Best Hyperparameters: {best_params}',
    f'Best Threshold: {best_threshold:.4f}',
    f'Adjusted Accuracy: {accuracy_optimal:.4f}',
    f'AUC Score: {auc_score:.4f}',
]
for summary_line in summary_lines:
    print(summary_line)
print('Classification Report:\n', report_optimal)
