In [2]:
!pip install xgboost optuna pandas numpy

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m413.9/413.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.7.0


In [4]:
# =====================================================
# MAX PERFORMANCE XGBOOST COMPETITION PIPELINE
# =====================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import xgboost
print("XGBoost version:", xgboost.__version__)

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, f1_score
from xgboost import XGBClassifier
import optuna

# =====================================================
# 1. LOAD DATA
# =====================================================

file_path = r"/Customer_Churn.xlsx"
df = pd.read_excel(file_path)

df.drop(columns=["customerID"], errors="ignore", inplace=True)

df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"].fillna(0, inplace=True)
df["Churn"] = df["Churn"].map({"Yes":1, "No":0})

# =====================================================
# 2. STRONG FEATURE ENGINEERING
# =====================================================

df["avg_monthly"] = df["TotalCharges"] / (df["tenure"] + 1)
df["tenure_x_charge"] = df["tenure"] * df["MonthlyCharges"]
df["is_month_to_month"] = (df["Contract"] == "Month-to-month").astype(int)
df["fiber_flag"] = (df["InternetService"] == "Fiber optic").astype(int)

service_cols = [
    "PhoneService","MultipleLines","OnlineSecurity",
    "OnlineBackup","DeviceProtection",
    "TechSupport","StreamingTV","StreamingMovies"
]

existing_services = [c for c in service_cols if c in df.columns]
df["service_count"] = df[existing_services].apply(
    lambda row: sum(row == "Yes"), axis=1
)

# =====================================================
# 3. PREPARE DATA
# =====================================================

X = df.drop("Churn", axis=1)
y = df["Churn"]

X = pd.get_dummies(X, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

# =====================================================
# 4. OPTUNA TUNING (WIDE SEARCH SPACE)
# =====================================================

def objective(trial):

    params = {
        "n_estimators": trial.suggest_int("n_estimators", 800, 2000),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.15),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 15),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 20),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 20),
        "scale_pos_weight": scale_pos_weight,
        "tree_method": "hist",
        "predictor": "gpu_predictor",
        "eval_metric": "aucpr",
        "random_state": 42,
        "n_jobs": -1
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = XGBClassifier(**params)
        model.fit(X_tr, y_tr)

        preds = model.predict_proba(X_val)[:,1]
        score = average_precision_score(y_val, preds)
        scores.append(score)

    return np.mean(scores)

print("üîç Running aggressive Optuna search...")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=150)

best_params = study.best_params

best_params.update({
    "scale_pos_weight": scale_pos_weight,
    "tree_method": "hist",
    "predictor": "gpu_predictor",
    "eval_metric": "aucpr",
    "random_state": 42,
    "n_jobs": -1
})

print("Best Params Found:")
print(best_params)

# =====================================================
# 5. FINAL TRAINING WITH EARLY STOPPING
# =====================================================

final_xgb = XGBClassifier(
    **best_params,
    early_stopping_rounds=150
)

final_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=False
)

xgb_prob = final_xgb.predict_proba(X_test)[:,1]

auc_score = roc_auc_score(y_test, xgb_prob)
aucpr_score = average_precision_score(y_test, xgb_prob)

print("\nüî• FINAL XGBOOST AUC:", auc_score)
print("üî• FINAL AUC-PR:", aucpr_score)

# =====================================================
# 6. FEATURE PRUNING (REMOVE WEAK FEATURES)
# =====================================================

importances = final_xgb.feature_importances_
threshold = np.percentile(importances, 10)

important_cols = X_train.columns[importances > threshold]

X_train_pruned = X_train[important_cols]
X_test_pruned = X_test[important_cols]

print("Features kept:", len(important_cols))

# Retrain on pruned features
final_xgb_pruned = XGBClassifier(
    **best_params,
    early_stopping_rounds=150
)

final_xgb_pruned.fit(
    X_train_pruned,
    y_train,
    eval_set=[(X_test_pruned, y_test)],
    verbose=False
)

final_prob = final_xgb_pruned.predict_proba(X_test_pruned)[:,1]

final_auc = roc_auc_score(y_test, final_prob)
print("\nüî• PRUNED MODEL AUC:", final_auc)

# =====================================================
# 7. THRESHOLD OPTIMIZATION
# =====================================================

best_f1 = 0
best_threshold = 0.5

for t in np.arange(0.2, 0.8, 0.01):
    preds = (final_prob > t).astype(int)
    f1 = f1_score(y_test, preds)

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print("Best Threshold:", best_threshold)

y_pred_final = (final_prob > best_threshold).astype(int)

# =====================================================
# 8. FINAL REPORT
# =====================================================

print("\n===== FINAL CLASSIFICATION REPORT =====")
print(classification_report(y_test, y_pred_final))
print("FINAL AUC:", final_auc)



XGBoost version: 3.1.3


[I 2026-02-11 18:29:04,529] A new study created in memory with name: no-name-8c3c5f43-6570-4e2f-80d4-07cb479335c0


üîç Running aggressive Optuna search...


[I 2026-02-11 18:29:10,870] Trial 0 finished with value: 0.6672965597661786 and parameters: {'n_estimators': 1622, 'max_depth': 6, 'learning_rate': 0.047613071669619075, 'subsample': 0.6661542342918452, 'colsample_bytree': 0.7972263478884166, 'gamma': 6.886248516418133, 'min_child_weight': 13, 'reg_alpha': 2.8052032952017014, 'reg_lambda': 6.885409772285795}. Best is trial 0 with value: 0.6672965597661786.
[I 2026-02-11 18:29:14,530] Trial 1 finished with value: 0.6700298063885495 and parameters: {'n_estimators': 1451, 'max_depth': 2, 'learning_rate': 0.13894976190983852, 'subsample': 0.6761088639496132, 'colsample_bytree': 0.5092156866966919, 'gamma': 3.797336520982207, 'min_child_weight': 12, 'reg_alpha': 16.06376853445587, 'reg_lambda': 10.990307474420806}. Best is trial 1 with value: 0.6700298063885495.
[I 2026-02-11 18:29:26,871] Trial 2 finished with value: 0.6385863344964345 and parameters: {'n_estimators': 1486, 'max_depth': 5, 'learning_rate': 0.08103926292485233, 'subsample':

Best Params Found:
{'n_estimators': 1788, 'max_depth': 9, 'learning_rate': 0.12413495586925252, 'subsample': 0.5965306041991241, 'colsample_bytree': 0.6300681036179655, 'gamma': 3.7805016511251135, 'min_child_weight': 14, 'reg_alpha': 10.869924819619547, 'reg_lambda': 4.7449117733302595, 'scale_pos_weight': np.float64(2.768561872909699), 'tree_method': 'hist', 'predictor': 'gpu_predictor', 'eval_metric': 'aucpr', 'random_state': 42, 'n_jobs': -1}

üî• FINAL XGBOOST AUC: 0.8491565269058876
üî• FINAL AUC-PR: 0.663707286066256
Features kept: 31

üî• PRUNED MODEL AUC: 0.8500542509493916
Best Threshold: 0.5300000000000002

===== FINAL CLASSIFICATION REPORT =====
              precision    recall  f1-score   support

           0       0.91      0.76      0.83      1035
           1       0.54      0.79      0.64       374

    accuracy                           0.77      1409
   macro avg       0.72      0.77      0.73      1409
weighted avg       0.81      0.77      0.78      1409

FINA