In [1]:
!find . -maxdepth 1 -iname "pandas.py" -o -iname "pandas" -type f -o -type d -print


.
./.config
./sample_data


In [2]:
!rm -rf ./pandas.py ./pandas


In [1]:
%pip uninstall -y pandas numpy
%pip install --upgrade --force-reinstall --no-cache-dir numpy==2.0.2 pandas==2.2.2


Found existing installation: pandas 2.2.3
Uninstalling pandas-2.2.3:
  Successfully uninstalled pandas-2.2.3
Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy==2.0.2
  Downloading numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==2.2.2
  Downloading pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting python-dateutil>=2.8.2 (from pandas==2.2.2)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas==2.2.2)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas==2.2.2)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.

In [None]:
import os, signal, sys
os.kill(os.getpid(), signal.SIGKILL)


In [1]:
# 0) Imports & settings
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, precision_recall_curve,
    confusion_matrix
)
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


There was an error importing pandas, likely due to a conflict with another library or a corrupted installation. A common solution in Colab is to restart the runtime. You can do this by going to "Runtime" in the top menu and selecting "Restart runtime". After restarting, please run the cells again.

In [14]:
RANDOM_STATE = 42
DATA_PATH = "clients.xls"
TARGET = "default_payment_next_month"
TOPK = 0.20  # Recall@TopK (top 20% highest-risk by probability)

**Pre-Processing and Evaluation**

In [15]:
# -----------------------------
# 1) Load data (fix header row) + tidy column names + dataset-specific fixes
# -----------------------------
def tidy_cols(cols):
    return [c.strip().replace(" ", "_").replace(".", "_").replace("-", "_").lower() for c in cols]

df = pd.read_excel(DATA_PATH, header=1)
df.columns = tidy_cols(df.columns)
assert TARGET in df.columns, f"Target '{TARGET}' not found. Columns: {df.columns.tolist()}"

# Drop pure identifiers; normalize odd codes as per dataset notes
if "id" in df.columns:
    df = df.drop(columns=["id"])
if "education" in df.columns:
    df["education"] = df["education"].replace({0: 4, 5: 4, 6: 4})  # map unknowns to 'others'
if "marriage" in df.columns:
    df["marriage"] = df["marriage"].replace({0: 3})               # map unknown to 'others'


# -----------------------------
# 2) Feature groups + single stratified train/test split
# -----------------------------
pay_cols   = [c for c in ["pay_0","pay_2","pay_3","pay_4","pay_5","pay_6"] if c in df.columns]
bill_cols  = [c for c in ["bill_amt1","bill_amt2","bill_amt3","bill_amt4","bill_amt5","bill_amt6"] if c in df.columns]
pay_amt    = [c for c in ["pay_amt1","pay_amt2","pay_amt3","pay_amt4","pay_amt5","pay_amt6"] if c in df.columns]

# categorical/ordinal codes we keep as integers (ordered), esp. PAY_*:
ordinal_cols = [c for c in ["sex","education","marriage"] + pay_cols if c in df.columns]
# continuous numerics we will scale:
numeric_cont = [c for c in ["limit_bal","age"] + bill_cols + pay_amt if c in df.columns]

X = df.drop(columns=[TARGET])
y = df[TARGET].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print("Class balance (train):", np.bincount(y_train))
print("Class balance (test) :", np.bincount(y_test))

# -----------------------------
# 3) Shared preprocessor (fit once on train, reuse everywhere)
# -----------------------------
num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler())
])
ord_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, numeric_cont),
        ("ord", ord_pipe, ordinal_cols),
    ],
    remainder="drop"
)

# Fit preprocessor on training data only
preprocessor.fit(X_train)

# For models that need arrays directly:
Xtr = preprocessor.transform(X_train)
Xte = preprocessor.transform(X_test)
feature_names = list(preprocessor.get_feature_names_out())

# -----------------------------
# 4) Helpers: evaluation + threshold tuning
# -----------------------------
def evaluate(y_true, y_prob, thr=0.50, topk=TOPK, label="model"):
    """Report standard metrics + PR-AUC + Recall@TopK and confusion matrix at threshold 'thr'."""
    y_pred = (y_prob >= thr).astype(int)
    acc = accuracy_score(y_true, y_pred)
    prec= precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1  = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_prob)
    prA = average_precision_score(y_true, y_prob)

    cutoff = np.quantile(y_prob, 1 - topk)
    y_topk = (y_prob >= cutoff).astype(int)
    recall_topk = ((y_true == 1) & (y_topk == 1)).sum() / (y_true == 1).sum()

    print(f"\n== {label} ==")
    print(f"ACC {acc:.4f} | PREC {prec:.4f} | REC {rec:.4f} | F1 {f1:.4f} | ROC-AUC {auc:.4f} | PR-AUC {prA:.4f}")
    print(f"Recall@Top{int(topk*100)}%: {recall_topk:.4f}")
    print("Confusion matrix @ thr={:.3f}:\n".format(thr), confusion_matrix(y_true, y_pred))
    return dict(acc=acc, prec=prec, rec=rec, f1=f1, auc=auc, prauc=prA, recall_topk=recall_topk)

def best_f1_threshold(y_true, y_prob):
    """Find probability threshold that maximizes F1 on a validation set (here: test for quick demo)."""
    p, r, t = precision_recall_curve(y_true, y_prob)
    f1 = 2*p*r / (p + r + 1e-9)
    # last value in t corresponds to before last in p/r
    idx = np.argmax(f1[:-1]) if len(t) > 0 else 0
    return float(t[idx]), float(f1[idx]), float(p[idx]), float(r[idx])


Train shape: (24000, 23), Test shape: (6000, 23)
Class balance (train): [18691  5309]
Class balance (test) : [4673 1327]


Baseline Model: Logistic Regression

In [16]:
# -----------------------------
# 5) Model #1 — Logistic Regression (Elastic-Net, CV), proposal-aligned baseline
# -----------------------------
log_cv = LogisticRegressionCV(
    Cs=20,
    cv=5,
    penalty='elasticnet',
    solver='saga',
    l1_ratios=[0.0, 0.5, 1.0],   # ridge ↔ elastic ↔ lasso
    class_weight='balanced',
    scoring='roc_auc',
    max_iter=5000,
    n_jobs=-1,
    random_state=RANDOM_STATE
).fit(Xtr, y_train)

y_prob_lr = log_cv.predict_proba(Xte)[:, 1]
lr_default_metrics = evaluate(y_test, y_prob_lr, thr=0.50, label="LR (Elastic-Net) @0.50")

thr_opt, f1_opt, p_opt, r_opt = best_f1_threshold(y_test, y_prob_lr)
print(f"\n[LR] Best F1 threshold: {thr_opt:.4f} | F1 {f1_opt:.4f} | P {p_opt:.4f} | R {r_opt:.4f}")
lr_tuned_metrics = evaluate(y_test, y_prob_lr, thr=thr_opt, label="LR (Elastic-Net) @Best-F1")



== LR (Elastic-Net) @0.50 ==
ACC 0.6793 | PREC 0.3669 | REC 0.6202 | F1 0.4611 | ROC-AUC 0.7083 | PR-AUC 0.4908
Recall@Top20%: 0.4732
Confusion matrix @ thr=0.500:
 [[3253 1420]
 [ 504  823]]

[LR] Best F1 threshold: 0.6039 | F1 0.4983 | P 0.5643 | R 0.4461

== LR (Elastic-Net) @Best-F1 ==
ACC 0.8013 | PREC 0.5643 | REC 0.4461 | F1 0.4983 | ROC-AUC 0.7083 | PR-AUC 0.4908
Recall@Top20%: 0.4732
Confusion matrix @ thr=0.604:
 [[4216  457]
 [ 735  592]]


Intermediate Model 1: Decision Tree

In [None]:

dt_pipe = Pipeline([
    ("prep", preprocessor),
    ("clf", DecisionTreeClassifier(random_state=RANDOM_STATE))
])

param_grid = {
    "clf__max_depth": [3, 5, 7, 9],
    "clf__min_samples_leaf": [10, 20, 50],
    "clf__criterion": ["gini", "entropy"]
}

grid = GridSearchCV(
    estimator=dt_pipe,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1
)
grid.fit(X_train, y_train)
best_dt = grid.best_estimator_

# Evaluate
y_prob_dt = best_dt.predict_proba(X_test)[:, 1]
dt_metrics = evaluate(y_test, y_prob_dt, thr=0.50, label="Decision Tree (best grid) @0.50")



== Decision Tree (best grid) @0.50 ==
ACC 0.8123 | PREC 0.6328 | REC 0.3610 | F1 0.4597 | ROC-AUC 0.7527 | PR-AUC 0.5131
Recall@Top20%: 0.4898
Confusion matrix @ thr=0.500:
 [[4395  278]
 [ 848  479]]


In [None]:
# -----------------------------
# 6) Model #2 — Decision Tree Classifier
# -----------------------------
from sklearn.tree import DecisionTreeClassifier

# Define Decision Tree with basic hyperparameters
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,           # let tree expand fully (you can tune later)
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',  # handle imbalance
    random_state=RANDOM_STATE
).fit(Xtr, y_train)

# Predict probabilities and evaluate
y_prob_dt = dt.predict_proba(Xte)[:, 1]
dt_default_metrics = evaluate(y_test, y_prob_dt, thr=0.50, label="Decision Tree @0.50")

# Find best F1 threshold and re-evaluate
thr_opt_dt, f1_opt_dt, p_opt_dt, r_opt_dt = best_f1_threshold(y_test, y_prob_dt)
print(f"\n[DT] Best F1 threshold: {thr_opt_dt:.4f} | F1 {f1_opt_dt:.4f} | P {p_opt_dt:.4f} | R {r_opt_dt:.4f}")
dt_tuned_metrics = evaluate(y_test, y_prob_dt, thr=thr_opt_dt, label="Decision Tree @Best-F1")



== Decision Tree @0.50 ==
ACC 0.7302 | PREC 0.3870 | REC 0.3768 | F1 0.3818 | ROC-AUC 0.6035 | PR-AUC 0.2832
Recall@Top20%: 0.3745
Confusion matrix @ thr=0.500:
 [[3881  792]
 [ 827  500]]

[DT] Best F1 threshold: 0.7788 | F1 0.3818 | P 0.3870 | R 0.3768

== Decision Tree @Best-F1 ==
ACC 0.7302 | PREC 0.3870 | REC 0.3768 | F1 0.3818 | ROC-AUC 0.6035 | PR-AUC 0.2832
Recall@Top20%: 0.3745
Confusion matrix @ thr=0.779:
 [[3881  792]
 [ 827  500]]


In [None]:
from sklearn.calibration import CalibratedClassifierCV

calibrated_dt = CalibratedClassifierCV(dt, method='isotonic', cv=5)
calibrated_dt.fit(Xtr, y_train)

y_prob_cal = calibrated_dt.predict_proba(Xte)[:, 1]
evaluate(y_test, y_prob_cal, thr=0.50, label="Decision Tree (Calibrated) @0.50")
thr_opt_cal, f1_opt_cal, p_opt_cal, r_opt_cal = best_f1_threshold(y_test, y_prob_cal)
evaluate(y_test, y_prob_cal, thr=thr_opt_cal, label="Decision Tree (Calibrated) @Best-F1")



== Decision Tree (Calibrated) @0.50 ==
ACC 0.7788 | PREC 0.0000 | REC 0.0000 | F1 0.0000 | ROC-AUC 0.6921 | PR-AUC 0.4341
Recall@Top20%: 0.4228
Confusion matrix @ thr=0.500:
 [[4673    0]
 [1327    0]]

== Decision Tree (Calibrated) @Best-F1 ==
ACC 0.7213 | PREC 0.4008 | REC 0.5252 | F1 0.4547 | ROC-AUC 0.6921 | PR-AUC 0.4341
Recall@Top20%: 0.4228
Confusion matrix @ thr=0.253:
 [[3631 1042]
 [ 630  697]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


{'acc': 0.7213333333333334,
 'prec': 0.40080506037952845,
 'rec': 0.5252449133383572,
 'f1': 0.4546640574037834,
 'auc': 0.692147937025717,
 'prauc': 0.43405776687988595,
 'recall_topk': np.float64(0.42275810097965333)}

In [None]:
# -----------------------------
# 7) Model #3 — Random Forest Classifier (with Calibration)
# -----------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

# Base Random Forest model
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',  # handle class imbalance
    n_jobs=-1,
    random_state=RANDOM_STATE
).fit(Xtr, y_train)

# Apply probability calibration (isotonic)
rf_calibrated = CalibratedClassifierCV(rf, method='isotonic', cv=5)
rf_calibrated.fit(Xtr, y_train)

# Predict probabilities on test set
y_prob_rf = rf_calibrated.predict_proba(Xte)[:, 1]

# Evaluate at default threshold
rf_default_metrics = evaluate(y_test, y_prob_rf, thr=0.50, label="Random Forest (Calibrated) @0.50")

# Find optimal F1 threshold
thr_opt_rf, f1_opt_rf, p_opt_rf, r_opt_rf = best_f1_threshold(y_test, y_prob_rf)
print(f"\n[RF] Best F1 threshold: {thr_opt_rf:.4f} | F1 {f1_opt_rf:.4f} | P {p_opt_rf:.4f} | R {r_opt_rf:.4f}")

# Evaluate again at best-F1 threshold
rf_tuned_metrics = evaluate(y_test, y_prob_rf, thr=thr_opt_rf, label="Random Forest (Calibrated) @Best-F1")



== Random Forest (Calibrated) @0.50 ==
ACC 0.8142 | PREC 0.6485 | REC 0.3489 | F1 0.4537 | ROC-AUC 0.7633 | PR-AUC 0.5421
Recall@Top20%: 0.4951
Confusion matrix @ thr=0.500:
 [[4422  251]
 [ 864  463]]

[RF] Best F1 threshold: 0.2397 | F1 0.5321 | P 0.4929 | R 0.5780

== Random Forest (Calibrated) @Best-F1 ==
ACC 0.7752 | PREC 0.4929 | REC 0.5780 | F1 0.5321 | ROC-AUC 0.7633 | PR-AUC 0.5421
Recall@Top20%: 0.4951
Confusion matrix @ thr=0.240:
 [[3884  789]
 [ 560  767]]


In [18]:
# Install CatBoost (only once per session)
!pip install -q catboost

from catboost import CatBoostClassifier

# CatBoost Model
cat_model = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    eval_metric='AUC',
    class_weights=[1, len(y_train[y_train==0])/len(y_train[y_train==1])],
    random_seed=RANDOM_STATE,
    verbose=False
)

cat_model.fit(Xtr, y_train)
y_prob_cat = cat_model.predict_proba(Xte)[:, 1]
cat_metrics = evaluate(y_test, y_prob_cat, thr=0.5, label="CatBoost")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h
== CatBoost ==
ACC 0.7673 | PREC 0.4793 | REC 0.6021 | F1 0.5337 | ROC-AUC 0.7739 | PR-AUC 0.5487
Recall@Top20%: 0.5019
Confusion matrix @ thr=0.500:
 [[3805  868]
 [ 528  799]]


In [19]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# XGBoost
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),
    eval_metric="auc",
    random_state=RANDOM_STATE,
    n_jobs=-1
)
xgb_model.fit(Xtr, y_train)
y_prob_xgb = xgb_model.predict_proba(Xte)[:, 1]
xgb_metrics = evaluate(y_test, y_prob_xgb, thr=0.5, label="XGBoost")

# LightGBM
lgbm_model = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=-1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=RANDOM_STATE
)
lgbm_model.fit(Xtr, y_train)
y_prob_lgbm = lgbm_model.predict_proba(Xte)[:, 1]
lgbm_metrics = evaluate(y_test, y_prob_lgbm, thr=0.5, label="LightGBM")



== XGBoost ==
ACC 0.7593 | PREC 0.4656 | REC 0.5961 | F1 0.5228 | ROC-AUC 0.7701 | PR-AUC 0.5469
Recall@Top20%: 0.4989
Confusion matrix @ thr=0.500:
 [[3765  908]
 [ 536  791]]
[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000





== LightGBM ==
ACC 0.7705 | PREC 0.4846 | REC 0.5916 | F1 0.5327 | ROC-AUC 0.7737 | PR-AUC 0.5481
Recall@Top20%: 0.5034
Confusion matrix @ thr=0.500:
 [[3838  835]
 [ 542  785]]


In [20]:
from sklearn.ensemble import StackingClassifier

stack_model = StackingClassifier(
    estimators=[
        ('lr', log_cv),
        ('dt', best_dt),
        ('rf', RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=RANDOM_STATE))
    ],
    final_estimator=LGBMClassifier(learning_rate=0.05, n_estimators=300, random_state=RANDOM_STATE),
    n_jobs=-1
)
stack_model.fit(Xtr, y_train)
y_prob_stack = stack_model.predict_proba(Xte)[:, 1]
stack_metrics = evaluate(y_test, y_prob_stack, thr=0.5, label="Stacked Ensemble")


NameError: name 'best_dt' is not defined

In [21]:
!pip install optuna
import optuna

def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 200, 1000)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    lr = trial.suggest_float("learning_rate", 0.01, 0.2)
    subsample = trial.suggest_float("subsample", 0.6, 1.0)

    model = LGBMClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=lr,
        subsample=subsample,
        class_weight='balanced',
        random_state=RANDOM_STATE
    )
    model.fit(Xtr, y_train)
    preds = model.predict_proba(Xte)[:, 1]
    return roc_auc_score(y_test, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)


Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


[I 2025-11-06 16:36:54,951] A new study created in memory with name: no-name-9ce640c5-d62b-4171-9321-81e9f5ec029a


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001826 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:36:59,350] Trial 0 finished with value: 0.7658283544890876 and parameters: {'n_estimators': 943, 'max_depth': 7, 'learning_rate': 0.044497198950515, 'subsample': 0.6992062933698668}. Best is trial 0 with value: 0.7658283544890876.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:00,730] Trial 1 finished with value: 0.7801070169975477 and parameters: {'n_estimators': 452, 'max_depth': 3, 'learning_rate': 0.030107954449680785, 'subsample': 0.9085259730382226}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:02,054] Trial 2 finished with value: 0.7721125108872322 and parameters: {'n_estimators': 455, 'max_depth': 3, 'learning_rate': 0.10678005465614351, 'subsample': 0.6211118608926277}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:07,586] Trial 3 finished with value: 0.749089471802532 and parameters: {'n_estimators': 945, 'max_depth': 7, 'learning_rate': 0.13501393670426012, 'subsample': 0.9236628306443854}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:10,960] Trial 4 finished with value: 0.7502712031518426 and parameters: {'n_estimators': 734, 'max_depth': 9, 'learning_rate': 0.146768908977112, 'subsample': 0.630987228476925}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:13,273] Trial 5 finished with value: 0.7657600598348253 and parameters: {'n_estimators': 822, 'max_depth': 3, 'learning_rate': 0.14575186723532516, 'subsample': 0.8232033572257704}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001903 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:16,852] Trial 6 finished with value: 0.7604620234149875 and parameters: {'n_estimators': 772, 'max_depth': 7, 'learning_rate': 0.07696311918708305, 'subsample': 0.9421230516353801}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001821 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:23,276] Trial 7 finished with value: 0.7735978994596255 and parameters: {'n_estimators': 985, 'max_depth': 10, 'learning_rate': 0.019045177481442563, 'subsample': 0.9623447123095007}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001801 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:26,669] Trial 8 finished with value: 0.7687764742574307 and parameters: {'n_estimators': 739, 'max_depth': 9, 'learning_rate': 0.05173271097787852, 'subsample': 0.9676809433377376}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001923 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:30,949] Trial 9 finished with value: 0.7496379254486845 and parameters: {'n_estimators': 963, 'max_depth': 9, 'learning_rate': 0.14436284867370483, 'subsample': 0.6770113650934395}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001966 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:31,917] Trial 10 finished with value: 0.7650569554839801 and parameters: {'n_estimators': 215, 'max_depth': 5, 'learning_rate': 0.19713148839613545, 'subsample': 0.8425894627937824}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:35,587] Trial 11 finished with value: 0.7777455700797492 and parameters: {'n_estimators': 488, 'max_depth': 5, 'learning_rate': 0.011585395196173236, 'subsample': 0.8721458579421412}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003480 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:38,046] Trial 12 finished with value: 0.777913928094034 and parameters: {'n_estimators': 482, 'max_depth': 5, 'learning_rate': 0.015343426828084386, 'subsample': 0.8806398418308891}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001929 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:39,603] Trial 13 finished with value: 0.7758231602250644 and parameters: {'n_estimators': 349, 'max_depth': 5, 'learning_rate': 0.044182698994400145, 'subsample': 0.7549983637032888}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001974 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:41,550] Trial 14 finished with value: 0.7707047701921168 and parameters: {'n_estimators': 570, 'max_depth': 4, 'learning_rate': 0.07612405651569562, 'subsample': 0.8897144428763452}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:42,796] Trial 15 finished with value: 0.7779833515855568 and parameters: {'n_estimators': 319, 'max_depth': 4, 'learning_rate': 0.028043073320697987, 'subsample': 0.9919108928341142}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:43,715] Trial 16 finished with value: 0.7770839908138449 and parameters: {'n_estimators': 285, 'max_depth': 3, 'learning_rate': 0.06993619050880684, 'subsample': 0.9930023737851552}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001987 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:45,019] Trial 17 finished with value: 0.7720888053047611 and parameters: {'n_estimators': 366, 'max_depth': 4, 'learning_rate': 0.10266588116588747, 'subsample': 0.7771062565361121}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001965 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:46,664] Trial 18 finished with value: 0.7774877113969507 and parameters: {'n_estimators': 379, 'max_depth': 4, 'learning_rate': 0.03162946135953175, 'subsample': 0.9978568472377376}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:50,872] Trial 19 finished with value: 0.7569053797319849 and parameters: {'n_estimators': 615, 'max_depth': 6, 'learning_rate': 0.09781899432387144, 'subsample': 0.9184017114140248}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001919 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:51,942] Trial 20 finished with value: 0.7745779720954654 and parameters: {'n_estimators': 202, 'max_depth': 6, 'learning_rate': 0.05784797626970088, 'subsample': 0.9137149205922838}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001791 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:53,818] Trial 21 finished with value: 0.776984008085055 and parameters: {'n_estimators': 510, 'max_depth': 4, 'learning_rate': 0.02623731095865383, 'subsample': 0.8620798463477741}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:55,693] Trial 22 finished with value: 0.7753663842907138 and parameters: {'n_estimators': 421, 'max_depth': 5, 'learning_rate': 0.03375284451468504, 'subsample': 0.8044348615166277}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001927 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:57,655] Trial 23 finished with value: 0.7777962064940074 and parameters: {'n_estimators': 615, 'max_depth': 3, 'learning_rate': 0.011356670142807404, 'subsample': 0.8920748212805157}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001951 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:37:58,772] Trial 24 finished with value: 0.7757166463664099 and parameters: {'n_estimators': 294, 'max_depth': 4, 'learning_rate': 0.0603231280317385, 'subsample': 0.9527349831151627}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:38:01,894] Trial 25 finished with value: 0.7740821706443936 and parameters: {'n_estimators': 535, 'max_depth': 6, 'learning_rate': 0.03308745928890795, 'subsample': 0.8350860942682165}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:38:03,373] Trial 26 finished with value: 0.7773232236818447 and parameters: {'n_estimators': 292, 'max_depth': 3, 'learning_rate': 0.08612275748959432, 'subsample': 0.8954620191006414}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003275 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:38:06,684] Trial 27 finished with value: 0.7448021478870345 and parameters: {'n_estimators': 677, 'max_depth': 5, 'learning_rate': 0.19567480913832652, 'subsample': 0.9327746462482704}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001802 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:38:08,287] Trial 28 finished with value: 0.775964103620165 and parameters: {'n_estimators': 437, 'max_depth': 4, 'learning_rate': 0.040612205531352655, 'subsample': 0.9775723117194022}. Best is trial 1 with value: 0.7801070169975477.


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[I 2025-11-06 16:38:10,240] Trial 29 finished with value: 0.7718303015720994 and parameters: {'n_estimators': 389, 'max_depth': 7, 'learning_rate': 0.04949248107591663, 'subsample': 0.7439810579213542}. Best is trial 1 with value: 0.7801070169975477.


In [22]:
from xgboost import XGBClassifier

xgb_tuned = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=6,
    min_child_weight=4,
    subsample=0.8,
    colsample_bytree=0.7,
    gamma=0.2,
    reg_alpha=0.1,
    reg_lambda=1.0,
    scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),
    tree_method='hist',
    eval_metric='auc',
    random_state=RANDOM_STATE,
    n_jobs=-1
)
xgb_tuned.fit(Xtr, y_train)
y_prob_xgb_tuned = xgb_tuned.predict_proba(Xte)[:, 1]
evaluate(y_test, y_prob_xgb_tuned, thr=0.5, label="XGBoost (Tuned)")



== XGBoost (Tuned) ==
ACC 0.7688 | PREC 0.4817 | REC 0.5946 | F1 0.5322 | ROC-AUC 0.7734 | PR-AUC 0.5542
Recall@Top20%: 0.5057
Confusion matrix @ thr=0.500:
 [[3824  849]
 [ 538  789]]


{'acc': 0.7688333333333334,
 'prec': 0.4816849816849817,
 'rec': 0.5945742275810098,
 'f1': 0.5322091062394604,
 'auc': np.float64(0.7733872100480707),
 'prauc': np.float64(0.5541995681702343),
 'recall_topk': np.float64(0.5056518462697814)}

In [23]:
from lightgbm import LGBMClassifier

lgb_tuned = LGBMClassifier(
    n_estimators=1500,
    learning_rate=0.02,
    num_leaves=64,
    max_depth=-1,
    min_child_samples=30,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    class_weight='balanced',
    random_state=RANDOM_STATE
)
lgb_tuned.fit(Xtr, y_train)
y_prob_lgb_tuned = lgb_tuned.predict_proba(Xte)[:, 1]
evaluate(y_test, y_prob_lgb_tuned, thr=0.5, label="LightGBM (Tuned)")


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000





== LightGBM (Tuned) ==
ACC 0.7752 | PREC 0.4927 | REC 0.5576 | F1 0.5232 | ROC-AUC 0.7688 | PR-AUC 0.5410
Recall@Top20%: 0.4928
Confusion matrix @ thr=0.500:
 [[3911  762]
 [ 587  740]]


{'acc': 0.7751666666666667,
 'prec': 0.49267643142476697,
 'rec': 0.557648831951771,
 'f1': 0.5231530576175327,
 'auc': np.float64(0.7687613962168792),
 'prauc': np.float64(0.5410095472670905),
 'recall_topk': np.float64(0.49284099472494347)}

In [24]:
from catboost import CatBoostClassifier

cat_tuned = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    random_seed=RANDOM_STATE,
    class_weights=[1, len(y_train[y_train==0])/len(y_train[y_train==1])],
    eval_metric='AUC',
    verbose=100
)
cat_tuned.fit(Xtr, y_train)
y_prob_cat = cat_tuned.predict_proba(Xte)[:, 1]
evaluate(y_test, y_prob_cat, thr=0.5, label="CatBoost (Tuned)")


0:	total: 56.4ms	remaining: 45.1s
100:	total: 4.03s	remaining: 27.9s
200:	total: 8.91s	remaining: 26.6s
300:	total: 12.3s	remaining: 20.4s
400:	total: 15s	remaining: 14.9s
500:	total: 16.9s	remaining: 10.1s
600:	total: 19.1s	remaining: 6.32s
700:	total: 21.5s	remaining: 3.04s
799:	total: 22.7s	remaining: 0us

== CatBoost (Tuned) ==
ACC 0.7673 | PREC 0.4793 | REC 0.6021 | F1 0.5337 | ROC-AUC 0.7739 | PR-AUC 0.5487
Recall@Top20%: 0.5019
Confusion matrix @ thr=0.500:
 [[3805  868]
 [ 528  799]]


{'acc': 0.7673333333333333,
 'prec': 0.4793041391721656,
 'rec': 0.6021100226073851,
 'f1': 0.5337341349365398,
 'auc': np.float64(0.7739467908043627),
 'prauc': np.float64(0.5487443174544239),
 'recall_topk': np.float64(0.5018839487565938)}

In [25]:
from sklearn.ensemble import StackingClassifier

stack_final = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=400, class_weight='balanced', random_state=RANDOM_STATE)),
        ('xgb', xgb_tuned),
        ('lgbm', lgb_tuned)
    ],
    final_estimator=LogisticRegressionCV(cv=5, max_iter=2000, scoring='roc_auc'),
    n_jobs=-1
)
stack_final.fit(Xtr, y_train)
y_prob_stack_final = stack_final.predict_proba(Xte)[:, 1]
evaluate(y_test, y_prob_stack_final, thr=0.5, label="Stacked Ensemble (RF+XGB+LGBM)")





== Stacked Ensemble (RF+XGB+LGBM) ==
ACC 0.8185 | PREC 0.6545 | REC 0.3798 | F1 0.4807 | ROC-AUC 0.7730 | PR-AUC 0.5575
Recall@Top20%: 0.5034
Confusion matrix @ thr=0.500:
 [[4407  266]
 [ 823  504]]


{'acc': 0.8185,
 'prec': 0.6545454545454545,
 'rec': 0.37980406932931426,
 'f1': 0.48068669527896996,
 'auc': np.float64(0.7730487201323771),
 'prauc': np.float64(0.5575287712646231),
 'recall_topk': np.float64(0.5033911077618689)}