# EDA 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

  from pandas.core import (


In [2]:
train = pd.read_csv('train.csv')

In [3]:
# Identify categorical and numerical features
TARGET = 'Heart Disease'
ID_COL = 'id'

train_cols = [col for col in train.columns if col not in [ID_COL, TARGET]]
CATS = train[train_cols].select_dtypes(include=['object']).columns.tolist()
NUMS = train[train_cols].select_dtypes(exclude=['object']).columns.tolist()

In [4]:
skew_kurt_df = pd.DataFrame({
    'Feature': NUMS,
    'Skewness': [train[col].skew() for col in NUMS],
    'Kurtosis': [train[col].kurtosis() for col in NUMS],
    'Mean': [train[col].mean() for col in NUMS],
    'Std': [train[col].std() for col in NUMS]
})

skew_kurt_df['Skew_Type'] = skew_kurt_df['Skewness'].apply(
    lambda x: 'Highly Right' if x > 1 else ('Right' if x > 0.5 else ('Symmetric' if abs(x) <= 0.5 else ('Left' if x < -0.5 else 'Highly Left')))
)

skew_kurt_df = skew_kurt_df.sort_values('Skewness', key=abs, ascending=False)

print("="*80)
print("SKEWNESS AND KURTOSIS ANALYSIS")
print("="*80)
display(skew_kurt_df.style.background_gradient(subset=['Skewness', 'Kurtosis'], cmap='coolwarm'))

print("\nInterpretation:")
print("  • Skewness > 1 or < -1: Highly skewed (consider log transform)")
print("  • Skewness between -0.5 and 0.5: Approximately symmetric")
print("  • Kurtosis > 3: Heavy tails (more outliers)")
print("  • Kurtosis < 3: Light tails (fewer outliers)")

SKEWNESS AND KURTOSIS ANALYSIS


Unnamed: 0,Feature,Skewness,Kurtosis,Mean,Std,Skew_Type
5,FBS over 120,3.096606,7.588996,0.079987,0.271274,Highly Right
11,Number of vessels fluro,1.736669,2.096608,0.45104,0.798549,Highly Right
9,ST depression,1.328429,1.374855,0.716028,0.948472,Highly Right
2,Chest pain type,-1.085939,0.368411,3.312752,0.851615,Left
8,Exercise angina,1.014983,-0.969812,0.273725,0.44587,Highly Right
1,Sex,-0.951123,-1.095368,0.714735,0.451541,Left
7,Max HR,-0.754556,0.203852,152.816763,19.112927,Left
10,Slope of ST,0.630518,-0.736092,1.455871,0.545192,Right
3,BP,0.629283,0.933183,130.497433,14.975802,Right
12,Thallium,0.381173,-1.844162,4.618873,1.950007,Symmetric



Interpretation:
  • Skewness > 1 or < -1: Highly skewed (consider log transform)
  • Skewness between -0.5 and 0.5: Approximately symmetric
  • Kurtosis > 3: Heavy tails (more outliers)
  • Kurtosis < 3: Light tails (fewer outliers)


In [5]:
print("="*80)
print("STATISTICAL SIGNIFICANCE TESTS (Numerical Features vs Target)")
print("="*80)
print("\nUsing Mann-Whitney U Test (non-parametric)")
print("H0: The distributions are the same")
print("H1: The distributions are different")
print(f"Significance Level: α = 0.05\n")

significance_results = []

for col in NUMS:
    groups = [train[train[TARGET] == val][col].values for val in sorted(train[TARGET].unique())]
    
    if len(groups) == 2:
        stat, p_value = stats.mannwhitneyu(groups[0], groups[1], alternative='two-sided')
        
        significance_results.append({
            'Feature': col,
            'Test_Statistic': stat,
            'P_Value': p_value,
            'Significant': 'Yes' if p_value < 0.05 else 'No',
            'Effect': 'Strong' if p_value < 0.01 else ('Moderate' if p_value < 0.05 else 'Weak')
        })

sig_df = pd.DataFrame(significance_results).sort_values('P_Value')
display(sig_df.style.background_gradient(subset=['P_Value'], cmap='RdYlGn_r'))

print(f"\nSignificant Features (p < 0.05): {sig_df[sig_df['Significant'] == 'Yes'].shape[0]}/{len(NUMS)}")

STATISTICAL SIGNIFICANCE TESTS (Numerical Features vs Target)

Using Mann-Whitney U Test (non-parametric)
H0: The distributions are the same
H1: The distributions are different
Significance Level: α = 0.05



Unnamed: 0,Feature,Test_Statistic,P_Value,Significant,Effect
0,Age,36747944418.5,0.0,Yes,Strong
1,Sex,33821991183.0,0.0,Yes,Strong
2,Chest pain type,22747841890.5,0.0,Yes,Strong
4,Cholesterol,43889513411.0,0.0,Yes,Strong
6,EKG results,38268651767.0,0.0,Yes,Strong
7,Max HR,74204284951.5,0.0,Yes,Strong
8,Exercise angina,29638821411.0,0.0,Yes,Strong
9,ST depression,26143741179.5,0.0,Yes,Strong
10,Slope of ST,28023624776.5,0.0,Yes,Strong
11,Number of vessels fluro,27992132670.5,0.0,Yes,Strong



Significant Features (p < 0.05): 12/13


In [6]:
#Try a BP binning 
def bp_flag(age, sbp):
    """
    age: years (int or float)
    sbp: systolic blood pressure (mm Hg)
    """

    # --- Determine age-based normal SBP ---
    if age < 40:
        normal_upper = 119
    elif age < 60:
        normal_upper = 124
    else:
        normal_upper = 139

    # --- BP category based on systolic ---
    if sbp < normal_upper:
        return 'normal'
    elif sbp < 129:
        return 'elevated'
    elif sbp < 139:
        return 'stage_1'
    elif sbp < 180:
        return 'stage_2'
    else:
        return 'hypertensive_crisis'
    
train['bp_category'] = train.apply(lambda row: bp_flag(row['Age'], row['BP']), axis=1)
#map to numerical
bp_mapping = {
    'normal': 0,
    'elevated': 1,
    'stage_1': 2,
    'stage_2': 3,
    'hypertensive_crisis': 4
}

train['bp_category_num'] = train['bp_category'].map(bp_mapping)

In [7]:
# try again the statistical significance test with the new feature
print("="*80)
print("STATISTICAL SIGNIFICANCE TEST FOR NEW FEATURE 'bp_category_num'")
print("="*80)
print("\nUsing Mann-Whitney U Test (non-parametric)")
print("H0: The distributions are the same")
print("H1: The distributions are different")
print(f"Significance Level: α = 0.05\n")
groups = [train[train[TARGET] == val]['bp_category_num'].values for val in sorted(train[TARGET].unique())]
stat, p_value = stats.mannwhitneyu(groups[0], groups[1], alternative='two-sided')
print(f"Test Statistic: {stat:.4f}, P-Value: {p_value:.4f}")
if p_value < 0.05:
    print("Result: Significant difference between groups (reject H0)")
    if p_value < 0.01:
        effect = 'Strong'
    else:
        effect = 'Moderate'
    print(f"Effect Size: {effect}")
else:
    print("Result: No significant difference between groups (fail to reject H0)")
print("="*80)


STATISTICAL SIGNIFICANCE TEST FOR NEW FEATURE 'bp_category_num'

Using Mann-Whitney U Test (non-parametric)
H0: The distributions are the same
H1: The distributions are different
Significance Level: α = 0.05

Test Statistic: 50121966375.5000, P-Value: 0.0000
Result: Significant difference between groups (reject H0)
Effect Size: Strong


In [8]:
#drop bp
train = train.drop(columns=['BP', 'bp_category'])

In [9]:
# run the skewness and kurtosis analysis again
train_cols = [col for col in train.columns if col not in [ID_COL, TARGET]]
CATS = train[train_cols].select_dtypes(include=['object']).columns.tolist()
NUMS = train[train_cols].select_dtypes(exclude=['object']).columns.tolist()
skew_kurt_df = pd.DataFrame({
    'Feature': NUMS,
    'Skewness': [train[col].skew() for col in NUMS],
    'Kurtosis': [train[col].kurtosis() for col in NUMS],
    'Mean': [train[col].mean() for col in NUMS],
    'Std': [train[col].std() for col in NUMS]
})
skew_kurt_df['Skew_Type'] = skew_kurt_df['Skewness'].apply(
    lambda x: 'Highly Right' if x > 1 else ('Right' if x > 0.5 else ('Symmetric' if abs(x) <= 0.5 else ('Left' if x < -0.5 else 'Highly Left')))
)
skew_kurt_df = skew_kurt_df.sort_values('Skewness', key=abs, ascending=False)
print("="*80)
print("SKEWNESS AND KURTOSIS ANALYSIS AFTER BP FEATURE ENGINEERING")
print("="*80)
display(skew_kurt_df.style.background_gradient(subset=['Skewness', 'Kurtosis'], cmap='coolwarm'))
print("\nInterpretation:")
print("  • Skewness > 1 or < -1: Highly skewed (consider log transform)")
print("  • Skewness between -0.5 and 0.5: Approximately symmetric")
print("  • Kurtosis > 3: Heavy tails (more outliers)")
print("  • Kurtosis < 3: Light tails (fewer outliers)")
print("="*80)

SKEWNESS AND KURTOSIS ANALYSIS AFTER BP FEATURE ENGINEERING


Unnamed: 0,Feature,Skewness,Kurtosis,Mean,Std,Skew_Type
4,FBS over 120,3.096606,7.588996,0.079987,0.271274,Highly Right
10,Number of vessels fluro,1.736669,2.096608,0.45104,0.798549,Highly Right
8,ST depression,1.328429,1.374855,0.716028,0.948472,Highly Right
2,Chest pain type,-1.085939,0.368411,3.312752,0.851615,Left
7,Exercise angina,1.014983,-0.969812,0.273725,0.44587,Highly Right
1,Sex,-0.951123,-1.095368,0.714735,0.451541,Left
6,Max HR,-0.754556,0.203852,152.816763,19.112927,Left
9,Slope of ST,0.630518,-0.736092,1.455871,0.545192,Right
11,Thallium,0.381173,-1.844162,4.618873,1.950007,Symmetric
3,Cholesterol,0.27315,0.068237,245.011814,33.681581,Symmetric



Interpretation:
  • Skewness > 1 or < -1: Highly skewed (consider log transform)
  • Skewness between -0.5 and 0.5: Approximately symmetric
  • Kurtosis > 3: Heavy tails (more outliers)
  • Kurtosis < 3: Light tails (fewer outliers)


In [10]:
train['stress_score'] = (
    train['Exercise angina'].astype(int)
    + (train['ST depression'] > 1).astype(int)
    + (train['Slope of ST'] == 2).astype(int)
)

# run the skewness and kurtosis analysis again with the new feature and statistical test
train_cols = [col for col in train.columns if col not in [ID_COL, TARGET]]
CATS = train[train_cols].select_dtypes(include=['object']).columns.tolist()
NUMS = train[train_cols].select_dtypes(exclude=['object']).columns.tolist()

skew_kurt_df = pd.DataFrame({
    'Feature': NUMS,
    'Skewness': [train[col].skew() for col in NUMS],
    'Kurtosis': [train[col].kurtosis() for col in NUMS],
    'Mean': [train[col].mean() for col in NUMS],
    'Std': [train[col].std() for col in NUMS]
})
skew_kurt_df['Skew_Type'] = skew_kurt_df['Skewness'].apply(
    lambda x: 'Highly Right' if x > 1 else ('Right' if x > 0.5 else ('Symmetric' if abs(x) <= 0.5 else ('Left' if x < -0.5 else 'Highly Left')))
)
skew_kurt_df = skew_kurt_df.sort_values('Skewness', key=abs, ascending=False)
print("="*80)
print("SKEWNESS AND KURTOSIS ANALYSIS AFTER STRESS SCORE FEATURE ENGINEERING")
print("="*80)
display(skew_kurt_df.style.background_gradient(subset=['Skewness', 'Kurtosis'], cmap='coolwarm'))
print("\nInterpretation:")
print("  • Skewness > 1 or < -1: Highly skewed (consider log transform)")
print("  • Skewness between -0.5 and 0.5: Approximately symmetric")
print("  • Kurtosis > 3: Heavy tails (more outliers)")
print("  • Kurtosis < 3: Light tails (fewer outliers)")
print("="*80)
print("STATISTICAL SIGNIFICANCE TEST FOR NEW FEATURE 'stress_score'")
print("="*80)
print("\nUsing Mann-Whitney U Test (non-parametric)")
print("H0: The distributions are the same")
print("H1: The distributions are different")
print(f"Significance Level: α = 0.05\n")
groups = [train[train[TARGET] == val]['stress_score'].values for val in sorted(train[TARGET].unique())]
stat, p_value = stats.mannwhitneyu(groups[0], groups[1], alternative='two-sided')
print(f"Test Statistic: {stat:.4f}, P-Value: {p_value:.4f}")
if p_value < 0.05:
    print("Result: Significant difference between groups (reject H0)")
    if p_value < 0.01:
        effect = 'Strong'
    else:
        effect = 'Moderate'
    print(f"Effect Size: {effect}")
else:
    print("Result: No significant difference between groups (fail to reject H0)")
print("="*80)


SKEWNESS AND KURTOSIS ANALYSIS AFTER STRESS SCORE FEATURE ENGINEERING


Unnamed: 0,Feature,Skewness,Kurtosis,Mean,Std,Skew_Type
4,FBS over 120,3.096606,7.588996,0.079987,0.271274,Highly Right
10,Number of vessels fluro,1.736669,2.096608,0.45104,0.798549,Highly Right
8,ST depression,1.328429,1.374855,0.716028,0.948472,Highly Right
2,Chest pain type,-1.085939,0.368411,3.312752,0.851615,Left
7,Exercise angina,1.014983,-0.969812,0.273725,0.44587,Highly Right
1,Sex,-0.951123,-1.095368,0.714735,0.451541,Left
6,Max HR,-0.754556,0.203852,152.816763,19.112927,Left
9,Slope of ST,0.630518,-0.736092,1.455871,0.545192,Right
13,stress_score,0.613636,-0.876668,0.992684,1.029864,Right
11,Thallium,0.381173,-1.844162,4.618873,1.950007,Symmetric



Interpretation:
  • Skewness > 1 or < -1: Highly skewed (consider log transform)
  • Skewness between -0.5 and 0.5: Approximately symmetric
  • Kurtosis > 3: Heavy tails (more outliers)
  • Kurtosis < 3: Light tails (fewer outliers)
STATISTICAL SIGNIFICANCE TEST FOR NEW FEATURE 'stress_score'

Using Mann-Whitney U Test (non-parametric)
H0: The distributions are the same
H1: The distributions are different
Significance Level: α = 0.05

Test Statistic: 18268384951.0000, P-Value: 0.0000
Result: Significant difference between groups (reject H0)
Effect Size: Strong


In [11]:
# now columns used in stress score drop the raw
train.drop(columns=['Exercise angina','ST depression','Slope of ST'],axis=1,inplace=True)

In [12]:
# checks on VIF of total dataset
from statsmodels.stats.outliers_influence import variance_inflation_factor

# X_train should be a DataFrame (not numpy array)
# Example: X_train = pd.DataFrame(X_train, columns=feature_names)

# 1️⃣ Add constant (important for VIF)
X_vif = train.drop(columns=['Heart Disease','id'],axis=1).copy()
X_vif['const'] = 1

# 2️⃣ Calculate VIF
vif_data = pd.DataFrame()
vif_data['feature'] = X_vif.columns
vif_data['VIF'] = [
    variance_inflation_factor(X_vif.values, i)
    for i in range(X_vif.shape[1])
]

# 3️⃣ Remove constant row & sort
vif_data = vif_data[vif_data['feature'] != 'const']
vif_data = vif_data.sort_values(by='VIF', ascending=False)

print(vif_data)

                    feature       VIF
10             stress_score  1.418755
8                  Thallium  1.417016
7   Number of vessels fluro  1.229944
2           Chest pain type  1.198967
6                    Max HR  1.177682
1                       Sex  1.122575
0                       Age  1.058514
5               EKG results  1.054602
9           bp_category_num  1.022215
3               Cholesterol  1.006699
4              FBS over 120  1.002177


In [13]:
train.columns

Index(['id', 'Age', 'Sex', 'Chest pain type', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Number of vessels fluro', 'Thallium',
       'Heart Disease', 'bp_category_num', 'stress_score'],
      dtype='object')

# Now build A 5 fold cross validation using Optuna XGBOOST

In [14]:
#build a 5 fold cross validation model using ooptuna xgboost
import optuna
import numpy as np
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [15]:
train['Heart Disease'] = train['Heart Disease'].map({'Absence': 0, 'Presence': 1})

In [16]:
# make a train test split
from sklearn.model_selection import train_test_split

X = train.drop(columns=['Heart Disease','id'])   # replace 'target' with your label column
y = train['Heart Disease']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (504000, 11)
Test shape: (126000, 11)


In [21]:
!pip install -U xgboost --user





In [25]:
import optuna
import numpy as np
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score


def objective(trial):

    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",

        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),

        # 🔥 GPU SETTINGS
        "tree_method": "gpu_hist",
        "predictor": "gpu_predictor",
        # "max_bin": 256,

        "random_state": 42
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in skf.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = xgb.XGBClassifier(**params,early_stopping_rounds=50)

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            # early_stopping_rounds=50,
            verbose=False
        )

        preds = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, preds)
        auc_scores.append(auc)

    return np.mean(auc_scores)


# ---------------- RUN OPTUNA ---------------- #
import optuna
optuna.logging.set_verbosity(optuna.logging.INFO)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best CV AUC:", study.best_value)
print("Best Params:", study.best_params)

[I 2026-02-01 16:58:54,431] A new study created in memory with name: no-name-fa8c2f66-cdd8-4564-b1ad-ec83a1f1dac2

    E.g. tree_method = "hist", device = "cuda"

  ----------
Parameters: { "predictor" } are not used.

  ----------

    E.g. tree_method = "hist", device = "cuda"

  ----------
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  ----------

    E.g. tree_method = "hist", device = "cuda"

  ----------
Parameters: { "predictor" } are not used.

  ----------

    E.g. tree_method = "hist", device = "cuda"

  ----------

    E.g. tree_method = "hist", device = "cuda"

  ----------
Parameters: { "predictor" } are not used.

  ----------

    E.g. tree_method = "hist", device = "cuda"

  ----------

    E.g. tree_method = "hist", device = "cuda"

  ----------
Parameters: { "predictor" } are not used.

  ----------

    E.g. tree_method = "hist", device = "cuda"

  ---------

Best CV AUC: 0.9541019212653428
Best Params: {'n_estimators': 848, 'learning_rate': 0.07146382172480288, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.9529260156040184, 'colsample_bytree': 0.6226559354348953, 'gamma': 0.5873845515983713, 'reg_alpha': 0.6978543881450656, 'reg_lambda': 0.1406103727086555}


In [None]:
# Best Params: {'n_estimators': 848, 'learning_rate': 0.07146382172480288, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.9529260156040184, 'colsample_bytree': 0.6226559354348953, 'gamma': 0.5873845515983713, 'reg_alpha': 0.6978543881450656, 'reg_lambda': 0.1406103727086555}

In [26]:
best_params = study.best_params

final_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="auc",
    random_state=42,
    **best_params
)

final_model.fit(X,y)

In [27]:
#prepare test data
testing=pd.read_csv("test.csv")

In [28]:
#Try a BP binning 
def bp_flag(age, sbp):
    """
    age: years (int or float)
    sbp: systolic blood pressure (mm Hg)
    """

    # --- Determine age-based normal SBP ---
    if age < 40:
        normal_upper = 119
    elif age < 60:
        normal_upper = 124
    else:
        normal_upper = 139

    # --- BP category based on systolic ---
    if sbp < normal_upper:
        return 'normal'
    elif sbp < 129:
        return 'elevated'
    elif sbp < 139:
        return 'stage_1'
    elif sbp < 180:
        return 'stage_2'
    else:
        return 'hypertensive_crisis'
    
testing['bp_category'] = testing.apply(lambda row: bp_flag(row['Age'], row['BP']), axis=1)
#map to numerical
bp_mapping = {
    'normal': 0,
    'elevated': 1,
    'stage_1': 2,
    'stage_2': 3,
    'hypertensive_crisis': 4
}

testing['bp_category_num'] = testing['bp_category'].map(bp_mapping)


testing['stress_score'] = (
    testing['Exercise angina'].astype(int)
    + (testing['ST depression'] > 1).astype(int)
    + (testing['Slope of ST'] == 2).astype(int)
)

testing.drop(columns=['BP','Exercise angina','ST depression','Slope of ST','bp_category'],axis=1,inplace=True)

In [31]:
predictions=final_model.predict(testing.drop(columns='id'))

In [34]:
d={'id': testing['id'], 'Heart Disease': predictions}
submission=pd.DataFrame(data=d)
submission.to_csv("submission.csv",index=False)

In [36]:
from scipy.stats import ks_2samp

for col in X.columns:
    stat, p = ks_2samp(X[col], testing[col])
    if p < 0.05:
        print(f"Drift detected in {col}")
    else:
        print("all fine")

all fine
all fine
all fine
all fine
all fine
all fine
all fine
all fine
all fine
all fine
all fine


In [None]:
import numpy as np
import optuna
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# ---------------- CONFIG ---------------- #
N_OUTER_FOLDS = 3
N_INNER_FOLDS = 2
N_TRIALS = 50
RANDOM_STATE = 42

optuna.logging.set_verbosity(optuna.logging.WARNING)

# ---------------- OOF STORAGE ---------------- #
oof_preds = np.zeros(len(X_train))

outer_cv = StratifiedKFold(
    n_splits=N_OUTER_FOLDS,
    shuffle=True,
    random_state=RANDOM_STATE
)

# ---------------- OUTER CV ---------------- #
for outer_fold, (train_idx, val_idx) in enumerate(
    outer_cv.split(X_train, y_train)
):
    print(f"\n🚀 Outer Fold {outer_fold + 1}/{N_OUTER_FOLDS}")

    X_tr_outer = X_train.iloc[train_idx]
    y_tr_outer = y_train.iloc[train_idx]
    X_val_outer = X_train.iloc[val_idx]
    y_val_outer = y_train.iloc[val_idx]

    # -------- OPTUNA OBJECTIVE (INNER CV) -------- #
    def objective(trial):

        params = {
            "objective": "binary:logistic",
            "eval_metric": "auc",

            "n_estimators": trial.suggest_int("n_estimators", 200, 800),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 6),
            "min_child_weight": trial.suggest_int("min_child_weight", 3, 10),
            "subsample": trial.suggest_float("subsample", 0.7, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
            "reg_lambda": trial.suggest_float("reg_lambda", 1, 10),

            # GPU
            "tree_method": "hist",
            "device": "cuda",
            # "max_bin": 256,

            # XGBoost 2.x early stopping
            "early_stopping_rounds": 50,
            "random_state": RANDOM_STATE
        }

        inner_cv = StratifiedKFold(
            n_splits=N_INNER_FOLDS,
            shuffle=True,
            random_state=RANDOM_STATE
        )

        aucs = []

        for tr_idx, va_idx in inner_cv.split(X_tr_outer, y_tr_outer):
            X_tr = X_tr_outer.iloc[tr_idx]
            y_tr = y_tr_outer.iloc[tr_idx]
            X_va = X_tr_outer.iloc[va_idx]
            y_va = y_tr_outer.iloc[va_idx]

            model = xgb.XGBClassifier(**params)

            model.fit(
                X_tr,
                y_tr,
                eval_set=[(X_va, y_va)],
                verbose=False
            )

            # dval = xgb.DMatrix(X_va)
            # preds = model.predict(dval)
            preds = model.predict_proba(X_va)[:, 1]
            aucs.append(roc_auc_score(y_va, preds))

        return np.mean(aucs)

    # -------- RUN OPTUNA -------- #
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=N_TRIALS)

    best_params = study.best_params

    # -------- TRAIN FINAL MODEL ON OUTER TRAIN -------- #
    final_model = xgb.XGBClassifier(
        **best_params,
        objective="binary:logistic",
        eval_metric="auc",
        tree_method="hist",
        device="cuda",
        # max_bin=256,
        random_state=RANDOM_STATE
    )

    final_model.fit(X_tr_outer, y_tr_outer)

    # -------- OOF PREDICTIONS -------- #
    # dval_outer = xgb.DMatrix(X_val_outer)
    # oof_preds[val_idx] = final_model.predict(dval_outer)
    oof_preds[val_idx] = final_model.predict_proba(X_val_outer)[:, 1]

# ---------------- FINAL HONEST AUC ---------------- #
oof_auc = roc_auc_score(y_train, oof_preds)
print(f"\n🎯 FINAL OOF AUC (HONEST): {oof_auc:.4f}")



🚀 Outer Fold 1/3

🚀 Outer Fold 2/3
