In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, roc_auc_score, confusion_matrix, log_loss
)
import xgboost as xgb
from xgboost import XGBClassifier
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval 
from hyperopt.early_stop import no_progress_loss
import os
import pickle
import shap

In [3]:
train_df = pd.read_csv("../../Data/BRFSS_2024_model_ready_train.csv")
test_df  = pd.read_csv("../../Data/BRFSS_2024_model_ready_test.csv")

X_train = train_df.drop('DIABETE4', axis=1)
y_train = train_df['DIABETE4'].astype(int)

X_test = test_df.drop('DIABETE4', axis=1)
y_test = test_df['DIABETE4'].astype(int)

In [4]:
# Since XBG expects labels starting from 0, we need to map our labels accordingly
label_map = {1: 0, 3: 1, 4: 2}

y_train_enc = y_train.map(label_map)
y_test_enc  = y_test.map(label_map)

In [5]:
build_info = xgb.build_info()
use_cuda = build_info.get("USE_CUDA", False)
device = "cuda" if use_cuda else "cpu"

xgb_clf = XGBClassifier(
    device = device,
    tree_method='approx',
    booster='gbtree',
    objective='multi:softmax',
    sampling_method='gradient_based',
    eval_metric='mlogloss',
    num_class=3,
    random_state=42,
    validate_parameters=True
)
xgb_clf.fit(X_train, y_train_enc)

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,'gbtree'
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,'cuda'
,early_stopping_rounds,
,enable_categorical,False


In [6]:
y_pred_baseline = xgb_clf.predict(X_test)
y_proba_baseline = xgb_clf.predict_proba(X_test)

print("BASELINE MODEL PERFORMANCE")
print(f"Accuracy: {accuracy_score(y_test_enc, y_pred_baseline):.4f}")
print(f"Precision (macro): {precision_score(y_test_enc, y_pred_baseline, average='macro', zero_division=0):.4f}")
print(f"Recall (macro): {recall_score(y_test_enc, y_pred_baseline, average='macro', zero_division=0):.4f}")
print(f"F1 Score (macro): {f1_score(y_test_enc, y_pred_baseline, average='macro', zero_division=0):.4f}")
print(f"Log Loss: {log_loss(y_test_enc, y_proba_baseline):.4f}")

print("\nClassification Report:\n", classification_report(y_test_enc, y_pred_baseline, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_enc, y_pred_baseline))

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


BASELINE MODEL PERFORMANCE
Accuracy: 0.8365
Precision (macro): 0.4635
Recall (macro): 0.3964
F1 Score (macro): 0.4068
Log Loss: 0.4306

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.22      0.31     13162
           1       0.86      0.97      0.91     75226
           2       0.00      0.00      0.00      2261

    accuracy                           0.84     90649
   macro avg       0.46      0.40      0.41     90649
weighted avg       0.79      0.84      0.80     90649


Confusion Matrix:
 [[ 2890 10271     1]
 [ 2287 72938     1]
 [  226  2035     0]]


In [7]:
search_space = {
    'max_depth': hp.choice("max_depth", [4, 6, 8, 10, 12]),
    'min_child_weight' : hp.choice("min_child_weight", np.arange(10,100,10,dtype='int')),
    'max_leaves': hp.choice("max_leaves", [0, 16, 32, 64, 128, 256]),
    'max_bin' : hp.choice("max_bin", [256,512]),
    'eta'      : hp.uniform("eta", 0, 1),
    'n_estimators': hp.choice("n_estimators", np.arange(1000,5000,1000,dtype='int')),
    'max_delta_step' : hp.choice("max_delta_step", [0, 1, 2, 5, 10]),
    'subsample' : hp.uniform("subsample",0.5,1),
    'colsample_bytree': hp.uniform("colsample_bytree", 0.5,1),
    'colsample_bynode': hp.uniform("colsample_bynode", 0.5,1), 
    'colsample_bylevel': hp.uniform("colsample_bylevel", 0.5,1),
    'gamma'    : hp.uniform("gamma", 0, 10e1),
    'reg_alpha': hp.uniform("reg_alpha", 10e-7, 10),
    'reg_lambda' : hp.uniform("reg_lambda", 0,1),
    'early_stopping_rounds' : hp.choice("early_stopping_rounds", [50,100,200]),
    'grow_policy' : hp.choice("grow_policy", ['depthwise','lossguide']),
    'importance_type' : hp.choice("importance_type", ['gain','weight','cover','total_gain','total_cover']),
    'objective' : 'multi:softmax',
    'eval_metric' : 'logloss',
    'seed' : 42,
    'device' : device,
    'tree_method' : 'approx',
    'booster' : 'gbtree',
    'objective' : 'multi:softmax',
    'sampling_method' : 'gradient_based',
    'eval_metric' : 'mlogloss',
    'num_class' : 3,
    'validate_parameters' : True
}

dtrain_clf = xgb.DMatrix(X_train, y_train_enc, enable_categorical = True)

def xgb_objective(space):
  results = xgb.cv(space, 
                   dtrain=dtrain_clf,
                   num_boost_round=500,
                   nfold=5, 
                   stratified=True,  
                   early_stopping_rounds=20,
                   metrics = ['mlogloss','auc','aucpr','merror'])
  
  best_score = results['test-auc-mean'].max()
  return {'loss':-best_score, 'status': STATUS_OK}

trials = Trials()
best_hyperparams = fmin(fn=xgb_objective, space=search_space,algo=tpe.suggest,max_evals=500,trials=trials, return_argmin=False, early_stop_fn=no_progress_loss(10))
best_params = best_hyperparams.copy()

if 'eval_metric' in best_params:
  best_params = {key:best_params[key] for key in best_params if key!='eval_metric'}

print("The best hyperparameters are : ")
print(best_params)

  0%|          | 0/500 [00:00<?, ?trial/s, best loss=?]

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)



  0%|          | 1/500 [11:06<92:23:14, 666.52s/trial, best loss: -0.9480763389074529]

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)



  0%|          | 2/500 [23:20<97:41:24, 706.19s/trial, best loss: -0.9480763389074529]

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)



  1%|          | 3/500 [43:20<128:37:55, 931.74s/trial, best loss: -0.9480763389074529]

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)



  1%|          | 4/500 [1:01:14<136:05:02, 987.71s/trial, best loss: -0.9480763389074529]

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)



  1%|          | 5/500 [1:26:00<160:32:17, 1167.55s/trial, best loss: -0.9480763389074529]

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)



  1%|          | 6/500 [1:42:49<152:47:51, 1113.50s/trial, best loss: -0.9480763389074529]

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)



  1%|▏         | 7/500 [3:05:45<325:26:04, 2376.40s/trial, best loss: -0.9480763389074529]

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)



  2%|▏         | 8/500 [3:26:38<275:54:33, 2018.85s/trial, best loss: -0.9480763389074529]

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)



  2%|▏         | 9/500 [4:59:54<427:51:27, 3137.04s/trial, best loss: -0.9480763389074529]

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  return getattr(self.bst, name)(*args, **kwargs)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)

Parameters: { "early_stopping_rounds", "importance_type", "n_estimators" } are not used.

  self.bst.update(self.dtrain, iteration, fobj)



  2%|▏         | 10/500 [5:20:11<261:29:30, 1921.16s/trial, best loss: -0.9480763389074529]
The best hyperparameters are : 
{'booster': 'gbtree', 'colsample_bylevel': 0.6900324009050225, 'colsample_bynode': 0.6977498035097837, 'colsample_bytree': 0.9012682618091961, 'device': 'cuda', 'early_stopping_rounds': 100, 'eta': 0.8625219715868753, 'gamma': 5.558621313104995, 'grow_policy': 'depthwise', 'importance_type': 'total_gain', 'max_bin': 512, 'max_delta_step': 10, 'max_depth': 8, 'max_leaves': 128, 'min_child_weight': np.int64(20), 'n_estimators': np.int64(3000), 'num_class': 3, 'objective': 'multi:softmax', 'reg_alpha': 5.93424404936359, 'reg_lambda': 0.36640514509408595, 'sampling_method': 'gradient_based', 'seed': 42, 'subsample': 0.9306953803882572, 'tree_method': 'approx', 'validate_parameters': True}


In [8]:
del best_params['early_stopping_rounds']

xgb_best = XGBClassifier(**best_params)
xgb_best.fit(X_train, y_train_enc)

y_pred_best = xgb_best.predict(X_test)
y_proba_best = xgb_best.predict_proba(X_test)

print("BEST MODEL PERFORMANCE")
print(f"Accuracy: {accuracy_score(y_test_enc, y_pred_best):.4f}")
print(f"Precision (macro): {precision_score(y_test_enc, y_pred_best, average='macro', zero_division=0):.4f}")
print(f"Recall (macro): {recall_score(y_test_enc, y_pred_best, average='macro', zero_division=0):.4f}")
print(f"F1 Score (macro): {f1_score(y_test_enc, y_pred_best, average='macro', zero_division=0):.4f}")
print(f"Log Loss: {log_loss(y_test_enc, y_proba_best):.4f}")

print("\nClassification Report:\n", classification_report(y_test_enc, y_pred_best, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_enc, y_pred_best))

BEST MODEL PERFORMANCE
Accuracy: 0.8320
Precision (macro): 0.4879
Recall (macro): 0.3995
F1 Score (macro): 0.4096
Log Loss: 0.4458

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.24      0.32     13162
           1       0.86      0.96      0.91     75226
           2       0.11      0.00      0.00      2261

    accuracy                           0.83     90649
   macro avg       0.49      0.40      0.41     90649
weighted avg       0.79      0.83      0.80     90649


Confusion Matrix:
 [[ 3108 10045     9]
 [ 2906 72305    15]
 [  258  2000     3]]


In [9]:
# SAVE PICKLE BUNDLE FOR XGBOOST MODEL (xgb_best)

# 1. Core predictions & confusion matrix
# Uses tuned XGBoost model: xgb_best
# and encoded labels: y_test_enc

y_pred = y_pred_best          # from the notebook
y_proba = y_proba_best        # from the notebook
y_test = y_test_enc           # use encoded labels (0,1,2)

cm = confusion_matrix(y_test, y_pred)


# 2. Feature names and X_test sample for explorer
if hasattr(X_train, "columns"):
    feature_names = X_train.columns.to_numpy()
else:
    feature_names = np.array([f"feature_{i}" for i in range(X_train.shape[1])])

# sample of X_test for interactive explorer & SHAP
if hasattr(X_test, "iloc"):
    X_test_sample = X_test.iloc[:2000]
else:
    X_test_sample = X_test[:2000]


# 3. Gain-based feature importance (XGBoost default = 'gain')
xgb_gain_importance = xgb_best.feature_importances_   # 1D array, per feature


# 4. SHAP values for summary plot
# Use TreeExplainer on a subset of X_test to keep size manageable
try:
    explainer = shap.TreeExplainer(xgb_best)
    X_shap = X_test_sample
    xgb_shap_values = explainer.shap_values(X_shap)
    xgb_shap_expected_value = explainer.expected_value
except Exception as e:
    print("Warning: SHAP computation failed, storing None. Error:", e)
    X_shap = None
    xgb_shap_values = None
    xgb_shap_expected_value = None


# 5. Build bundle dictionary with everything we need later
bundle = {
    "model_name": "XGBoost (tuned)",

    # Core evaluation arrays (for common visualizations) 
    "y_test": y_test,
    "y_pred": y_pred,
    "y_proba": y_proba,
    "confusion_matrix": cm,

    # Scalar performance metrics (for comparison plots)
    "accuracy": accuracy_score(y_test, y_pred),
    "precision_macro": precision_score(y_test, y_pred,
                                      average="macro", zero_division=0),
    "recall_macro": recall_score(y_test, y_pred,
                                 average="macro", zero_division=0),
    "f1_macro": f1_score(y_test, y_pred,
                         average="macro", zero_division=0),
    "log_loss": log_loss(y_test, y_proba),
    "roc_auc_ovr": roc_auc_score(y_test, y_proba, multi_class="ovr"),

    # Hyperparameters
    "params": xgb_best.get_params(),

    # XGBoost-specific: gain-based feature importance
    "xgb_feature_importance_gain": xgb_gain_importance,
    "xgb_feature_names": feature_names,

    # SHAP summary support
    "xgb_shap_values": xgb_shap_values,
    "xgb_shap_expected_value": xgb_shap_expected_value,
    "xgb_shap_X": X_shap,

    # Common keys for interactive feature explorer
    "feature_names": feature_names,
    "X_test_sample": X_test_sample,

    # Optional: store the trained model itself
    "xgb_best_model": xgb_best,
}


# 6. Save bundle to ../../Results/Visualizations
save_path = "../../Results/Visualizations"
os.makedirs(save_path, exist_ok=True)

bundle_filename = os.path.join(save_path, "xgb_bundle.pkl")

with open(bundle_filename, "wb") as f:
    pickle.dump(bundle, f)

print(f"\nXGBoost pickle bundle saved to:\n{bundle_filename}")
print("Bundle keys:", list(bundle.keys()))



XGBoost pickle bundle saved to:
../../Results/Visualizations\xgb_bundle.pkl
Bundle keys: ['model_name', 'y_test', 'y_pred', 'y_proba', 'confusion_matrix', 'accuracy', 'precision_macro', 'recall_macro', 'f1_macro', 'log_loss', 'roc_auc_ovr', 'params', 'xgb_feature_importance_gain', 'xgb_feature_names', 'xgb_shap_values', 'xgb_shap_expected_value', 'xgb_shap_X', 'feature_names', 'X_test_sample', 'xgb_best_model']
