In [16]:
import numpy as np

import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb


In [9]:
def train_breast_cancer(config):
    data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)

    train_x, test_x, train_y, test_y = train_test_split(
        data, labels, test_size=0.25
    )

    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)

    results = {}

    bst = xgb.train(
        params=config,
        dtrain=train_set,
        evals=[(test_set, 'evaluation1')],
        evals_result = results,
        verbose_eval=False,
    )

    return results

In [18]:
config = {
    'objective': 'binary:logistic',
    'eval_metric':['logloss', 'error']
}

results = train_breast_cancer(config=config)
accuracy = 1.0-np.mean(results['evaluation1']['error'])
logloss = np.mean(results['evaluation1']['logloss'])
print(f'Accuracy: {accuracy:.4f} \nLog Loss: {logloss:.4f}')

Accuracy: 0.9727 
Log Loss: 0.1844


In [None]:
config = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
    "max_depth": 2,
    "min_child_weight": 0,
    "subsample": 0.8,
    "eta": 0.2,
}

results = train_breast_cancer(config=config)
accuracy = 1.0-np.mean(results['evaluation1']['error'])
logloss = np.mean(results['evaluation1']['logloss'])
print(f'Accuracy: {accuracy:.4f} \nLog Loss: {logloss:.4f}')

Accuracy: 0.8769 
Log Loss: 0.3345


### Tuning the config parameters

In [34]:
from ray import train, tune

def train_breast_cancer(config):
    data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)

    train_x, test_x, train_y, test_y = train_test_split(
        data, labels, test_size=0.25
    )

    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)

    results = {}

    bst = xgb.train(
        params=config,
        dtrain=train_set,
        evals=[(test_set, 'eval')],
        evals_result = results,
        verbose_eval=False,
    )

    accuracy = 1.0 - results["eval"]["error"][-1]
    train.report({"mean_accuracy": accuracy, "done": True})


config = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
    "max_depth": tune.randint(1, 9),
    "min_child_weight": tune.choice([1, 2, 3]),
    "subsample": tune.uniform(0.5, 1.0),
    "eta": tune.loguniform(1e-4, 1e-1),
}
tuner = tune.Tuner(
    train_breast_cancer,
    tune_config=tune.TuneConfig(metric='mean_accuracy', mode='max',num_samples=10),
    param_space=config,
)
results = tuner.fit()

0,1
Current time:,2024-11-12 17:06:37
Running for:,00:00:13.89
Memory:,11.5/15.9 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,acc,iter,total time (s)
train_breast_cancer_63aa5_00000,TERMINATED,127.0.0.1:24944,0.000107757,3,1,0.547448,0.699301,1,0.379058
train_breast_cancer_63aa5_00001,TERMINATED,127.0.0.1:31076,0.029087,8,2,0.759357,0.902098,1,0.393061
train_breast_cancer_63aa5_00002,TERMINATED,127.0.0.1:31248,0.000640161,7,1,0.805893,0.587413,1,0.417596
train_breast_cancer_63aa5_00003,TERMINATED,127.0.0.1:19420,0.0916354,6,3,0.863201,0.895105,1,0.596731
train_breast_cancer_63aa5_00004,TERMINATED,127.0.0.1:33416,0.00200605,4,1,0.635122,0.664336,1,0.598727
train_breast_cancer_63aa5_00005,TERMINATED,127.0.0.1:28984,0.000150808,7,3,0.543363,0.622378,1,0.501192
train_breast_cancer_63aa5_00006,TERMINATED,127.0.0.1:7528,0.012959,7,1,0.709664,0.615385,1,0.547197
train_breast_cancer_63aa5_00007,TERMINATED,127.0.0.1:14328,0.000396026,1,3,0.848474,0.629371,1,0.307657
train_breast_cancer_63aa5_00008,TERMINATED,127.0.0.1:23792,0.0475093,2,1,0.657089,0.944056,1,0.60272
train_breast_cancer_63aa5_00009,TERMINATED,127.0.0.1:15556,0.0837664,2,3,0.901574,0.965035,1,0.578726


2024-11-12 17:06:37,133	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/JCA/ray_results/train_breast_cancer_2024-11-12_17-06-23' in 0.0605s.
2024-11-12 17:06:37,177	INFO tune.py:1041 -- Total run time: 13.95 seconds (13.83 seconds for the tuning loop).


In [43]:
results.get_best_result().metrics_dataframe.T

Unnamed: 0,0
mean_accuracy,0.965035
done,True
timestamp,1731402396
checkpoint_dir_name,
training_iteration,1
trial_id,63aa5_00009
date,2024-11-12_17-06-36
time_this_iter_s,0.578726
time_total_s,0.578726
pid,15556


### Early Stopping

In [44]:
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.xgboost import TuneReportCheckpointCallback

In [47]:
def train_breast_cancer(config: dict):
    data, labels = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, test_x, train_y, test_y = train_test_split(data, labels, test_size=0.25)

    # Build input matrices for XGBoost
    train_set = xgb.DMatrix(train_x, label=train_y)
    test_set = xgb.DMatrix(test_x, label=test_y)

    # Train the classifier, using the Tune callback
    xgb.train(
        config,
        train_set,
        evals=[(test_set, "eval")],
        verbose_eval=False,
        # `TuneReportCheckpointCallback` defines the checkpointing frequency and format.
        callbacks=[TuneReportCheckpointCallback(frequency=1)],
    )

def get_best_model_checkpoint(results):
    best_result = results.get_best_result()

    # `TuneReportCheckpointCallback` provides a helper method to retrieve the
    # model from a checkpoint.
    best_bst = TuneReportCheckpointCallback.get_model(best_result.checkpoint)

    accuracy = 1.0 - best_result.metrics["eval-error"]
    print(f"Best model parameters: {best_result.config}")
    print(f"Best model total accuracy: {accuracy:.4f}")
    return best_bst

def tune_xgboost(smoke_test=False):
    search_space = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
    }

    # This will enable aggressive early stopping of bad trials.
    scheduler = ASHAScheduler(
        max_t = 10, grace_period=1, reduction_factor=2 # 10 training iterations
    )

    tuner = tune.Tuner(
        train_breast_cancer,
        tune_config=tune.TuneConfig(
            metric="eval-logloss",
            mode="min",
            scheduler=scheduler,
            num_samples=1 if smoke_test else 10,
        ),
        param_space=search_space,
    )
    results = tuner.fit()
    return results


results = tune_xgboost(smoke_test=False)

0,1
Current time:,2024-11-12 17:32:48
Running for:,00:00:14.48
Memory:,12.2/15.9 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,iter,total time (s),eval-logloss,eval-error
train_breast_cancer_0c084_00000,TERMINATED,127.0.0.1:34532,0.0114616,7,3,0.708841,1,0.821629,0.647297,0.363636
train_breast_cancer_0c084_00001,TERMINATED,127.0.0.1:32964,0.0310194,1,3,0.808823,10,0.730588,0.492223,0.0699301
train_breast_cancer_0c084_00002,TERMINATED,127.0.0.1:35180,0.0383386,7,3,0.630119,10,0.873568,0.44548,0.0839161
train_breast_cancer_0c084_00003,TERMINATED,127.0.0.1:34828,0.000132227,8,3,0.976705,1,0.841491,0.646485,0.342657
train_breast_cancer_0c084_00004,TERMINATED,127.0.0.1:8204,0.0564376,6,2,0.935609,1,0.721515,0.682971,0.482517
train_breast_cancer_0c084_00005,TERMINATED,127.0.0.1:34780,0.00725873,1,2,0.873124,2,0.872263,0.625497,0.307692
train_breast_cancer_0c084_00006,TERMINATED,127.0.0.1:35260,0.0200104,8,1,0.54226,1,0.838391,0.65475,0.391608
train_breast_cancer_0c084_00007,TERMINATED,127.0.0.1:35196,0.00138105,4,3,0.507711,1,0.818888,0.645574,0.342657
train_breast_cancer_0c084_00008,TERMINATED,127.0.0.1:35152,0.000124526,3,1,0.988291,1,0.816899,0.701805,0.440559
train_breast_cancer_0c084_00009,TERMINATED,127.0.0.1:35436,0.000148146,2,2,0.77485,1,0.811785,0.662804,0.377622


2024-11-12 17:32:48,690	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/JCA/ray_results/train_breast_cancer_2024-11-12_17-32-34' in 0.0981s.
2024-11-12 17:32:48,752	INFO tune.py:1041 -- Total run time: 14.56 seconds (14.38 seconds for the tuning loop).


In [49]:
# Load the best model checkpoint.
best_bst = get_best_model_checkpoint(results)

Best model parameters: {'objective': 'binary:logistic', 'eval_metric': ['logloss', 'error'], 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.6301194488488465, 'eta': 0.038338617965135324}
Best model total accuracy: 0.9161


In [50]:
# best_bst.predict(...)

### Using fractional GPUs

In [None]:
def tune_xgboost_rsc(smoke_test=False):
    search_space = {
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
    }

    # This will enable aggressive early stopping of bad trials.
    scheduler = ASHAScheduler(
        max_t = 10, grace_period=1, reduction_factor=2 # 10 training iterations
    )

    tuner = tune.Tuner(

        # If you run 20 trials, each task will use 5% of the available GPU memory
        tune.with_resources(train_breast_cancer,
                            resources={'cpu':1, 'gpu':0.05}),
        tune_config=tune.TuneConfig(
            metric="eval-logloss",
            mode="min",
            scheduler=scheduler,
            num_samples=1 if smoke_test else 20,
        ),
        param_space=search_space,
    )
    results = tuner.fit()
    return results


results_gpu_20 = tune_xgboost_rsc(smoke_test=False)

0,1
Current time:,2024-11-12 17:38:36
Running for:,00:00:29.38
Memory:,11.3/15.9 GiB

Trial name,status,loc,eta,max_depth,min_child_weight,subsample,iter,total time (s),eval-logloss,eval-error
train_breast_cancer_d23bd_00000,TERMINATED,127.0.0.1:34864,0.0453978,5,3,0.723825,10,0.589742,0.402169,0.0629371
train_breast_cancer_d23bd_00001,TERMINATED,127.0.0.1:32884,0.0268493,1,1,0.606387,1,0.57024,0.632694,0.34965
train_breast_cancer_d23bd_00002,TERMINATED,127.0.0.1:27468,0.000134706,8,2,0.700534,1,0.625582,0.666551,0.384615
train_breast_cancer_d23bd_00003,TERMINATED,127.0.0.1:14692,0.000174341,1,1,0.989658,1,0.391127,0.64648,0.342657
train_breast_cancer_d23bd_00004,TERMINATED,127.0.0.1:28472,0.000137394,6,3,0.515765,1,0.664687,0.64367,0.335664
train_breast_cancer_d23bd_00005,TERMINATED,127.0.0.1:11312,0.000276525,8,2,0.502781,1,0.672237,0.649345,0.34965
train_breast_cancer_d23bd_00006,TERMINATED,127.0.0.1:28540,0.0852813,5,3,0.954552,10,0.795226,0.307383,0.048951
train_breast_cancer_d23bd_00007,TERMINATED,127.0.0.1:4020,0.0539766,8,1,0.593694,2,0.638414,0.580475,0.356643
train_breast_cancer_d23bd_00008,TERMINATED,127.0.0.1:15028,0.000187583,3,2,0.729,1,0.829602,0.646438,0.342657
train_breast_cancer_d23bd_00009,TERMINATED,127.0.0.1:26336,0.00417489,5,3,0.741505,1,0.63085,0.652858,0.363636


2024-11-12 17:38:36,125	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/JCA/ray_results/train_breast_cancer_2024-11-12_17-38-06' in 0.0784s.
2024-11-12 17:38:36,166	INFO tune.py:1041 -- Total run time: 29.45 seconds (29.30 seconds for the tuning loop).


In [53]:
# Load the best model checkpoint.
best_bst = get_best_model_checkpoint(results_gpu_20)

Best model parameters: {'objective': 'binary:logistic', 'eval_metric': ['logloss', 'error'], 'max_depth': 5, 'min_child_weight': 3, 'subsample': 0.9545523772371498, 'eta': 0.08528134780427145}
Best model total accuracy: 0.9510
