In [None]:
from sklearn import metrics
from scipy.stats import ks_2samp
import wandb
from wandb.xgboost import WandbCallback
import os
# Publish the Weights & Biases API key through the environment variable the
# wandb client reads; the key itself comes from `config`, defined outside this cell.
os.environ.update({"WANDB_API_KEY": config['WANDB_API_KEY']})

# Set XGBoost's global verbosity level to 0.
xgboost.set_config(verbosity=0)

In [None]:
# Grid-search space for the W&B sweep: the value lists below span
# 3 x 3 x 4 x 2 = 72 hyper-parameter combinations.
sweep_config = dict(
    method="grid",
    parameters=dict(
        learning_rate=dict(values=[0.001, 0.005, 0.01]),
        early_stopping_rounds=dict(values=[1000, 2000, 4000]),
        # Fraction of observations randomly sampled for each tree.
        subsample=dict(values=[0.5, 0.7, 0.8, 1.0]),
        # Controls over-fitting: a higher depth lets the model learn relations
        # that are very specific to a particular sample.
        max_depth=dict(values=[4, 6]),
    ),
)

# Register the sweep and keep its id so an agent can be attached to it later.
sweep_id = wandb.sweep(sweep_config, project='thesis', entity='sanabasharat')

def _binary_log_loss(y_true, y_prob, eps=1e-15):
  """Return the mean binary cross-entropy of `y_prob` against 0/1 labels `y_true`.

  Probabilities are promoted to float64 (so the clip bounds stay representable
  even if the predictions arrive in a narrower float type) and clipped to
  [eps, 1 - eps]. Without the clip, a prediction of exactly 0.0 or 1.0 yields
  log(0) = -inf and the whole metric becomes NaN/inf.
  """
  labels = np.asarray(y_true, dtype=np.float64)
  probs = np.clip(np.asarray(y_prob, dtype=np.float64), eps, 1.0 - eps)
  return -np.mean(labels * np.log(probs) + (1.0 - labels) * np.log(1.0 - probs))


def train():
  """Run one W&B sweep trial: split the data, fit an XGBoost classifier, log metrics.

  Hyper-parameters `learning_rate`, `early_stopping_rounds`, `subsample` and
  `max_depth` are read from `run.config`; everything else is fixed below.
  Reads the notebook globals `data` and `COLUMNS_TRAINING`, and writes train and
  validation metrics to `run.summary`. Returns nothing.

  NOTE(review): the second `eval_set` entry is (x_test, y_test), so that is the
  set the fit is evaluated on during training, while (x_val, y_val) is only
  used afterwards for the reported `val_*` metrics. Confirm this assignment of
  roles is intended.
  """
  with wandb.init(job_type="sweep") as run:
    # First split: 75 % train+validation / 25 % test.
    # Split taken from containers_build\boostdm\training.py, line 44.
    x_train, x_test, y_train, y_test = train_test_split(
        data[COLUMNS_TRAINING], data['driver'],
        random_state=104,
        test_size=0.25,
        shuffle=True)
    # Second split: 25 % of the remaining 75 % becomes validation,
    # i.e. 0.25 x 0.75 = 18.75 % of all rows (56.25 % are left for training).
    x_train, x_val, y_train, y_val = train_test_split(
        x_train, y_train, test_size=0.25, random_state=104)

    # Each key appears exactly once. The original literal repeated 'max_depth',
    # 'random_state' and 'min_child_weight'; the values kept here are the ones
    # that were effective (the last occurrence of each).
    bst_params = {
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'base_score': y_train.mean(),
        'n_estimators': 20000,   # upper bound; set high so early stopping can end the fit
        'early_stopping_rounds': run.config['early_stopping_rounds'],
        'eval_metric': 'logloss',
        'learning_rate': run.config['learning_rate'],
        'max_depth': run.config['max_depth'],
        'subsample': run.config['subsample'],
        'colsample_bylevel': 1.0,
        'colsample_bytree': 1.0,  # fraction of columns randomly sampled for each tree
        'gamma': 0,
        'reg_lambda': 1,
        'reg_alpha': 0,           # L1 regularization term on weights
        'max_delta_step': 0,      # positive values make the update step more conservative
        'min_child_weight': 1,
        'scale_pos_weight': 1,
        # NOTE(review): 'random_state' and 'seed' carry different values; which
        # one XGBoost honours depends on the installed version - confirm and keep one.
        'random_state': 42,
        'seed': 21,
        # NOTE(review): 'silent' looks superseded by 'verbosity' in recent
        # XGBoost releases - confirm and drop if unused.
        'silent': 1,
        'n_jobs': 1,
    }
    model = XGBClassifier(**bst_params)

    # Fit while logging per-iteration evaluation results to stdout and to W&B.
    model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)],
              callbacks=[
                  xgboost.callback.EvaluationMonitor(rank=0, period=1, show_stdv=False),
                  WandbCallback()
              ],
              verbose=0)

    # best_iteration is a zero-based index, so the number of trees up to and
    # including the best round is one more than that.
    best_n_trees = model.best_iteration + 1
    # Only updates the stored hyper-parameter; the model is not refitted here.
    model.set_params(n_estimators=best_n_trees)

    bstr = model.get_booster()

    # Log booster metrics. Fall back to the tree count computed above when the
    # installed XGBoost no longer exposes `best_ntree_limit` on the booster.
    run.summary["best_ntree_limit"] = getattr(bstr, "best_ntree_limit", best_n_trees)

    # Positive-class probabilities for the train and validation sets.
    trnYpreds = model.predict_proba(x_train)[:, 1]
    valYpreds = model.predict_proba(x_val)[:, 1]

    # Train metrics: KS statistic (max TPR - FPR gap), ROC AUC and log loss.
    false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(y_train, trnYpreds)
    run.summary['train_ks_stat'] = max(true_positive_rate - false_positive_rate)
    run.summary['train_auc'] = metrics.auc(false_positive_rate, true_positive_rate)
    run.summary['train_log_loss'] = _binary_log_loss(y_train, trnYpreds)

    # Validation metrics: two-sample KS between the score distributions of the
    # two classes, ROC AUC, accuracy at a 0.5 threshold and log loss.
    val_labels = np.asarray(y_val)
    ks_stat, ks_pval = ks_2samp(valYpreds[val_labels == 1], valYpreds[val_labels == 0])
    run.summary["val_ks_2samp"] = ks_stat
    run.summary["val_ks_pval"] = ks_pval
    run.summary["val_auc"] = metrics.roc_auc_score(y_val, valYpreds)
    run.summary["val_acc_0.5"] = metrics.accuracy_score(y_val, np.where(valYpreds >= 0.5, 1, 0))
    run.summary["val_log_loss"] = _binary_log_loss(y_val, valYpreds)

In [None]:
# Start a W&B agent on the sweep identified by `sweep_id`; it calls train() for each run.
# Optional: pass count=10 to wandb.agent to cap the number of runs this agent executes.
wandb.agent(sweep_id, function=train)