In [1]:
%autosave 0

Autosave disabled


In [15]:
import pandas as pd
import numpy as np
import xgboost as xgb
import myfunctions as mf
from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

In [3]:
df = pd.read_csv('prepped_data.csv')

train_labels = pd.read_csv('../../data/prepared/train_labels.csv')
val_labels = pd.read_csv('../../data/prepared/val_labels.csv')

In [4]:
df.shape, train_labels.shape, val_labels.shape

((253159, 1307), (39007, 2), (214152, 2))

In [5]:
seed=42

train, val = train_test_split(df, train_size=0.9,
                              random_state=seed)

train.shape, val.shape

((227843, 1307), (25316, 1307))

In [6]:
all_labels = pd.concat([train_labels, val_labels], axis=0)

all_labels.shape

(253159, 2)

In [7]:
train_final = train.merge(all_labels, how='left', on='customer_ID')
val_final = val.merge(all_labels, how='left', on='customer_ID')

train_final.shape, val_final.shape

((227843, 1308), (25316, 1308))

In [8]:
train_final.drop(columns='customer_ID', inplace=True)
val_final.drop(columns='customer_ID', inplace=True)

In [9]:
train_matrix = xgb.DMatrix(train_final.drop(columns='target'), label=train_final['target'])
val_matrix = xgb.DMatrix(val_final.drop(columns='target'), label=val_final['target'])

In [28]:
train_X = train_final.drop(columns='target')
train_y = train_final['target']

val_X = val_final.drop(columns='target')
val_y = val_final['target']

In [30]:
model = xgb.XGBClassifier().fit(train_X, train_y)

In [31]:
baseline = mf.model_evaluator(model, val_final.drop(columns='target'), val_final['target'])

baseline

0.5629355660396548

In [34]:
y_hat_base = model.predict_proba(val_X)[:,1]

In [35]:
y_true_final = pd.DataFrame(val_y)

In [36]:
y_hat_base_final = pd.DataFrame(y_hat_base, columns=['prediction'])

In [37]:
mf.amex_metric(y_true_final, y_hat_base_final)

0.778272447263978

In [45]:
def classifier_evaluator(model, data, y_true):
    
    y_hat = model.predict_proba(data)[:,1]
    
    y_true_final = pd.DataFrame(y_true)
    
    y_hat_final = pd.DataFrame(y_hat, columns=['prediction'])
    
    return amex_metric(y_true_final, y_hat_final)

In [40]:
model.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)>

In [42]:
params = {
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_child_weight': [5, 10],
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
}

In [43]:
clf = GridSearchCV(model, params, scoring='neg_log_loss',
                   refit=True, cv=2,
                   verbose=3,
                   return_train_score=True)

In [44]:
clf.fit(train_X, train_y)

Fitting 2 folds for each of 48 candidates, totalling 96 fits
[CV 1/2] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=5, subsample=0.6;, score=(train=-0.244, test=-0.248) total time= 2.6min
[CV 2/2] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=5, subsample=0.6;, score=(train=-0.244, test=-0.248) total time= 2.5min
[CV 1/2] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=5, subsample=0.8;, score=(train=-0.245, test=-0.249) total time= 2.8min
[CV 2/2] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=5, subsample=0.8;, score=(train=-0.244, test=-0.248) total time= 2.9min
[CV 1/2] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=10, subsample=0.6;, score=(train=-0.244, test=-0.248) total time= 2.5min
[CV 2/2] END colsample_bytree=0.6, learning_rate=0.05, max_depth=3, min_child_weight=10, subsample=0.6;, score=(train=-0.244, test=-0.248) total time= 2.

[CV 1/2] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, min_child_weight=10, subsample=0.6;, score=(train=-0.244, test=-0.249) total time= 3.1min
[CV 2/2] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, min_child_weight=10, subsample=0.6;, score=(train=-0.244, test=-0.248) total time= 3.1min
[CV 1/2] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, min_child_weight=10, subsample=0.8;, score=(train=-0.245, test=-0.249) total time= 3.6min
[CV 2/2] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, min_child_weight=10, subsample=0.8;, score=(train=-0.245, test=-0.248) total time= 3.6min
[CV 1/2] END colsample_bytree=0.8, learning_rate=0.05, max_depth=4, min_child_weight=5, subsample=0.6;, score=(train=-0.234, test=-0.242) total time= 4.0min
[CV 2/2] END colsample_bytree=0.8, learning_rate=0.05, max_depth=4, min_child_weight=5, subsample=0.6;, score=(train=-0.234, test=-0.241) total time= 4.0min
[CV 1/2] END colsample_bytree=0.8, learning_rate=0.05,

GridSearchCV(cv=2,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     callbacks=None, colsample_bylevel=1,
                                     colsample_bynode=1, colsample_bytree=1,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=0, gpu_id=-1,
                                     grow_policy='depthwise',
                                     importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.300000012, max_bin=256,
                                     max_ca...
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parall

In [25]:
def bo_tune_xgb(max_depth, gamma, learning_rate,
                subsample, colsample_bytree):
    
    params = {'max_depth': int(max_depth),
              'gamma': gamma,
              'learning_rate': learning_rate,
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'objective': 'binary:logistic',
              'eval_metric': 'logloss',
              'early_stopping_rounds': 10}
    
    cv_result = xgb.cv(params, train_matrix,
                       num_boost_round=1_000,
                       nfold=3)
    
    return cv_result

In [21]:
xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (3, 5),
                                            'gamma': (0,5),
                                            'learning_rate': (0.05, 0.15),
                                            'subsample': (0.5, 0.9),
                                            'colsample_bytree': (0.5, 0.9)})

In [22]:
xgb_bo.maximize(n_iter=2, init_points=8, acq='ei')

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | subsample |
-------------------------------------------------------------------------------------
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenl

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 3 dimension(s)