In [10]:
import pandas as pd
import numpy as np
import time
import warnings

from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.model_selection import train_test_split

# hyperparameter tuning
from hyperopt import fmin, tpe, hp, STATUS_OK
from hyperopt.pyll import scope

# model/grid search tracking
import mlflow

warnings.filterwarnings("ignore")

Simple preprocessing steps (all functions are from `preprocessing.py`) + `SMOTEENN` for class imbalance

In [11]:
from preprocessing import convert_data, engineer_features, select_features
from imblearn.combine import SMOTEENN
from collections import Counter


path = 'data\\train.csv'
df = pd.read_csv(path)
df = convert_data(df)
df = engineer_features(df)
df = select_features(df)

X = df.drop('churn', axis=1)
y = df.churn

oversample = SMOTEENN()
print(Counter(y))
X, y = oversample.fit_resample(X, y)
print(Counter(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True, random_state=59)

(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Counter({0: 3597, 1: 598})
Counter({1: 2990, 0: 2076})


((4306, 16), (4306,), (760, 16), (760,))

We can do without scaling, because the model is not linear

In [12]:
X_train

Unnamed: 0,account_length,international_plan,number_vmail_messages,total_intl_minutes,total_intl_calls,number_customer_service_calls,total_minutes,total_calls,area_code_area_code_408,area_code_area_code_415,area_code_area_code_510,state_other,avg_mt_charge,avg_call_charge,avg_intl_call_charge,both_plans
67,93,0,0,10.900000,3,2,578.300000,300,0,1,0,1,0.743548,0.230500,0.980000,0
1081,69,0,37,5.900000,4,1,441.100000,320,0,1,0,1,0.651014,0.140375,0.397500,0
1379,158,0,0,7.100000,9,1,629.500000,317,0,0,1,1,0.384873,0.191830,0.213333,0
3568,73,0,0,11.765947,8,0,707.710041,310,0,0,0,1,0.992542,0.233796,0.397919,0
3320,102,0,0,9.247045,1,0,585.829422,346,0,0,0,1,0.550211,0.163495,2.379728,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1887,90,0,0,11.400000,2,3,587.300000,334,0,1,0,1,0.646333,0.174162,1.540000,0
3893,102,0,46,8.735002,3,5,568.392704,318,0,0,0,1,0.516815,0.167150,0.633200,0
1387,69,0,40,12.500000,6,1,427.300000,282,0,0,1,1,0.511884,0.125248,0.563333,0
663,87,0,0,13.400000,3,1,616.100000,306,0,0,1,1,0.685287,0.194837,1.206667,0


Fast view of labels

In [13]:
y_train

67      0
1081    0
1379    0
3568    1
3320    1
       ..
1887    0
3893    1
1387    0
663     0
4273    1
Name: churn, Length: 4306, dtype: int64

Our search space, all params will be tuned in those ranges

In [14]:
search_space = {
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'max_depth': scope.int(hp.uniform('max_depth', 1, 100)),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.loguniform('gamma', -10, 10),
    'alpha': hp.loguniform('alpha', -10, 10),
    'lambda': hp.loguniform('lambda', -10, 10),
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 59,
}

Data for XGB in DMatrix

In [15]:
train = xgb.DMatrix(data=X_train, label=y_train)
test = xgb.DMatrix(data=X_test, label=y_test)

In [16]:
def train_model(params):
    # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
    mlflow.xgboost.autolog(silent=True)

    # However, we can log additional information by using an MLFlow tracking context manager 
    with mlflow.start_run(nested=True):

        # Train model and record run time
        start_time = time.time()
        booster = xgb.train(params=params, dtrain=train, num_boost_round=5000, evals=[(test, "test")],
                            early_stopping_rounds=50, verbose_eval=False)
        run_time = time.time() - start_time
        mlflow.log_metric('runtime', run_time)

        # Record AUC as primary loss for Hyperopt to minimize
        predictions_test = booster.predict(test)
        auc_score = roc_auc_score(y_test, predictions_test)

        # Set the loss to -1*auc so fmin maximizes the auc_score
        return {'status': STATUS_OK, 'loss': -auc_score, 'booster': booster.attributes()}

In [17]:
mlflow.set_experiment(experiment_name='XGB-params')
# runs initial search to assess 25 hyperparameter combinations
with mlflow.start_run(run_name='XGB_Search'):
    best_params = fmin(
        fn=train_model,
        space=search_space,
        algo=tpe.suggest,
        max_evals=25,
        rstate=np.random.default_rng(59),
    )

100%|██████████| 25/25 [02:36<00:00,  6.25s/trial, best loss: -0.9947067361051823]


To sum up, best AUC Score with XGB on validation = `0.995`, with the next set of parameters:
| Name | Value |
| --- | --- | 
| alpha | 0.0028945249679493148 |
| colsample_bytree | 0.8053849379900877 |
| early_stopping_rounds | 50 |
| eval_metric | auc |
| gamma | 4.891618838693695e-05 |
| lambda | 0.0026958913843695385 |
| learning_rate | 0.007169413240310411
| max_depth | 10 |
| maximize | None |
| min_child_weight | 0.963095425360374 |
| num_boost_round | 5000 |
| objective | binary:logistic |
| seed | 59 |
| subsample | 0.5317032764328798 |
| verbose_eval | False |