In [53]:
import pandas as pd
import numpy as np
import time
import warnings

from sklearn.metrics import classification_report, f1_score, roc_auc_score
import xgboost as xgb
from sklearn.model_selection import train_test_split

# hyperparameter tuning
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope

# model/grid search tracking
import mlflow

warnings.filterwarnings("ignore")

In [54]:
from preprocessing import convert_data, engineer_features, select_features
from imblearn.combine import SMOTEENN
from collections import Counter


path = 'data\\train.csv'
df = pd.read_csv(path)
df = convert_data(df)
df = engineer_features(df)
df = select_features(df)

X = df.drop('churn', axis=1)
y = df.churn

oversample = SMOTEENN()
print(Counter(y))
X, y = oversample.fit_resample(X, y)
print(Counter(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True, random_state=59)

(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Counter({0: 3597, 1: 598})
Counter({1: 3000, 0: 2118})


((4350, 16), (4350,), (768, 16), (768,))

We can do without scaling, because the model is not linear

In [55]:
X_train

Unnamed: 0,account_length,international_plan,number_vmail_messages,total_intl_minutes,total_intl_calls,number_customer_service_calls,total_minutes,total_calls,area_code_area_code_408,area_code_area_code_415,area_code_area_code_510,state_other,avg_mt_charge,avg_call_charge,avg_intl_call_charge,both_plans
1292,122,0,29,8.400000,6,3,631.600000,271,1,0,0,1,0.521148,0.234613,0.378333,0
782,123,0,39,15.400000,7,1,624.400000,322,1,0,0,1,0.582927,0.222671,0.594286,0
4250,95,0,0,11.383436,2,1,750.134144,326,0,0,0,1,0.793959,0.232174,2.259286,0
4096,77,0,0,8.628306,9,1,733.110093,296,0,0,0,0,0.943212,0.247111,0.269163,0
3840,111,0,0,10.619981,2,3,492.912077,267,0,0,1,0,0.434861,0.181452,1.399703,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1887,143,0,0,17.200000,5,1,561.900000,277,0,1,0,1,0.467203,0.241191,0.928000,0
3893,48,0,0,12.847273,5,1,725.161154,266,1,0,0,0,1.558419,0.280735,0.659892,0
1387,106,0,29,8.100000,3,1,517.000000,335,1,0,0,1,0.395000,0.124985,0.730000,0
663,106,0,0,9.600000,2,0,685.300000,371,1,0,0,1,0.594623,0.169892,1.295000,0


In [56]:
search_space = {
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'max_depth': scope.int(hp.uniform('max_depth', 1, 100)),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.loguniform('gamma', -10, 10),
    'alpha': hp.loguniform('alpha', -10, 10),
    'lambda': hp.loguniform('lambda', -10, 10),
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 59,
}

In [57]:
train = xgb.DMatrix(data=X_train, label=y_train)
test = xgb.DMatrix(data=X_test, label=y_test)

In [58]:
def train_model(params):
    # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
    mlflow.xgboost.autolog(silent=True)

    # However, we can log additional information by using an MLFlow tracking context manager 
    with mlflow.start_run(nested=True):

        # Train model and record run time
        start_time = time.time()
        booster = xgb.train(params=params, dtrain=train, num_boost_round=5000, evals=[(test, "test")],
                            early_stopping_rounds=50, verbose_eval=False)
        run_time = time.time() - start_time
        mlflow.log_metric('runtime', run_time)

        # Record AUC as primary loss for Hyperopt to minimize
        predictions_test = booster.predict(test)
        f1score = f1_score(y_test, predictions_test)
        mlflow.log_metric('f1-score-mine', f1score)
        
        auc_score = roc_auc_score(y_test, predictions_test)

        # Set the loss to -1*f1_score so fmin maximizes the auc_score
        return {'status': STATUS_OK, 'loss': -auc_score, 'booster': booster.attributes()}

In [59]:
mlflow.set_experiment(experiment_name='XGB-params')
# runs initial search to assess 25 hyperparameter combinations
with mlflow.start_run(run_name='XGB_Search'):
    best_params = fmin(
        fn=train_model,
        space=search_space,
        algo=tpe.suggest,
        max_evals=25,
        rstate=np.random.default_rng(59),
    )

  0%|          | 0/25 [00:00<?, ?trial/s, best loss=?]

job exception: Classification metrics can't handle a mix of binary and continuous targets



  0%|          | 0/25 [00:11<?, ?trial/s, best loss=?]


ValueError: Classification metrics can't handle a mix of binary and continuous targets