In [16]:
import pandas as pd
import numpy as np
import time
import warnings

from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

# hyperparameter tuning
from hyperopt import fmin, tpe, hp, STATUS_OK
from hyperopt.pyll import scope

# model/grid search tracking
import mlflow

warnings.filterwarnings("ignore")

In [17]:
from preprocessing import convert_data, engineer_features, select_features
from imblearn.combine import SMOTEENN
from collections import Counter


path = 'data\\train.csv'
df = pd.read_csv(path)
df = convert_data(df)
df = engineer_features(df)
df = select_features(df)

X = df.drop('churn', axis=1)
y = df.churn

oversample = SMOTEENN()
print(Counter(y))
X, y = oversample.fit_resample(X, y)
print(Counter(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True, random_state=59)

(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Counter({0: 3597, 1: 598})
Counter({1: 2986, 0: 2079})


((4305, 16), (4305,), (760, 16), (760,))

In [18]:
train = Pool(data=X_train, label=y_train)
test = Pool(data=X_test, label=y_test)

In [19]:
search_space = {
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'depth': scope.int(hp.uniform('max_depth', 1, 16)),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1),
    'bagging_temperature': hp.loguniform('gamma', -10, 10),
    'random_strength': hp.loguniform('alpha', -10, 10),
    'l2_leaf_reg': hp.loguniform('lambda', -10, 10),
    'eval_metric': 'AUC',
    'random_state': 59,
}

In [20]:
from sklearn.metrics import f1_score

def train_model(params):
    # log information by using an MLFlow tracking context manager 
    with mlflow.start_run(nested=True):
        cb = CatBoostClassifier(**params)
        # Train model and record run time
        start_time = time.time()
        booster = cb.fit(train, eval_set=[test],
                         early_stopping_rounds=50, verbose_eval=False)
        run_time = time.time() - start_time
        mlflow.log_metric('runtime', run_time)

        # Record AUC as primary loss for Hyperopt to minimize
        predictions_test = booster.predict(test)
        auc_score = roc_auc_score(y_test, predictions_test)
        mlflow.log_metric('auc', auc_score)

        f1score = f1_score(y_test, predictions_test)
        mlflow.log_metric('f1', f1score)
        
        mlflow.log_params(params)

        # Set the loss to -1*auc so fmin maximizes the auc_score
        return {'status': STATUS_OK, 'loss': -auc_score}

In [21]:
mlflow.set_experiment(experiment_name='CatBoost-params')
# runs initial search to assess 25 hyperparameter combinations
with mlflow.start_run(run_name='Cat_Search'):
    best_params = fmin(
        fn=train_model,
        space=search_space,
        algo=tpe.suggest,
        max_evals=25,
        rstate=np.random.default_rng(59),
    )

100%|██████████| 25/25 [13:21<00:00, 32.06s/trial, best loss: -0.9668672984433251] 
