In [13]:
import pandas as pd
import numpy as np
import time
import warnings

from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# hyperparameter tuning
from hyperopt import fmin, tpe, hp, STATUS_OK
from hyperopt.pyll import scope

# model/grid search tracking
import mlflow

warnings.filterwarnings("ignore")

In [14]:
from preprocessing import convert_data, engineer_features, select_features
from imblearn.combine import SMOTEENN
from collections import Counter


path = 'data\\train.csv'
df = pd.read_csv(path)
df = convert_data(df)
df = engineer_features(df)
df = select_features(df)

X = df.drop('churn', axis=1)
y = df.churn

oversample = SMOTEENN()
print(Counter(y))
X, y = oversample.fit_resample(X, y)
print(Counter(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, shuffle=True, random_state=59)

(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

Counter({0: 3597, 1: 598})
Counter({1: 3026, 0: 2090})


((4348, 16), (4348,), (768, 16), (768,))

In [15]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train)

In [16]:
search_space = {
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'max_depth': scope.int(hp.uniform('max_depth', 1, 100)),
    'min_data_in_leaf': scope.int(hp.uniform('min_data_in_leaf', 1, 100)),
    'num_leaves': scope.int(hp.uniform('num_leaves', 2, 50)),
    'bagging_freq': scope.int(hp.uniform('bagging_freq', 1, 100)),
    'bagging_fraction': hp.loguniform('bagging_fraction', -1, 0),  # subsample
    'feature_fraction': hp.loguniform('feature_fraction', -1, 0),
    'lambda_l1': hp.loguniform('lambda_l1', -10, 10),
    'lambda_l2': hp.loguniform('lambda_l2', -10, 10),
    'histogram_pool_size': 5000,
    'feature_pre_filter': False,
    'force_col_wise': True,
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
    'seed': 59,
    'feature_fraction_seed': 59,
    'bagging_seed': 59,
    'verbose': -1
}

In [17]:
from sklearn.metrics import f1_score

def train_model(params):
    # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
    mlflow.lightgbm.autolog(silent=True)

    # However, we can log additional information by using an MLFlow tracking context manager 
    with mlflow.start_run(nested=True):

        # Train model and record run time
        start_time = time.time()
        booster = lgb.train(params, lgb_train, num_boost_round=5000, valid_sets=lgb_eval,
                            early_stopping_rounds=50, verbose_eval=False)
        run_time = time.time() - start_time
        mlflow.log_metric('runtime', run_time)

        # Record AUC as primary loss for Hyperopt to minimize
        predictions_test = booster.predict(X_test)
        auc_score = roc_auc_score(y_test, predictions_test)

        # Set the loss to -1*auc so fmin maximizes the auc_score
        return {'status': STATUS_OK, 'loss': -auc_score, 'booster': booster}

In [18]:
mlflow.set_experiment(experiment_name='LGB-params')
# runs initial search to assess 25 hyperparameter combinations
with mlflow.start_run(run_name='LGB_Search'):
    best_params = fmin(
        fn=train_model,
        space=search_space,
        algo=tpe.suggest,
        max_evals=25,
        rstate=np.random.default_rng(59),
    )

100%|██████████| 25/25 [02:04<00:00,  4.98s/trial, best loss: -0.9921300610955783]
