In [None]:
# Install the notebook's third-party dependencies.
# `{sys.executable} -m pip` runs pip with the same interpreter that backs this kernel,
# so the packages land in the environment the notebook actually imports from.
# NOTE(review): no versions are pinned, so a fresh install may pull releases whose
# APIs differ from the ones this notebook was written against.
import sys
!{sys.executable} -m pip install xgboost
!{sys.executable} -m pip install hyperopt
!{sys.executable} -m pip install ipython-autotime
!{sys.executable} -m pip install pandas-profiling
!{sys.executable} -m pip install joblib
!{sys.executable} -m pip install pdpbox
!{sys.executable} -m pip install optuna
!{sys.executable} -m pip install lazypredict

In [None]:
from datetime import datetime
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import re
import joblib

import warnings
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action = 'ignore', category = SettingWithCopyWarning)

from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.metrics import mean_absolute_error
import os

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
plt.style.use('classic')
%matplotlib inline

import xgboost as xgb
from pandas_profiling import ProfileReport

from config import *
from utils import *

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean

# Do not truncate the contents of wide DataFrame cells when displaying them.
pd.set_option('display.max_colwidth', None)
# Default figure size for seaborn/matplotlib plots: 16 x 8.
sns.set(rc={'figure.figsize':(16,8)})

## Load Model and Dataset

In [None]:
# Root folders for the training and the test/validation CSV files.
TRAIN_FILEPATH = './Data/Prop_Data/train'
TEST_FILEPATH = './Data/Prop_Data/test'

# Train Paths
# Feature frames for the three training splits.
X_train = pd.read_csv('{}/train.csv'.format(TRAIN_FILEPATH))
X_train2 = pd.read_csv('{}/train2.csv'.format(TRAIN_FILEPATH))
# NOTE(review): X_train3 is read from 'train3_label.csv', the same file as y_train3 below,
# while the other feature frames use 'train.csv' / 'train2.csv'. Possibly a copy-paste
# slip — confirm which file is intended.
X_train3 = pd.read_csv('{}/train3_label.csv'.format(TRAIN_FILEPATH))

# Label frames for the three training splits.
y_train = pd.read_csv('{}/train_labels.csv'.format(TRAIN_FILEPATH))
y_train2 = pd.read_csv('{}/train2_label.csv'.format(TRAIN_FILEPATH))
y_train3 = pd.read_csv('{}/train3_label.csv'.format(TRAIN_FILEPATH))

# Test Paths
# Feature frames for the test, calibration and validation sets.
X_test = pd.read_csv('{}/test.csv'.format(TEST_FILEPATH))
# NOTE(review): X_calibrated is read from 'test_labels.csv', the same file as y_test below,
# whereas y_calibrated comes from 'calibrated_labels.csv' — confirm the intended feature file.
X_calibrated = pd.read_csv('{}/test_labels.csv'.format(TEST_FILEPATH))
X_val = pd.read_csv('{}/validation.csv'.format(TEST_FILEPATH))

# Label frames for the test, calibration and validation sets.
y_test = pd.read_csv('{}/test_labels.csv'.format(TEST_FILEPATH))
y_calibrated = pd.read_csv('{}/calibrated_labels.csv'.format(TEST_FILEPATH))
y_val = pd.read_csv('{}/validation_label.csv'.format(TEST_FILEPATH))

In [None]:
# Folder that holds the persisted model artifacts.
model_path = './model_results/prop_model'

# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts from a trusted source.
nb_model = joblib.load('{}/prop_model_hyperopt_20211008.pkl'.format(model_path))

# Refit the loaded estimator on the second training split.
# The previous version called `load_model.fit(...)` here, but `load_model` is not defined
# at this point of a top-to-bottom run (NameError on a fresh kernel); the estimator loaded
# just above is `nb_model`.
# Both names are bound to the result of fitting that one estimator, so it is fitted once
# and the result is shared instead of repeating an identical fit.
Hyperopt_model = nb_model.fit(X_train2[[i for i in df_cleaned.columns if i in FEATURES]], y_train2)
Optuna_model = Hyperopt_model

# Hyperopt Tuning Framework

In [None]:
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from hyperopt.tpe import suggest
from hyperopt.pyll.stochastic import sample as ho_sample
from sklearn.model_selection import KFold,TimeSeriesSplit
from sklearn.metrics import roc_auc_score

In [None]:
# Define Hyper tuning parameters and value range
# Search space handed to hyperopt's fmin: plain values are fixed for every trial,
# `hp.*` expressions are sampled per trial.
hyperopt_space = {
    # Fixed: initial prediction score, set to the mean of the LABEL column of X_train3.
    'base_score': np.mean(X_train3[LABEL]),
    'objective': 'binary:logistic',
    # hp.choice samples one of the listed values; the trial record stores the list *index*,
    # so any manual decoding must use exactly these lists.
    'n_estimators': hp.choice('n_estimators',[400,600,800]),
    'max_depth': hp.choice('max_depth', [4,5,6]),
    # Log-uniform between exp(-2*ln(10)) = 0.01 and exp(-1*ln(10)) = 0.1.
    'learning_rate': hp.loguniform('learning_rate', low = -2 * np.log(10), high = -1 * np.log(10)),
    'gamma': hp.uniform('gamma', 0.01, .7),
    # quniform(label, low, high, q): values between low and high rounded to multiples of q.
    'subsample': hp.quniform('subsample', 0.7, 1, 0.1),
    'alpha': hp.quniform('alpha', 0.5, 1.5, 0.1),
    'lambda': hp.quniform('lambda', 0.5, 1.5, 0.1),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.7, 1, 0.1),
    'max_delta_step': hp.quniform('max_delta_step',1, 10, 1),
    # NOTE(review): the lower bound is 0, so a sampled positive-class weight can be (close to) zero — confirm this is intended.
    'scale_pos_weight': hp.uniform('scale_pos_weight', low =0, high = 20),
#     'scale_pos_weight': hp.loguniform('scale_pos_weight', low = 0, high = 5),
#     'eval_metric': 'auc'
     'eval_metric':'aucpr'}

In [None]:
#Run Hyperopts framework to get best parameters
%%time
def hyperparameter_tuning(param):
    xgb_model = xgb.XGBClassifier(**param, random_state = 0)
    model = xgb_model.fit(X_train3[[i for i in df_cleaned.columns if i in FEATURES]], y_train3)
    
    preds = xgb_model.predict(X_val[model.get_booster().feature_names] )
    
    f1 = f1_score(y_val, preds)
    
    return 1 - f1

trials = Trials()
random_state = np.random.RandomState(0)

param = hyperopt_space

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    hyperopt_results = fmin(
        fn = hyperparameter_tuning,
        space = hyperopt_space,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials,
        rstate = random_state
    )
        

print("Best: {}".format(hyperopt_results))

In [None]:
def dump(obj):
    """Print every attribute of ``obj`` as ``obj.<name> = <value>``.

    Attribute names are taken from ``dir(obj)``. Names whose lookup raises
    ``AttributeError`` are skipped. Each attribute is read exactly once, so
    properties are not evaluated twice.

    Parameters
    ----------
    obj : object
        Any object to inspect.

    Returns
    -------
    None
        The function only prints.
    """
    missing = object()  # sentinel: distinguishes "no such attribute" from a None value
    for attr in dir(obj):
        value = getattr(obj, attr, missing)
        if value is missing:
            continue
        print("obj.%s = %s" % (attr, value))
            
            

# Print all attributes of the Hyperopt `trials` object for inspection.
# NOTE: dump() only prints and has no return statement, so `t` is always None.
t = dump(trials)

In [None]:
#Fitting Model with outputted parameters

%%time
# Grab the best hyperparaemeters from the best trial
# This code should ideally be synced to the hyperparameter grid 
best_hyperparams = {
    'n_estimators': [200,400,600,800][trials.best_trial['misc']['vals']['n_estimators'][0]],
    'max_depth': [4,5,6][trials.best_trial['misc']['vals']['max_depth'][0]],
    'learning_rate': trials.best_trial['misc']['vals']['learning_rate'][0],
    'subsample': trials.best_trial['misc']['vals']['subsample'][0],
    'gamma': trials.best_trial['misc']['vals']['gamma'][0],
    'alpha': trials.best_trial['misc']['vals']['alpha'][0],
    'lambda': trials.best_trial['misc']['vals']['lambda'][0],
    'colsample_bytree': trials.best_trial['misc']['vals']['colsample_bytree'][0],
    'max_delta_step': trials.best_trial['misc']['vals']['max_delta_step'][0],
    'scale_pos_weight': trials.best_trial['misc']['vals']['scale_pos_weight'][0],
    'eval_metric': 'aucpr'
}

best_model = xgb.XGBClassifier(**best_hyperparams, random_state = 0)
best_model.fit(X_train[[i for i in df_cleaned.columns if i in FEATURES]], y_train)

In [None]:
# Score the Hyperopt-tuned model on the held-out test split.
# NOTE(review): accuracy_score / precision_score / recall_score / f1_score are not imported
# in the visible cells — presumably they come from `from utils import *`; confirm.

# Restrict X_test to the cleaned FEATURES columns, then order them as the booster expects.
hyperopt_eval_frame = X_test[[i for i in df_cleaned.columns if i in FEATURES]][best_model.get_booster().feature_names]
preds = best_model.predict(hyperopt_eval_frame)
pred_proba = best_model.predict_proba(hyperopt_eval_frame)[:,1]

# Label-based metrics use the hard predictions; AUC uses the positive-class probability.
acc, precision, recall, f1 = (
    round(metric(y_test, preds), 3)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = round(roc_auc_score(y_test, pred_proba), 3)

report_template = 'Model evaluation: ACC: {} / AUC: {} /  Precision: {} / Recall: {} / F1: {}'
print(report_template.format(acc, auc, precision, recall, f1))

## Save Hyperopt Model

In [None]:
# Save the fitted Hyperopt-tuned model (`best_model`) with joblib.
# NOTE(review): despite the '_trials.pkl' suffix, the object written here is `best_model`,
# not the Hyperopt `trials` object — confirm the file name is intended.
HYPEROPT_OUTPUT_PATH = './model_results/prop_model'
MODEL_NAME = 'prop_trial_hyperopt1'
DATE = '_20210923'
# joblib.dump returns the list of file names it wrote, so `hyp_model` holds paths, not a model.
hyp_model = joblib.dump(best_model,  '{}/{}{}{}'.format(HYPEROPT_OUTPUT_PATH,MODEL_NAME,DATE, '_trials.pkl'))
hyp_model

# Optuna Tuning Framework

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: train XGBoost with sampled hyperparameters and return validation F1.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        F1 score on ``X_val`` / ``y_val`` (the study maximises this value).
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` / `suggest_loguniform`;
    # `log=True` samples on a log scale, matching the former loguniform calls.
    param_space = {
        'base_score': np.mean(X_train3[LABEL]),
        'objective': 'binary:logistic',
        'n_estimators': trial.suggest_int('n_estimators', 10, 100),
        'max_depth': trial.suggest_int('max_depth', 4, 6),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1),
        'subsample': trial.suggest_float('subsample', 0.7, 1),
        'alpha': trial.suggest_float('alpha', 0.5, 1.5),
        'lambda': trial.suggest_float('lambda', 0.5, 1.5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1),
        'max_delta_step': trial.suggest_float('max_delta_step', 1, 10),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.001, 20, log=True),
        'eval_metric': 'auc'
    }

    model = xgb.XGBClassifier(**param_space)
    model.fit(X_train3[[i for i in df_cleaned.columns if i in FEATURES]], y_train3)

    # Validate with the columns this trial's model was trained on. The previous version
    # took the column list from the unrelated `nb_model`, which only works if both models
    # happen to share identical features.
    pred = model.predict(X_val[model.get_booster().feature_names])

    return f1_score(y_val, pred)

# Seed the sampler so the search is reproducible, mirroring the seeded Hyperopt run.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=50)

print('Best Parameters:', study.best_params)
print()
print('Best Value:', study.best_value)
print()
print('Best Trial:', study.best_trial)


In [None]:
# Refit an XGBoost classifier on the first training split with the best Optuna trial's parameters.
# NOTE(review): study.best_params presumably contains only the values sampled through
# `trial.suggest_*`; the fixed entries used inside `objective` ('base_score', 'objective',
# 'eval_metric') are not passed here — confirm that this is intended.
test_model = xgb.XGBClassifier(**study.best_params)
test_model.fit(X_train[[i for i in df_cleaned.columns if i in FEATURES]], y_train)

In [None]:
# Score the Optuna-tuned model on the held-out test split.
# NOTE(review): accuracy_score / precision_score / recall_score / f1_score are not imported
# in the visible cells — presumably they come from `from utils import *`; confirm.

# Restrict X_test to the cleaned FEATURES columns, then order them as the booster expects.
optuna_eval_frame = X_test[[i for i in df_cleaned.columns if i in FEATURES]][test_model.get_booster().feature_names]
opt_preds = test_model.predict(optuna_eval_frame)
opt_pred_proba = test_model.predict_proba(optuna_eval_frame)[:,1]

# Label-based metrics use the hard predictions; AUC uses the positive-class probability.
acc, precision, recall, f1 = (
    round(metric(y_test, opt_preds), 3)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = round(roc_auc_score(y_test, opt_pred_proba), 3)

report_template = 'Model evaluation: ACC: {} / AUC: {} /  Precision: {} / Recall: {} / F1: {}'
print(report_template.format(acc, auc, precision, recall, f1))

## Save Optuna Model

In [None]:
# Save the fitted Optuna-tuned model (`test_model`) with joblib.
# NOTE(review): despite the '_trials.pkl' suffix, the object written here is `test_model`,
# not the Optuna study or its trials — confirm the file name is intended.
OPTUNA_OUTPUT_PATH = './model_results/prop_model'
MODEL_NAME = 'prop_trial_optuna'
DATE = '_20210923'
# joblib.dump returns the list of file names it wrote, so `opt_model` holds paths, not a model.
opt_model = joblib.dump(test_model,  '{}/{}{}{}'.format(OPTUNA_OUTPUT_PATH,MODEL_NAME,DATE, '_trials.pkl'))
opt_model

# Model Calibration

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import calibration_curve
from matplotlib import pyplot

## Load Best Model for Calibration

In [None]:
# Reload the persisted Hyperopt model from `model_path` and refit it on the second training split.
# NOTE: joblib.load unpickles the file — only load artifacts from a trusted source.
load_model = joblib.load('{}/prop_model_hyperopt_20211008.pkl'.format(model_path))
# `fit` is called on the loaded object and its return value is bound to `model`
# (presumably the same estimator instance as `load_model` — scikit-learn style `fit` returns self).
model = load_model.fit(X_train2[[i for i in df_cleaned.columns if i in FEATURES]], y_train2)

In [None]:
# Hard predictions and positive-class probabilities of the uncalibrated model on the test set,
# using the columns the booster was trained on.
uncalibrated_test_frame = X_test[model.get_booster().feature_names]
pred = model.predict(uncalibrated_test_frame)
pred_proba = model.predict_proba(uncalibrated_test_frame)[:,1]

## "Sigmoid" Model Calibration

In [None]:
# Wrap the model in a sigmoid calibrator with 3-fold cross-validation and fit it on the calibration set.
# NOTE(review): with an integer `cv`, scikit-learn presumably clones and refits the base estimator
# on each fold, so the fit performed earlier on `model` would not be reused (cv='prefit' keeps it) — confirm intent.
calibrated = CalibratedClassifierCV(model, method='sigmoid', cv=3)
calibrated.fit(X_calibrated[[i for i in df_cleaned.columns if i in FEATURES]], y_calibrated)

In [None]:
# Predictions of the sigmoid-calibrated model on the train and test sets,
# restricted to the columns the base booster was trained on.
sigmoid_train_frame = X_train[model.get_booster().feature_names]
sigmoid_test_frame = X_test[model.get_booster().feature_names]

# Train split: hard labels and positive-class probabilities.
sig_pred_train = calibrated.predict(sigmoid_train_frame)
sig_pred_proba_train = calibrated.predict_proba(sigmoid_train_frame)[:,1]

# Test split: hard labels and positive-class probabilities.
sig_pred = calibrated.predict(sigmoid_test_frame)
sig_pred_proba = calibrated.predict_proba(sigmoid_test_frame)[:,1]

In [None]:
# Calibration Plot (sigmoid-calibrated model)

# Predicted positive-class probabilities on the test set.
probs = calibrated.predict_proba(X_test[[i for i in df_cleaned.columns if i in FEATURES]])[:, 1]
# Reliability diagram data: fraction of positives vs. mean predicted value per bin.
# `normalize=True` was dropped: predict_proba already returns proper probabilities in [0, 1],
# min-max rescaling them distorts the curve, and the argument is deprecated/removed in
# newer scikit-learn releases.
fop, mpv = calibration_curve(y_test, probs, n_bins=10)
# Diagonal = perfectly calibrated reference.
pyplot.plot([0, 1], [0, 1], linestyle='--')
# Reliability curve of the calibrated model.
pyplot.plot(mpv, fop, marker='.')
pyplot.xlabel('Mean predicted probability')
pyplot.ylabel('Fraction of positives')
pyplot.show()

In [None]:
# PP Plot

# Attach the sigmoid-calibrated probabilities and the labels to X_test.
# NOTE: this mutates X_test in place (adds/overwrites the 'predictions' and 'label' columns).
X_test['predictions'] = sig_pred_proba
# NOTE(review): y_test is the frame read from the labels CSV; assigning it to a single column
# assumes it holds exactly one column — confirm.
X_test['label'] = y_test
# decile_plot is not defined in the visible cells (presumably provided by `from utils import *`).
decile_plot(X_test, 'predictions', 'label', 'with product')

## "Isotonic" Model Calibration

In [None]:
# Wrap the model in an isotonic calibrator with 3-fold cross-validation and fit it on the calibration set.
# NOTE: this rebinds `calibrated`, replacing the sigmoid calibrator created earlier.
# NOTE(review): with an integer `cv`, scikit-learn presumably clones and refits the base estimator
# on each fold, so the fit performed earlier on `model` would not be reused (cv='prefit' keeps it) — confirm intent.
calibrated = CalibratedClassifierCV(model, method='isotonic', cv=3)
calibrated.fit(X_calibrated[[i for i in df_cleaned.columns if i in FEATURES]], y_calibrated)

In [None]:
# Predictions of the isotonic-calibrated model on the train and test sets,
# restricted to the columns the base booster was trained on.
isotonic_train_frame = X_train[model.get_booster().feature_names]
isotonic_test_frame = X_test[model.get_booster().feature_names]

# Train split: hard labels and positive-class probabilities.
iso_pred_train = calibrated.predict(isotonic_train_frame)
iso_pred_proba_train = calibrated.predict_proba(isotonic_train_frame)[:,1]

# Test split: hard labels and positive-class probabilities.
iso_pred = calibrated.predict(isotonic_test_frame)
iso_pred_proba = calibrated.predict_proba(isotonic_test_frame)[:,1]

In [None]:
# Calibration Plot (isotonic-calibrated model)

# Predicted positive-class probabilities on the test set.
probs = calibrated.predict_proba(X_test[[i for i in df_cleaned.columns if i in FEATURES]])[:, 1]
# Reliability diagram data: fraction of positives vs. mean predicted value per bin.
# `normalize=True` was dropped: predict_proba already returns proper probabilities in [0, 1],
# min-max rescaling them distorts the curve, and the argument is deprecated/removed in
# newer scikit-learn releases.
fop, mpv = calibration_curve(y_test, probs, n_bins=10)
# Diagonal = perfectly calibrated reference.
pyplot.plot([0, 1], [0, 1], linestyle='--')
# Reliability curve of the calibrated model.
pyplot.plot(mpv, fop, marker='.')
pyplot.xlabel('Mean predicted probability')
pyplot.ylabel('Fraction of positives')
pyplot.show()

In [None]:
# PP Plot (isotonic-calibrated model)

# Use the isotonic probabilities here. The previous version plotted `sig_pred_proba`
# (copied from the sigmoid section), so this plot just repeated the sigmoid result.
# NOTE: this mutates X_test in place (adds/overwrites the 'predictions' and 'label' columns).
X_test['predictions'] = iso_pred_proba
X_test['label'] = y_test
decile_plot(X_test, 'predictions', 'label', 'with product')