In [None]:
from xgboost import XGBClassifier
from hyperopt import Trials, fmin, tpe, space_eval, STATUS_OK, hp
from sklearn.model_selection import StratifiedKFold

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rc("font", size=14)
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
#from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)


In [None]:
import os

In [None]:
from azureml.core import Experiment
from azureml.core import Workspace, Dataset

In [None]:
import shap
shap.initjs()

In [None]:
def delFile(filetoDelete):
    """Remove the file at ``filetoDelete`` when it exists; otherwise do nothing."""
    if not os.path.exists(filetoDelete):
        return
    os.remove(filetoDelete)

In [None]:
# Identifiers of the Azure ML workspace this notebook connects to.
# NOTE(review): these values are hard-coded in the notebook; consider loading them
# from a config file or environment variables before sharing it — confirm with the team.
subscription_id = '6ed9d167-b2e6-41b8-9500-35e6df64d9dc'
resource_group = 'MLRG'
workspace_name = 'erbbimlws'

# Workspace handle used below for the Experiment and the Dataset lookup.
workspace = Workspace(subscription_id, resource_group, workspace_name)

In [None]:
# Experiment object named "vBankingPB", bound to the workspace.
experiment = Experiment(workspace = workspace, name = "vBankingPB")

In [None]:
# Start an interactive logging run on the experiment (no snapshot directory is supplied).
run = experiment.start_logging(snapshot_directory=None)

In [None]:
# Look up the dataset registered as 'vBankingPB' and load it into a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='vBankingPB')
df = dataset.to_pandas_dataframe()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Show every column when displaying DataFrames. The fully qualified option name is
# used because the short form 'max_columns' matches more than one option on newer
# pandas releases (display.* and styler.render.*) and raises OptionError.
pd.set_option('display.max_columns', None)
# pd.reset_option('display.max_columns')  # restores the default

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Pie chart of the value counts of the target column, labelled with percentages.
df['vBankingFlag'].value_counts().plot(kind = 'pie',y = 'vBankingFlag',figsize=(5, 5),title = 'vBankingFlag',autopct='%1.1f%%')

In [None]:
# Remove the columns that are excluded from modelling.
# NOTE: `df` is reassigned, so re-running this cell raises KeyError (the columns
# are already gone); reload the data first.
df = df.drop(columns = ['Customer_ID','Year','Month','Remmitances_In_nonGR_1Y_MaxAmount','Remmitances_In_nonGR_1Y_Amount','education_Code',\
    'Link_contacts_Last_3m_neu','Eb_Logins_3M_Months_num','Eb_Logins_3M_num','Legal_Person','City','Age_Band','Occupation','Global_SubSubSegment'])

In [None]:
def binF(x):
    """Convert a raw flag value to a binary 0/1 label.

    Parameters
    ----------
    x : scalar
        A single cell value (the function is applied element-wise).

    Returns
    -------
    int
        0 when ``x`` is missing (``None``, NaN, ``pd.NA``, ``NaT``) or is a boolean
        ``False`` (Python ``bool`` or ``numpy.bool_``); 1 for every other value.
    """
    # pd.isna covers None/NaN/NA/NaT and, unlike math.isnan, does not raise
    # TypeError on non-numeric values such as None or strings.
    if pd.isna(x):
        return 0
    # An identity test against False misses numpy.bool_ values, which are
    # distinct objects; check the type and truthiness instead.
    if isinstance(x, (bool, np.bool_)) and not x:
        return 0
    # Everything else counts as a positive flag (numeric 0 included, as before).
    return 1

In [None]:
# Encode the target column as 0/1 by applying binF to each value.
df['vBankingFlag'] = df['vBankingFlag'].apply(binF)

In [None]:
# One-hot encode each object-typed column, one column at a time.
# The column list is snapshotted first because `df` is rebound inside the loop.
for column in list(df.columns):
    if df[column].dtypes != 'object':
        continue
    df = pd.get_dummies(df, prefix=column + '_', columns=[column])

In [None]:
# Drop every datetime column. select_dtypes is used so that timezone-aware
# columns and non-nanosecond resolutions are removed as well, not only columns
# whose dtype is exactly 'datetime64[ns]'.
df = df.drop(columns=df.select_dtypes(include=['datetime', 'datetimetz']).columns)

In [None]:
def replace_gt(x):
    """Return ``x`` with comparison operators swapped for textual tokens.

    Substitutions are applied in order: ``<=`` -> ``'le '``, ``>=`` -> ``'ge'``,
    ``>`` -> ``'gt '`` and ``<`` -> ``'lt '``. The two-character operators come
    first so they are not split up by the single-character rules.
    """
    substitutions = (('<=', 'le '), ('>=', 'ge'), ('>', 'gt '), ('<', 'lt '))
    for symbol, token in substitutions:
        x = x.replace(symbol, token)
    return x

In [None]:
# Rewrite every column label with replace_gt so no '<' or '>' characters remain.
df.columns = [replace_gt(x) for x in df.columns]

In [None]:
# Hold out 25% of the rows for testing (fixed seed). Missing values are filled
# with 0 in both the feature matrix and the target before splitting.
x_train, x_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'vBankingFlag'].fillna(0),
    df['vBankingFlag'].fillna(0),
    test_size=0.25,
    random_state=0,
)

In [None]:
# Hyperopt search space; each sampled dict is unpacked into XGBClassifier(**params).
space = {
    "n_estimators": hp.choice("n_estimators", [100]),  # single option, i.e. fixed at 100
    "max_depth": hp.choice("max_depth", np.arange(4,7)),  # 4, 5, 6
     #"learning_rate": hp.choice("learning_rate", np.round(np.arange(0.1, 1, 0.1), 2)),
    "reg_alpha": hp.choice("reg_alpha", np.arange(10,500,50)),  # 10, 60, ..., 460
    "scale_pos_weight": hp.choice("scale_pos_weight", np.arange(1,20,5)),  # 1, 6, 11, 16
    # NOTE(review): constant forwarded to the classifier as-is; the estimator option
    # is presumably spelled `use_label_encoder` — confirm this key is intended.
    "label_encoder":False
}

In [None]:
def hyperparameter_tuning(training_data, training_target, n_splits=10, random_state=42):
    """Build a hyperopt objective that cross-validates an XGBClassifier.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    training_target : pandas.Series
        Binary target aligned with ``training_data``.
    n_splits : int, default 10
        Number of stratified folds.
    random_state : int, default 42
        Seed for the fold shuffling.

    Returns
    -------
    callable
        ``objective(params)`` returning ``{'loss': 1 - mean CV ROC AUC,
        'status': STATUS_OK}``, suitable for ``hyperopt.fmin``.
    """

    def objective(params):
        """Evaluate one parameter combination with stratified k-fold CV."""
        print('Params testing: ', params)
        print('\n ')

        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        cv_scores = []

        for train, val in cv.split(training_data, training_target):
            model_xgboost = XGBClassifier(**params, n_jobs=-1)
            model_xgboost.fit(training_data.iloc[train], training_target.iloc[train])

            # ROC AUC is a ranking metric: score the positive-class probabilities.
            # Scoring hard 0/1 predictions collapses the curve to a single point.
            val_scores = model_xgboost.predict_proba(training_data.iloc[val])[:, 1]
            cv_scores.append(roc_auc_score(training_target.iloc[val], val_scores))

        # hyperopt minimises the loss, so turn the mean AUC into 1 - AUC.
        loss = 1 - np.mean(cv_scores)

        return {'loss': loss, 'status': STATUS_OK}

    return objective

In [None]:
# Objective function bound to the training split, to be minimised by hyperopt.
objective = hyperparameter_tuning(x_train, y_train)

In [None]:
def select_parameters(objective_function, parameter_space, max_evals, prints = True):
    """Run a TPE search with hyperopt and report the winning configuration.

    Returns a tuple ``(best_parameters, best_indices, trials)``: the search-space
    values resolved by ``space_eval`` from ``trials.argmin``, the value returned
    by ``fmin``, and the ``Trials`` object holding the search history.
    When ``prints`` is truthy the best parameters are printed.
    """
    history = Trials()
    seeded_rng = np.random.default_rng(42)  # fixed seed for the search

    best_indices = fmin(
        objective_function,
        parameter_space,
        algo=tpe.suggest,
        trials=history,
        max_evals=max_evals,
        rstate=seeded_rng,
    )

    # Resolve the hp.choice indices back into actual parameter values.
    best_parameters = space_eval(parameter_space, history.argmin)

    if prints:
        print('Best params are:')
        print(best_parameters)

    return best_parameters, best_indices, history

In [None]:
# Run the search for 10 evaluations and print the best parameters found.
params, indices, trials = select_parameters(objective, space, 10, prints = True)

In [None]:
# Display the selected parameter values.
params

In [None]:
# Record the selected parameters on the run under the name 'Best Parameters'.
run.log_table('Best Parameters', params)

In [None]:
# Final classifier with explicitly fixed hyperparameters.
# NOTE(review): these values are typed in by hand rather than read from `params`
# returned by the search — confirm they match the selected configuration.
xgb = XGBClassifier(label_encoder = False,max_depth = 5,reg_alpha =160,n_estimators= 100,learning_rate = 0.3,scale_pos_weight = 16)

In [None]:
# Fit on the training split with early stopping (10 rounds) on the 'aucpr' metric.
# NOTE(review): the early-stopping evaluation set is (x_test, y_test), the same data
# the later cells report metrics on, so those test metrics may be optimistic.
xgb.fit(x_train,y_train,early_stopping_rounds =10, eval_metric = 'aucpr', eval_set = [(x_test,y_test)],verbose = False)

In [None]:
# Per-class precision/recall/F1 on the training split.
print(classification_report(y_train,  xgb.predict(x_train)))

In [None]:
# Score of the fitted classifier on the training split, printed to 4 decimals.
print('Accuracy of XGBoost classifier on train set: {:.4f}'.format(xgb.score(x_train, y_train)))

In [None]:
# Per-class precision/recall/F1 on the held-out test split.
print(classification_report(y_test,  xgb.predict(x_test)))

In [None]:
# Score of the fitted classifier on the test split, printed to 4 decimals.
print('Accuracy of XGBoost classifier on test set: {:.4f}'.format(xgb.score(x_test, y_test)))

In [None]:
# Confusion matrix on the training split; the returned display object is kept in
# `cm` so its figure can be saved afterwards.
cm = plot_confusion_matrix(xgb,x_train,y_train,display_labels = ['Non vBanking', 'vBanking'])

In [None]:
# Replace any previous image on disk with the current confusion-matrix figure,
# then close the active pyplot figure.
delFile('images/Confusion_Matrix_train.jpg')
cm.figure_.savefig('images/Confusion_Matrix_train.jpg')
plt.close()

In [None]:
# Attach the saved training confusion matrix image to the run.
run.log_image('Confusion Matrix (train)','images/Confusion_Matrix_train.jpg')

In [None]:
# Confusion matrix on the test split; `cm` is rebound to the new display object.
cm = plot_confusion_matrix(xgb,x_test,y_test,display_labels = ['Non vBanking', 'vBanking'])

In [None]:
# Replace any previous image on disk with the current confusion-matrix figure,
# then close the active pyplot figure.
delFile('images/Confusion_Matrix_test.jpg')
cm.figure_.savefig('images/Confusion_Matrix_test.jpg')
plt.close()

In [None]:
# Attach the saved test confusion matrix image to the run.
run.log_image('Confusion Matrix (test)','images/Confusion_Matrix_test.jpg')

In [None]:
# ROC curves and AUC for the train and test splits, computed from the
# positive-class probabilities.
fpr, tpr, thresholds = roc_curve(y_train.to_numpy(), xgb.predict_proba(x_train)[:,1])
auc_train = auc(fpr, tpr)

fpr_t, tpr_t, thresholds_t = roc_curve(y_test, xgb.predict_proba(x_test)[:,1])
auc_test = auc(fpr_t, tpr_t)

# The figure handle is kept in `cm` because the cell that saves the ROC image
# calls cm.savefig(...).
cm = plt.figure(figsize = (10,8))

plt.plot(fpr, tpr, label='XGBoost - train (area = %0.2f)' % auc_train)
plt.plot(fpr_t, tpr_t, label='XGBoost test (area = %0.2f)' % auc_test)
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right");

In [None]:
# Replace any previous ROC image with the current figure (held in `cm`),
# close the pyplot figure and attach the image to the run.
delFile('images/ROC.jpg')
cm.savefig('images/ROC.jpg')
plt.close()
run.log_image('ROC','images/ROC.jpg')

In [None]:
#explainer = shap.TreeExplainer(model, x_train)
# Build a SHAP explainer from the fitted model and x_train, then compute SHAP
# values for the training rows with the additivity check disabled.
explainer = shap.Explainer(xgb, x_train)
shap_values = explainer.shap_values(x_train,check_additivity=False)
# The DataFrame is passed on unchanged (the to_numpy conversion is commented out).
x_train_array = x_train#.to_numpy() # we need to pass a dense version for the plotting functions

In [None]:
# New figure kept in `ss` so the summary plot can be saved afterwards;
# the plot shows at most 20 features.
ss= plt.figure()
shap.summary_plot(shap_values, x_train_array,x_train.columns,max_display = 20)

In [None]:
# Replace any previous summary image with the figure held in `ss`,
# close the pyplot figure and attach the image to the run.
delFile('images/SHAP_Summary.jpg')
ss.savefig('images/SHAP_Summary.jpg',bbox_inches='tight')
plt.close()
run.log_image('SHAP - Summary','images/SHAP_Summary.jpg')

In [None]:
# New figure kept in `bs` for the beeswarm plot (at most 20 features).
# Note that explainer(x_train) computes a fresh explanation of the training rows here.
bs = plt.figure()
shap.plots.beeswarm(explainer(x_train),max_display = 20)

In [None]:
# Replace any previous beeswarm image, close the pyplot figure and attach the
# image to the run.
delFile('images/SHAP_bees.jpg')
# Save the beeswarm figure held in `bs` (not the summary figure `ss`).
bs.savefig('images/SHAP_bees.jpg',bbox_inches='tight')
plt.close()
run.log_image('SHAP - Beeswarm','images/SHAP_bees.jpg')

**Create a pickle file with the model.** Comment out the cell below if there is no need to change the existing model.

In [None]:
#import joblib

#joblib.dump(xgb, "models/vBanking_PB_XGBoost_model.pkl")

In [None]:
# Mark the logging run as completed.
run.complete()