In [None]:
import lightgbm as lgb

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rc("font", size=14)
import math
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
#from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

from azureml.core import Experiment
from azureml.core import Workspace, Dataset


In [None]:
import shap
shap.initjs()

In [None]:
subscription_id = '6ed9d167-b2e6-41b8-9500-35e6df64d9dc'
resource_group = 'MLRG'
workspace_name = 'erbbimlws'

workspace = Workspace(subscription_id, resource_group, workspace_name)

In [None]:
experiment = Experiment(workspace = workspace, name = "LightGBM_on_Primary_Bank")

In [None]:
run = experiment.start_logging(snapshot_directory=None)

In [None]:
dataset = Dataset.get_by_name(workspace, name='Primary_Bank')
df = dataset.to_pandas_dataframe()

In [None]:
# Show every column when a DataFrame is displayed.
# The fully qualified key is used because the short pattern 'max_columns' is
# ambiguous on pandas versions that also define 'styler.render.max_columns'
# (set_option then raises OptionError: "Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
# pd.reset_option('display.max_columns')  # restores the default

In [None]:
df.head()

In [None]:
df['New_Primary_Bank_Flag'].value_counts().plot(kind = 'pie',y = 'New_Primary_Bank_Flag',figsize=(5, 5),title = 'New_Primary_Bank_Flag',autopct='%1.1f%%')


In [None]:
df = df.drop(columns = ['Customer_ID','Year','Month','Primary_Bank'])

In [None]:
def binF(x):
    """Convert a single raw flag value into a binary 0/1 label.

    Parameters
    ----------
    x : object
        One cell value, e.g. ``True``, ``False`` or a missing-value marker.

    Returns
    -------
    int
        ``0`` if ``x`` is a boolean false (``False`` or ``numpy.False_``) or a
        missing value (``None``, ``NaN``, ``pd.NA``, ``NaT``); ``1`` otherwise.
        Non-boolean values such as the number ``0`` or a string map to ``1``.
    """
    # Match booleans by type rather than identity so numpy booleans are handled;
    # a plain truthiness test is avoided because numeric 0 must still map to 1.
    if isinstance(x, (bool, np.bool_)):
        return int(x)
    # pd.isna recognises None / NaN / pd.NA / NaT and, unlike math.isnan,
    # does not raise on values that cannot be converted to float.
    if pd.isna(x):
        return 0
    return 1

In [None]:
df['New_Primary_Bank_Flag'] = df['New_Primary_Bank_Flag'].apply(binF)

In [None]:
#dummy variables
for col in df.columns:       
       if df[col].dtypes=='object':
            df.drop(columns=col, inplace = True)
            #df = pd.get_dummies(df, prefix=col + '_', columns=[col])
    

In [None]:
for col in df.columns:       
       if df[col].dtypes=='datetime64[ns]':
            df = df.drop(columns = col)


In [None]:
def replace_gt(x):
    """Replace comparison symbols in a string with short text tokens.

    The two-character operators are substituted before the single-character
    ones so that '<=' and '>=' are not broken up by the '<' / '>' rules.

    Parameters
    ----------
    x : str
        Text (here: a column label) that may contain '<=', '>=', '>' or '<'.

    Returns
    -------
    str
        The text with every such symbol replaced.
    """
    # NOTE(review): 'ge' is the only token without a trailing space - confirm
    # whether that is intended.
    substitutions = (('<=', 'le '), ('>=', 'ge'), ('>', 'gt '), ('<', 'lt '))
    for symbol, token in substitutions:
        x = x.replace(symbol, token)
    return x

In [None]:
df.columns = [replace_gt(x) for x in df.columns]

In [None]:
df.head()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(df.loc[:,df.columns !='New_Primary_Bank_Flag'].fillna(0), df['New_Primary_Bank_Flag'].fillna(0), test_size=0.25, random_state=0)
#x_train, x_test, y_train, y_test = train_test_split(df.loc[:,important_features].fillna(0), df['New_Primary_Bank_Flag'].fillna(0), test_size=0.25, random_state=0)


In [None]:
parameters = {'max_depth':5,'colsample_bytree':0.5,'min_data_in_leaf':1000}

In [None]:
run.log_table('Parameters', parameters)

In [None]:
model = lgb.LGBMClassifier(**parameters)

In [None]:
model.fit(x_train,y_train,early_stopping_rounds =10,\
            eval_metric = 'aucpr', eval_set = [(x_test,y_test)],verbose = False)

In [None]:
print(classification_report(y_train,  model.predict(x_train)))

In [None]:
print('Accuracy of Light GBoost classifier on train set: {:.4f}'.format(model.score(x_train, y_train)))

In [None]:
run.log('Accuracy of Light GBoost classifier on train set:','{:.4f}'.format(model.score(x_train, y_train)))

In [None]:
print(classification_report(y_test,  model.predict(x_test)))

In [None]:
print('Accuracy of Light GBoost classifier on test set: {:.4f}'.format(model.score(x_test, y_test)))

In [None]:
run.log('Accuracy of Light GBoost classifier on test set:','{:.4f}'.format(model.score(x_test, y_test)))

In [None]:

cm = plot_confusion_matrix(model,x_train,y_train,display_labels = ['Non Primary', 'Primary'])

In [None]:
cm.figure_.savefig('images/Confusion_Matrix_train.jpg')
plt.close()

In [None]:
run.log_image('Confusion Matrix (train)','images/Confusion_Matrix_train.jpg')

In [None]:
cm = plot_confusion_matrix(model,x_test,y_test,display_labels = ['Non Primary', 'Primary'])

In [None]:
cm.figure_.savefig('images/Confusion_Matrix_test.jpg')
plt.close()

In [None]:
run.log_image('Confusion Matrix (test)','images/Confusion_Matrix_test.jpg')

In [None]:
logit_roc_auc = roc_auc_score(y_test, model.predict(x_test))
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, model.predict_proba(x_test)[:,1])
auc_test = roc_auc = auc(fpr_test, tpr_test)

logit_roc_auc = roc_auc_score(y_train, model.predict(x_train))
fpr_train, tpr_train, thresholds_train = roc_curve(y_train, model.predict_proba(x_train)[:,1])
auc_train = roc_auc = auc(fpr_train, tpr_train)
cm =plt.figure()
plt.plot(fpr_train, tpr_train, label='Light GBoost train (area = %0.2f)' % auc_train)
plt.plot(fpr_test, tpr_test, label='Light GBoost test (area = %0.2f)' % auc_test)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")

In [None]:
cm.savefig('images/ROC.jpg')
plt.close()
run.log_image('ROC','images/ROC.jpg')

In [None]:
#explainer = shap.TreeExplainer(model, x_train)
explainer = shap.Explainer(model, x_train)
shap_values = explainer.shap_values(x_train,check_additivity=False)
x_train_array = x_train#.to_numpy() # we need to pass a dense version for the plotting functions

In [None]:
ss= plt.figure()
shap.summary_plot(shap_values, x_train_array,x_train.columns,max_display = 20)


In [None]:
ss.savefig('images/SHAP_Summary.jpg',bbox_inches='tight')
plt.close()
run.log_image('SHAP - Summary','images/SHAP_Summary.jpg')

In [None]:
pd.set_option('display.max_rows', None)

In [None]:
important_features = pd.DataFrame(model.feature_importances_,x_train.columns,columns =['Importance'])\
    .sort_values(by='Importance', ascending = False).head(20).index.tolist()


In [None]:
important_features

In [None]:
bs = plt.figure()
shap.plots.beeswarm(explainer(x_train),max_display = 20)

In [None]:
bs.savefig('images/SHAP_Beeswarm.jpg',bbox_inches='tight')
plt.close()
run.log_image('SHAP - Beeswarm','images/SHAP_Beeswarm.jpg')

In [None]:
run.complete()