In [1]:
from collections import Counter
import pandas as pd
import numpy as np
# Never truncate cell text when rendering frames. `None` is the supported
# "no limit" value; the former -1 sentinel has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
# Cap rendered frames at 20 rows so outputs stay readable.
pd.set_option('display.max_rows', 20)

In [2]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [3]:
import os,sys,inspect
# Make the parent of the notebook's working directory importable, so the
# project modules imported in the next cell can be found.
# Inside a kernel the current frame's "file" is a synthetic cell name (and may
# live in a temp directory), so the working directory is used instead of
# inspect.getfile(inspect.currentframe()).
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guarded so that re-running the cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

In [4]:
from statistic import Statistic
from utils import UtilsKy
from analyzer import HelperAnalyzer, AnalyzerPrediction

In [5]:
# Reload edited modules automatically before each cell runs (autoreload
# mode 2), so changes to imported .py files do not need a kernel restart.
%load_ext autoreload
%autoreload 2

In [6]:
# Teach and test sets are cp1251-encoded; every column is loaded as text and
# converted to numbers later where needed.
db_teach, db_test = (
    pd.read_csv(csv_path, dtype=str, encoding='cp1251')
    for csv_path in (UtilsKy.DB_TEACH_KYW3, UtilsKy.DB_TEST_KYW3)
)
# The white list is read with the default encoding, also as text.
white = pd.read_csv(UtilsKy.WHITE_KYW3, dtype=str)

In [7]:
# Columns used as model features (selected from both the teach and test sets).
COL_FACTORS = [
    'bin',
    'amount',
    'bank_currency',
    'hour',
    'day_of_week',
    'longitude',
    'latitude',
    'phone_2_norm',
]

In [8]:
# Keep only the feature columns from each data set.
train = db_teach.loc[:, COL_FACTORS]
test = db_test.loc[:, COL_FACTORS]

In [9]:
# Convert every feature column to numbers; values that cannot be parsed become NaN.
train = train.apply(lambda column: pd.to_numeric(column, errors="coerce"))
test = test.apply(lambda column: pd.to_numeric(column, errors="coerce"))

In [10]:
# Sentinel written in place of missing / unparseable feature values.
replace_val = -9999
train, test = (frame.fillna(replace_val) for frame in (train, test))

In [11]:
# DataFrame copies of the features plus integer targets, used by the
# evaluation and explanation cells further down.
X_train, X_test = train.copy(), test.copy()
y_train = db_teach['status'].astype(int)
y_test = db_test['status'].astype(int)

In [12]:
# Plain NumPy matrices for fitting. np.asarray (unlike `.values`) also accepts
# an array, so re-running this cell does not fail.
train = np.asarray(train)
test = np.asarray(test)
# The CSVs are loaded with dtype=str, so `status` holds text; cast it to int so
# the training label matches y_train / y_test.
label = db_teach.status.astype(int)

In [13]:
# Build the project analyzer from the teach/test frames and the white list,
# then take its XGBoost weights (passed to `model.fit` as `sample_weight`).
# NOTE(review): the disabled line below looks like a switch for running
# without a white list, but the later `white.ID` lookup would fail with
# `white = None` - confirm before re-enabling.
# white = None
analyzer_prediction =  AnalyzerPrediction(db_teach, db_test, white)
weight = analyzer_prediction.get_xgb_weight()


In [22]:
# Flag each test row whose id is on the white list (1 = white-listed, 0 = not).
db_test['is_white'] = db_test.id.isin(white.ID).astype(int)
is_white = db_test['is_white'].to_numpy()
# Stored on the class so predict_proba_with_wl can read it through `self`.
XGBClassifier.is_white = is_white

def predict_proba_with_wl(self, data, ntree_limit=None, validate_features=True):
    """Predict class probabilities, then zero out the white-listed rows.

    Calls ``self.predict_proba`` and overrides the rows flagged in
    ``self.is_white``. Flags are matched to rows by position: row ``i`` of the
    prediction uses ``self.is_white[i]``. When ``data`` has fewer rows than
    there are flags, only the leading flags are used.

    Parameters
    ----------
    data :
        Feature rows, forwarded unchanged to ``self.predict_proba``.
    ntree_limit, validate_features :
        Forwarded unchanged to ``self.predict_proba``.

    Returns
    -------
    numpy.ndarray
        For ``objective == "multi:softprob"``: the probability matrix with
        every white-listed row set to zeros (such rows no longer sum to 1).
        Otherwise: an ``(n_rows, 2)`` array ``[P(class 0), P(class 1)]`` where
        white-listed rows are ``[1.0, 0.0]``.

    Raises
    ------
    IndexError
        If ``self.is_white`` holds fewer flags than there are predicted rows.
    """
    class_probs = np.asarray(
        self.predict_proba(data, ntree_limit=ntree_limit, validate_features=validate_features)
    )
    n_rows = len(class_probs)

    # Positional mask for the rows being scored.
    white_mask = np.asarray(self.is_white)[:n_rows].astype(bool)
    if len(white_mask) < n_rows:
        raise IndexError(
            "is_white has %d flags but %d rows were predicted" % (len(white_mask), n_rows)
        )

    if self.objective == "multi:softprob":
        # Broadcast the row mask across the class columns. Mixing scalar zeros
        # with row vectors in a list would yield a ragged, unusable array.
        return np.where(white_mask[:, np.newaxis], 0.0, class_probs)

    # Binary case: force P(class 1) to zero for white-listed rows and rebuild
    # the two-column layout that predict_proba returns.
    classone_probs = np.where(white_mask, 0.0, class_probs[:, 1])
    classzero_probs = 1.0 - classone_probs
    return np.vstack((classzero_probs, classone_probs)).transpose()

# Attach the function to the class so that classifier instances expose it as a method.
XGBClassifier.predict_proba_with_wl = predict_proba_with_wl    

In [15]:
# Hyper-parameters handed to the classifier constructor.
config = dict(max_depth=3, learning_rate=0.2, n_estimators=80)
model = xgb.XGBClassifier(**config)

In [16]:
# Train on the NumPy feature matrix; `weight` comes from
# AnalyzerPrediction.get_xgb_weight() and is passed as the sample weights.
model.fit(train, label, eval_metric = 'auc', sample_weight=weight)

XGBClassifier(learning_rate=0.2, n_estimators=80)

In [17]:
# XGBClassifier is extended with a new method, `predict_proba_with_wl`, which applies the white list to the predicted probabilities.

In [18]:
from interpret import show
from interpret.perf import ROC
# ROC performance of the model's raw probabilities on the test set.
blackbox_perf = ROC(model.predict_proba).explain_perf(X_test, y_test, name='XGBoost')
show(blackbox_perf)

In [23]:
# Same ROC evaluation, scored with the white-list-aware probabilities.
# The white-list flags are matched to rows by position and were built from
# db_test, so X_test must keep db_test's row order.
model_perf_wl = ROC(model.predict_proba_with_wl).explain_perf(X_test, y_test, name='XGBoost with apply WL')
show(model_perf_wl)

In [25]:
from interpret.blackbox import LimeTabular
from interpret import show


# LIME explainer over the model's probabilities, using the training features as its data.
lime = LimeTabular(predict_fn=model.predict_proba, data=X_train, random_state=1)

# Explain a single test row (row position n); its true label is passed alongside.
n = 29276
lime_local = lime.explain_local(X_test[n:n+1], y_test[n:n+1], name='LIME')
show(lime_local)

In [27]:
from interpret.blackbox import ShapKernel
import numpy as np
feature_names = COL_FACTORS
n = 29276  # same test-row position as the LIME cell
# Background data for the explainer: one row holding the per-column median of
# the training features.
background_val = np.median(X_train, axis=0).reshape(1, -1)
shap = ShapKernel(predict_fn=model.predict_proba, data=background_val, feature_names=feature_names)
# Explain the single test row at position n, passing its label as well.
shap_local = shap.explain_local(X_test[n:n+1], y_test[n:n+1], name='SHAP')
show(shap_local)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [28]:
from interpret.blackbox import MorrisSensitivity

# Global Morris sensitivity explanation of the model's probabilities, built on
# the training features.
sensitivity = MorrisSensitivity(predict_fn=model.predict_proba, data=X_train)
sensitivity_global = sensitivity.explain_global(name="Global Sensitivity")

show(sensitivity_global)

In [30]:
from interpret.blackbox import PartialDependence

# Global partial-dependence explanation of the model's probabilities, built on
# the training features.
pdp = PartialDependence(predict_fn=model.predict_proba, data=X_train)
pdp_global = pdp.explain_global(name='Partial Dependence')

show(pdp_global)

In [279]:
# Show the collected explanations together in a single view.
# NOTE(review): `model_perf_wl` (the white-list ROC) is not part of this list.
show([blackbox_perf, lime_local, shap_local, sensitivity_global, pdp_global])