# Explainability Pipeline

In [None]:
import pandas as pd
import shap
from joblib import load

from sklearn.model_selection import train_test_split

from fraud.config.config import PARAMS
from fraud.config.features import FEATURES

from fraud.steps.DataFetchStep import DataFetchStep
from fraud.steps.PreProcessStep import PreProcessStep
from fraud.steps.ExplainerStep import ExplainerStep

In [None]:
# Load the serialized model object from 'imblearn-random-forest.joblib'
# (presumably the fitted imbalanced-learn random forest -- TODO confirm).
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
clf = load('imblearn-random-forest.joblib') 

In [None]:
# Configure the fetch step with the numerical and categorical feature columns.
data_fetch_step = DataFetchStep(
    FEATURES['numerical_columns'],
    FEATURES['categorical_columns'],
)

# Read the dataset from the input configured under PARAMS['input_data'].
input_source = PARAMS['input_data']
df = data_fetch_step.fetch_data(input_source)

In [None]:
# Assemble the feature matrix (numerical columns followed by categorical
# columns) and the label selected by FEATURES['label_column'].
feature_columns = FEATURES['numerical_columns'] + FEATURES['categorical_columns']
X = df[feature_columns]
y = df[FEATURES['label_column']]

# Hold out 20% of the rows as the test split. A fixed random_state makes the
# split -- and therefore the example predictions explained further down --
# reproducible across notebook runs.
# NOTE(review): the model in `clf` is loaded from disk, not trained here; if it
# was trained on a different split, some X_test rows may have been seen during
# training -- confirm the split matches the training pipeline.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Build the preprocessing step from the same column lists used to select X.
numerical_columns = FEATURES['numerical_columns']
categorical_columns = FEATURES['categorical_columns']
preprocess_step = PreProcessStep(numerical_columns, categorical_columns)

# Fit the step on the training split only, then apply the already-fitted step
# to the test split.
X_train = preprocess_step.fit_transform(X_train)
X_test = preprocess_step.transform(X_test)

In [None]:
# Wrap the loaded classifier `clf` in a SHAP TreeExplainer. The explainer is
# passed to ExplainerStep in the next cell, which uses it to calculate SHAP
# values.
explainer = shap.TreeExplainer(clf)

In [None]:
# Transformers held by the preprocessing step's internal pipeline.
fitted_transformers = preprocess_step._pipeline.named_transformers_

# Column names handed to the explainer step: the numerical columns followed by
# the encoded categorical columns (presumably matching the column order of the
# transformed X_train / X_test -- TODO confirm against PreProcessStep).
explained_columns = (
    preprocess_step._numerical_columns
    + preprocess_step._encoded_categorical_columns
)

explainer_step = ExplainerStep(explainer, fitted_transformers, explained_columns)

Make predictions and pass them to the explainer step object. Then use the explainer step object to show the most important features for one example each of a true positive (TP), false positive (FP), true negative (TN) and false negative (FN) prediction.

In [None]:
# Decision threshold: a prediction is labelled positive (1) only when its
# predicted probability is strictly greater than this value; otherwise it is
# labelled negative (0).
threshold=0.7

In [None]:
# Class probabilities for every row of the preprocessed test split.
y_pred = clf.predict_proba(X_test)

# One row per test example: the true label and the probability from column 1
# of predict_proba (assumes binary labels where clf.classes_[1] is the
# positive class -- TODO confirm). list(y_test) takes the label values in
# order and discards y_test's shuffled index, so df_pred gets a fresh 0..n-1
# index.
df_pred = pd.DataFrame({'true': list(y_test), 'proba': y_pred[:, 1]})

# Vectorized thresholding: 1 when proba is strictly above the threshold,
# otherwise 0 (replaces a row-by-row apply with the same result).
df_pred['pred'] = (df_pred['proba'] > threshold).astype('int64')

In [None]:
# Hand the prediction table (true label, probability, thresholded prediction)
# and the preprocessed test features to the explainer step, which reports the
# explanations (per the notebook text above: an example TP, FP, TN and FN).
explainer_step.report_tp_fp_tn_fs_explanations(df_pred, X_test)