# Testing  model with XAI methods 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
import calendar
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, accuracy_score
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder, PowerTransformer
from xgboost import XGBClassifier

## Defining model

In [3]:
TARGET_VALUE = 'y'

def get_training_data():
    df = pd.read_csv("datasource/bank_data_model.csv")
    return df.iloc[:,1:]

def get_validation_data():
    df =  pd.read_csv("datasource/bank_data_validate.csv")
    return df.iloc[:,1:]

def get_categorical_features(dataFrame): 
    return dataFrame.drop(columns=get_numerical_features(dataFrame)).columns.values

def get_numerical_features(dataFrame):
    return dataFrame.select_dtypes(include=np.number).columns.values

def get_numerical_features_transformer():
    class DayMonthTransformer(BaseEstimator,TransformerMixin):
        def fit(self, X, y=None):
            return self
        
        def transform(self, X):
            X_transformed = X.copy()
            month_num_dict = {month.lower(): index for index, month in enumerate(calendar.month_abbr) if month}
            X_transformed['month'] = X_transformed['month'].map(month_num_dict)
            X_transformed['year'] = np.repeat(2024, repeats= X_transformed.shape[0])
            X_transformed['day_of_the_year'] = pd.to_datetime(X_transformed[["year","month","day"]], format='%d/%m/%y', errors='coerce')
            X_transformed['day_of_the_year'] = X_transformed['day_of_the_year'].apply(lambda x: x.dayofyear)
            return X_transformed.drop(columns=["day", "month", "year"])
        
        def get_feature_names_out(self, input_features = None):
            return ['day_of_the_year']
        
    return ColumnTransformer(
        transformers=[
            ("duration", PowerTransformer(method="yeo-johnson"), ["duration"]),
            ("age", Pipeline( steps = [
                                ("age_power_transform", PowerTransformer(method="box-cox")),
                                ("age_function_transform", FunctionTransformer(lambda col : col + abs(np.min(col)), feature_names_out='one-to-one'))
                                ]
                            ),
            ["age"]),
            ("balance", Pipeline( steps = [
                                ("balance_function_transform_min", FunctionTransformer(lambda col : col + abs(np.min(col)),feature_names_out='one-to-one')),
                                ("balance_function_transform_log", FunctionTransformer(np.log1p, feature_names_out='one-to-one')),
                                ]
                            ),
            ["balance"]),
            ("day_of_the_year", DayMonthTransformer(), ["day","month"])
            ]
        )
                        
def get_categorical_features_transformer():
    poutcome_dict = {"failure":0, "other": 0.25, "unknown": 0.5, "success": 1}
    education_dict = {'unknown' : 0, 'secondary' : 0.5, 'tertiary' : 1, 'primary' : 0.25}
    maritial_dict = {"single" : 0, "divorced" : 0.5, "married"  : 1}
    return ColumnTransformer(
        transformers= [
            ("yes_no_variables",  OrdinalEncoder(), ["default", "housing", "loan"]),
            #("sorted_variables", Pipeline(
            #    steps=[
            #        ("ordinal", OrdinalEncoder()),
            #        ("min_max_scaler", MinMaxScaler())
            #    ]
            #),
            #["education", "marital", "poutcome"]),
            ("ordered1", FunctionTransformer(lambda col: col.replace(education_dict), feature_names_out='one-to-one'), ["education"]),
            ("ordered2", FunctionTransformer(lambda col: col.replace(poutcome_dict), feature_names_out='one-to-one'), ["poutcome"]),
            ("ordered3", FunctionTransformer(lambda col: col.replace(maritial_dict), feature_names_out='one-to-one'), ["marital"]),
            ("unsorted", OneHotEncoder(), ["job", "contact"]),
        ]
    )  

def get_features_transformer():
    return ColumnTransformer(
        transformers = [
            ("num", get_numerical_features_transformer(), ["duration","balance", "age", "day", "month"]),
            ("cat", get_categorical_features_transformer(), ["default", "housing", "loan", "education", "marital", "poutcome", "job", "contact"]),
            ("others", FunctionTransformer(feature_names_out='one-to-one'), ["campaign", "pdays","previous"])
        ]
    )

def get_estimator(weights):
    return XGBClassifier(booster= 'gbtree', 
                         eta= 0.10987358313376537,
                         max_delta_step= 1.446953199095078, 
                         max_depth= 4, 
                         min_child_weight= 1.666061864206202, 
                         objective= 'binary:logistic', 
                         scale_pos_weight= weights, 
                         tree_method= 'exact')
def get_model(weights):
    return Pipeline(
        steps = [
            ("preprocessor", get_features_transformer()),
            ("estimator", get_estimator(weights))
        ]
    )

def printMetrics(test, pred): 
    cm = confusion_matrix(test,pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])
    cm_display.plot()
    print(f"accuracy : {accuracy_score(test, pred)}")
    print(f"f1       : {f1_score(test, pred)}")
    print(f"precision: {precision_score(test, pred)}")
    print(f"recall   : {recall_score(test, pred)}")
    print(f"roc_auc  : {roc_auc_score(test,pred)}")

In [4]:
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split

df = get_training_data()
le = LabelEncoder()
df[TARGET_VALUE] = le.fit_transform(df[TARGET_VALUE])

X_train, X_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df[TARGET_VALUE],
    random_state=42,
)

y_train, y_test = X_train[TARGET_VALUE], X_test[TARGET_VALUE]
X_train, X_test = X_train.drop(columns=TARGET_VALUE), X_test.drop(columns=TARGET_VALUE)

In [5]:
positive = y_train.sum()
negative = y_train.count() - positive
clf = get_model(negative/positive)

In [6]:
clf.fit(X_train,y_train)

  ("ordered1", FunctionTransformer(lambda col: col.replace(education_dict), feature_names_out='one-to-one'), ["education"]),
  ("ordered2", FunctionTransformer(lambda col: col.replace(poutcome_dict), feature_names_out='one-to-one'), ["poutcome"]),
  ("ordered3", FunctionTransformer(lambda col: col.replace(maritial_dict), feature_names_out='one-to-one'), ["marital"]),


## Dalex

In [7]:
import dalex as dx


In [8]:
exp = dx.Explainer(clf, X_test, y_test)

Preparation of a new explainer is initiated

  -> data              : 724 rows 16 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 724 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x71a4c9d29bc0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.00027, mean = 0.217, max = 0.972
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.957, mean = -0.102, max = 0.999
  -> model_info        : package sklearn

A new explainer has been created!


  ("ordered1", FunctionTransformer(lambda col: col.replace(education_dict), feature_names_out='one-to-one'), ["education"]),
  ("ordered2", FunctionTransformer(lambda col: col.replace(poutcome_dict), feature_names_out='one-to-one'), ["poutcome"]),
  ("ordered3", FunctionTransformer(lambda col: col.replace(maritial_dict), feature_names_out='one-to-one'), ["marital"]),
  ("ordered1", FunctionTransformer(lambda col: col.replace(education_dict), feature_names_out='one-to-one'), ["education"]),
  ("ordered2", FunctionTransformer(lambda col: col.replace(poutcome_dict), feature_names_out='one-to-one'), ["poutcome"]),
  ("ordered3", FunctionTransformer(lambda col: col.replace(maritial_dict), feature_names_out='one-to-one'), ["marital"]),


In [9]:
mp = exp.model_performance(model_type = 'classification')
mp.result

Unnamed: 0,recall,precision,f1,accuracy,auc
XGBClassifier,0.73494,0.472868,0.575472,0.875691,0.911415


### ROC 

In [10]:
mp.plot(geom="roc")

In [11]:
mp.plot()

### Feature importance

In [12]:
vi_grouped = exp.model_parts(variable_groups = {
    'age': ['age'], 
    'job': ['job'], 
    'marital': ['marital'], 
    'education': ['education'], 
    'default': ['default'], 
    'balance': ['balance'], 
    'housing': ['housing'], 
    'loan': ['loan'], 
    'contact': ['contact'], 
    'date': ['day', 'month'], 
    'duration': ['duration'], 
    'campaign': ['campaign'], 
    'pdays': ['pdays'], 
    'previous': ['previous'], 
    'poutcome': ['poutcome']
}
)
vi_grouped.result


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd

Unnamed: 0,variable,dropout_loss,label
0,age,0.086193,XGBClassifier
1,marital,0.088214,XGBClassifier
2,campaign,0.088561,XGBClassifier
3,_full_model_,0.088585,XGBClassifier
4,default,0.088794,XGBClassifier
5,job,0.088846,XGBClassifier
6,housing,0.089239,XGBClassifier
7,previous,0.089638,XGBClassifier
8,education,0.090741,XGBClassifier
9,balance,0.091861,XGBClassifier


In [13]:
vi_grouped.plot()

### Aggregated Profiles

In [14]:
num_var = ['age', 'balance', 'duration', 'campaign', 'pdays',
       'previous']
pdp_num = exp.model_profile(type = 'partial', variables=num_var, label = "pdp", N = 500)

ale_num = exp.model_profile(type = 'accumulated', variables=num_var, label="ale", N = 500)


Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[20.  20.6 21.2 ... 78.8 79.4 80. ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_down

In [15]:
pdp_num.plot(ale_num)

In [16]:
cat_var = ['job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'poutcome']

pdp_cat = exp.model_profile(type = 'partial', variable_type='categorical',
                            variables = cat_var, label="pdp")

ale_cat = exp.model_profile(type = 'accumulated', variable_type='categorical',
                            variables = cat_var, label="ale")


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd

In [17]:
ale_cat.plot(pdp_cat)

### Testing predictions for specific cases

In [28]:
blue_collar_worker = pd.DataFrame(
    {   
        'age': [38],
        'job': ['blue-collar'],
        'maritial': ['married'],
        'education': ['tertiary'],
        'default': ['no'],
        'balance': [20000],
        'housing': ['yes'],
        'loan': ['yes'],
        'contact': ['cellular'],
        'day': [16],
        'month': ['may'],
        'duration': [185],
        'campaign': [1],
        'pdays': [330],
        'previous': [1],
        'poutcome': ['success']
    },
    index=["blue_collar_worker"],
)

student = pd.DataFrame(
    {   
        'age': [20],
        'job': ['student'],
        'maritial': ['single'],
        'education': ['tertiary'],
        'default': ['no'],
        'balance': [350],
        'housing': ['no'],
        'loan': ['no'],
        'contact': ['cellular'],
        'day': [16],
        'month': ['may'],
        'duration': [90],
        'campaign': [-1],
        'pdays': [0],
        'previous': [0],
        'poutcome': ['unknown']
    },
    index=["student"],
)


grandpa = pd.DataFrame(
    {   
        'age': [65],
        'job': ['retired'],
        'maritial': ['married'],
        'education': ['secondary'],
        'default': ['no'],
        'balance': [40000],
        'housing': ['no'],
        'loan': ['no'],
        'contact': ['cellular'],
        'day': [16],
        'month': ['may'],
        'duration': [380],
        'campaign': [-1],
        'pdays': [20],
        'previous': [0],
        'poutcome': ['unknown']
    },
    index=["grandpa"],
)

young_businessman = pd.DataFrame(
    {   
        'age': [20],
        'job': ['entrepreneur'],
        'maritial': ['single'],
        'education': ['secondary'],
        'default': ['no'],
        'balance': [70000],
        'housing': ['no'],
        'loan': ['no'],
        'contact': ['cellular'],
        'day': [1],
        'month': ['may'],
        'duration': [200],
        'campaign': [-1],
        'pdays': [20],
        'previous': [0],
        'poutcome': ['unknown']
    },
    index=["young_businessman"],
)


In [29]:
sh_blue_collar_worker = exp.predict_parts(blue_collar_worker, type="shap", B=50, label=blue_collar_worker.index[0])
sh_student = exp.predict_parts(student, type="shap", B=50, label=student.index[0])
sh_grandpa = exp.predict_parts(grandpa, type="shap", B=50, label=grandpa.index[0])
sh_young_businessman = exp.predict_parts(young_businessman, type="shap", B=50, label=young_businessman.index[0])




Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd

In [31]:
sh_blue_collar_worker.plot([sh_student, sh_grandpa, sh_young_businessman])