In [None]:
import pandas as pd
import pmlb
import altair as alt
import numpy as np
import os
import pickle
import joblib

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [None]:
import shap
from lime import lime_tabular
import xgboost 

## Data Preparation

In [None]:
# Fetch the 'pima' dataset through pmlb; later cells read its `target` column as the class label.
df = pmlb.fetch_data('pima')

In [None]:
# A value of 0 in these columns is treated as a missing measurement and is
# overwritten with the median of that feature within the row's target class.
# NOTE(review): the class median is taken over every row of the class, i.e. the
# 0 placeholders are still included when the median is computed — confirm this
# is intended before regenerating any cached model / explanation files.
imputeFeatures = [
    'plasma glucose',
    'Diastolic blood pressure',
    'Triceps skin fold thickness',
    'Body mass index',
    '2-Hour serum insulin',
]
for feature in imputeFeatures:
    for label in (0, 1):
        in_class = df.target == label
        class_median = df.loc[in_class, feature].median()
        df.loc[in_class & (df[feature] == 0), feature] = class_median

In [None]:
# Separate the class label from the predictors, then hold out 25% of the rows
# for testing (fixed seed so the split is reproducible).
Y = df['target']
X = df.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

## Machine Learning Modelling

In [None]:
# When True, the next cell unpickles the model from `model_filename` instead of training;
# when False, it trains a new model and writes it to that path.
Load_Trained_Model = True 
# Path (relative to the working directory) of the pickled model.
model_filename = 'SavedWeights/model.sav'

In [None]:
# Resample the training split with SMOTE. The sampler gets its own name so it
# does not rebind `os`, which the notebook imports as the standard-library module.
smote = SMOTE(random_state=42)
# `to_numpy()` hands SMOTE a plain 1-D array (Series.ravel() is deprecated in pandas).
Aug_X, Aug_Y = smote.fit_resample(X_train, y_train.to_numpy())

if Load_Trained_Model:
    # NOTE(security): unpickling executes arbitrary code; only load files you trust.
    with open(model_filename, 'rb') as model_file:
        model = pickle.load(model_file)
else:  # train the model on the resampled data and cache it
    model = xgboost.XGBClassifier(random_state = 42)
    model.fit(Aug_X, Aug_Y)
    # Context manager guarantees the file is flushed and closed.
    with open(model_filename, 'wb') as model_file:
        pickle.dump(model, model_file)

In [None]:
# Predict once per split; `pred_test` is reused by later cells.
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

# accuracy_score takes (y_true, y_pred); the score itself does not depend on the order.
print("train accuracy: ", accuracy_score(y_train, pred_train))
print("test accuracy: ", accuracy_score(y_test, pred_test))

In [None]:
# Test features alongside the model's prediction, the true label, and a flag
# telling whether the two agree.
df_test = X_test.assign(
    prediction=pred_test,
    target=y_test,
    correct=lambda frame: frame['prediction'] == frame['target'],
)

In [None]:
# Store column 0 of predict_proba (the class-0 probability) for every row of X on df.
# NOTE: this adds a column to df after X was derived from it, so re-running the
# split cell afterwards would carry this column into X.
df['healthy_prediction_proba'] = model.predict_proba(X)[:, 0]

## Confusion Matrix



In [None]:
# Count test rows per (actual, predicted) class pair; the count is read from the `size` column below.
cm = df_test.groupby(['target', 'prediction'], as_index=False).size()

In [None]:
# Relabel the class codes for display. The mapping is restricted to the two
# label columns: a frame-wide replace would also rewrite the `size` counts,
# turning a cell count of exactly 1 into the string 'diabetic'.
class_names = {0: 'healthy', 1: 'diabetic'}
cm_rc = cm.replace({'target': class_names, 'prediction': class_names})

# Shared x/y layout: predicted class on x, actual class on y.
base = alt.Chart(cm_rc).encode(
    x=alt.X('prediction:N', title='predicted class'),
    y=alt.Y('target:N', title='actual class'),
).properties(
    width={'step': 100},
    height={'step': 100}
)

# One square per cell, sized and shaded by the number of rows in that cell.
rect = base.mark_square().encode(
    color=alt.Color('prediction', legend=None),
    opacity=alt.Opacity('size', legend=None),
    size='size:Q'
)

# Overlay the raw count as text.
text = base.mark_text().encode(
    text='size',
)

rect + text

## Calculating SHAP and LIME Weights

In [None]:
# When a flag is True the matching cell below unpickles cached values from disk;
# when False it recomputes them and overwrites the cache file(s).
Load_Shap_Values = True
Load_Lime_Values = False
# Cache locations, relative to the working directory.
shap_filename = 'SavedWeights/shapValue.pkl'
lime_value_filename = 'SavedWeights/limeValue.pkl'
lime_expData_filename = 'SavedWeights/limeData.pkl'


### Extract SHAP values 


In [None]:
# Extract SHAP values
if Load_Shap_Values:  # reuse the cached explanation
    # NOTE(security): unpickling executes arbitrary code; only load files you trust.
    with open(shap_filename, 'rb') as shap_file:
        shap_values = pickle.load(shap_file)
else:  # compute the shap values
    # Build the explainer around the model's predict function, with X as its second argument
    explainer = shap.Explainer(model.predict, X)
    # Calculates the SHAP values - It takes some time
    shap_values = explainer(X)
    # Context manager guarantees the cache file is flushed and closed.
    with open(shap_filename, 'wb') as shap_file:
        pickle.dump(shap_values, shap_file)

In [None]:
# Embed the shap values into a dataframe with one column per input feature.
# The frame gets a default integer index, so rows correspond to rows of X by position.
feature_names = X.columns
shap_df = pd.DataFrame(shap_values.values, columns = feature_names)

### Extract LIME values


In [None]:
def get_weights(exp):
    """Return the explanation weights for label 1, ordered by feature index.

    ``exp.as_map()[1]`` is read as a sequence of ``(feature_index, weight)``
    pairs. The pairs are sorted by feature index and only the weights are
    returned, so position ``k`` of the result belongs to feature ``k``.
    """
    pairs_by_feature = sorted(exp.as_map()[1], key=lambda pair: pair[0])
    return [pair[1] for pair in pairs_by_feature]

In [None]:
if Load_Lime_Values:  # reuse the cached explanations
    # NOTE(security): unpickling executes arbitrary code; only load files you trust.
    with open(lime_value_filename, 'rb') as lime_file:
        lime_df = pickle.load(lime_file)
    with open(lime_expData_filename, 'rb') as lime_file:
        exp_df = pickle.load(lime_file)
else:
    # Materialise the feature matrix once instead of rebuilding it on every iteration.
    X_values = np.array(X)
    lime_explainer = lime_tabular.LimeTabularExplainer(
        training_data=X_values,
        feature_names=X.columns,
        mode='classification',
        kernel_width = 0.7,
        random_state = 42
    )
    # Class-1 probability for every row, computed in a single batch rather than
    # one predict_proba call per row inside the loop.
    model_pred_proba = model.predict_proba(X)[:, 1]

    # Get all the weights of all instances from LIME explanation object
    weights = []
    list_exp_data = []
    for i in range(X_values.shape[0]):
        exp = lime_explainer.explain_instance(
            data_row=X_values[i],
            num_samples = 700,
            predict_fn=model.predict_proba
        )
        # Get weights
        weights.append(get_weights(exp))
        # np.ravel(...)[0] stores the local prediction as a single value rather than
        # an array-like cell (assumes lime returns a length-1 array here — confirm
        # against the installed lime version).
        local_pred = np.ravel(exp.local_pred)[0]
        list_exp_data.append([exp.score, exp.intercept[1], local_pred, model_pred_proba[i]])

    # Create DataFrame
    lime_df = pd.DataFrame(data=weights, columns=X.columns)
    exp_df = pd.DataFrame(data=list_exp_data, columns=['score','intercept','local_pred', 'model_pred'])
    # Context managers guarantee the cache files are flushed and closed.
    with open(lime_value_filename, 'wb') as lime_file:
        pickle.dump(lime_df, lime_file)
    with open(lime_expData_filename, 'wb') as lime_file:
        pickle.dump(exp_df, lime_file)
    

In [None]:
# Rows where `local_pred` is above 0.5 while `model_pred` is below 0.5.
exp_df[(exp_df['local_pred'] > 0.5) & (exp_df['model_pred'] < 0.5)]

In [None]:
# Rows where `local_pred` is below 0.5 while `model_pred` is above 0.5.
exp_df[(exp_df['local_pred'] < 0.5) & (exp_df['model_pred'] > 0.5)]

## Explanation Analysis: Overall Feature Importance Weights

In [None]:
def _mean_abs_chart(weights_df, axis_title, bar_colour):
    """Return (summary frame, bar chart) of the mean absolute weight per feature."""
    summary = (
        weights_df.abs()
        .mean()
        .rename_axis('feature')
        .reset_index(name='value')
    )
    chart = alt.Chart(summary).mark_bar().encode(
        y=alt.Y('feature', sort='-x'),
        x=alt.X('value', title=axis_title),
        color=alt.value(bar_colour),
    )
    return summary, chart


mean_shap, shap_mean_chart = _mean_abs_chart(shap_df, 'mean absolute shap value', 'purple')
mean_lime, lime_mean_chart = _mean_abs_chart(lime_df, 'mean absolute lime value', 'orange')

# SHAP on the left, LIME on the right.
shap_mean_chart | lime_mean_chart

## Explanation Analysis: Instance Level

### Break down the explanation weights per feature value

In [None]:
# Long format: one row per (instance, feature), keeping the instance index.
feature_important_weights = X.melt(ignore_index=False).set_axis(
    ['feature', 'feature_value'], axis=1
)

# Melt both explanation tables the same way so their rows line up with the
# feature table above.
shapley_values = shap_df.melt(ignore_index=False)
lime_weights = lime_df.melt(ignore_index=False)

# shap value and lime weight for each feature value
feature_important_weights['shap_value'] = shapley_values['value']
feature_important_weights['lime_weights'] = lime_weights['value']

In [None]:
def plot_Shap_contribution(values):
    """Bar chart of the SHAP value of each feature for one instance.

    Parameters
    ----------
    values : pandas.DataFrame
        Rows with the columns ``feature``, ``feature_value`` and ``shap_value``.
        The frame is copied, so the caller's data is not modified.

    Returns
    -------
    alt.Chart
        One horizontal bar per ``"<feature> = <value>"`` label; negative
        contributions are crimson, positive ones steelblue.
    """
    values = values.copy()

    values['is_positive'] = values['shap_value'] > 0
    values['feature_and_value'] = values['feature'] + ' = ' + values['feature_value'].astype(str)

    # Pin the domain so each colour always means the same sign. Without it the
    # domain is inferred from the data, and an instance whose bars all share one
    # sign would be drawn in the first range colour whatever that sign is.
    sign_scale = alt.Scale(domain=[False, True], range=['crimson', 'steelblue'])

    bars = alt.Chart(values).mark_bar().encode(
        y=alt.Y('feature_and_value'),
        x='shap_value',
        color=alt.Color('is_positive', legend=None, scale=sign_scale)
    )

    return bars

In [None]:
def plot_lime_contribution(values):
    """Bar chart of the LIME weight of each feature for one instance.

    Parameters
    ----------
    values : pandas.DataFrame
        Rows with the columns ``feature``, ``feature_value`` and ``lime_weights``.
        The frame is copied, so the caller's data is not modified.

    Returns
    -------
    alt.Chart
        One horizontal bar per ``"<feature> = <value>"`` label; negative
        weights are crimson, positive ones steelblue.
    """
    values = values.copy()

    values['is_positive'] = values['lime_weights'] > 0
    values['feature_and_value'] = values['feature'] + ' = ' + values['feature_value'].astype(str)

    # Pin the domain so each colour always means the same sign. Without it the
    # domain is inferred from the data, and an instance whose bars all share one
    # sign would be drawn in the first range colour whatever that sign is.
    sign_scale = alt.Scale(domain=[False, True], range=['crimson', 'steelblue'])

    bars = alt.Chart(values).mark_bar().encode(
        y=alt.Y('feature_and_value'),
        x='lime_weights',
        color=alt.Color('is_positive', legend=None, scale=sign_scale)
    )

    return bars

In [None]:
# Feature attribution for a single instance: SHAP chart on the left, LIME chart on the right.
# `i` is an index label of feature_important_weights; `.loc[i]` selects the rows carrying that label.
i = 725
plot_Shap_contribution(feature_important_weights.loc[i]) | plot_lime_contribution(feature_important_weights.loc[i]) 

In [None]:
# Display the per-instance LIME summary table (score, intercept, local_pred, model_pred when freshly computed).
exp_df

In [None]:
# Summary Strip Plot