In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import DBSCAN

from src.baseline_experiments import *

from mountaineer import Mountaineer
from gale import create_mapper, bootstrap_mapper_params

## Explanation functions

In [3]:
from lime.lime_tabular import LimeTabularExplainer
from shap import KernelExplainer

def run_lime(X, model, num_features=4, num_samples=100):
    """Compute a LIME attribution vector for every row of ``X``.

    A single ``LimeTabularExplainer`` is built with ``X`` as its training
    data (continuous features are not discretized, seed fixed to 2020) and
    then asked to explain each row in turn using ``model.predict_proba``.

    Parameters
    ----------
    X : 2-D array
        Rows are the instances to explain; ``X.shape[1]`` sets the width of
        each attribution vector.
    model : object exposing ``predict_proba``
        Classifier whose probabilities are explained.
    num_features : int, default 4
        Forwarded to ``explain_instance``.
    num_samples : int, default 100
        Forwarded to ``explain_instance``.

    Returns
    -------
    numpy.ndarray
        One row per instance; column ``j`` holds the weight LIME reported for
        feature ``j``, or 0 when LIME reported nothing for that feature.
    """
    lime_explainer = LimeTabularExplainer(X, discretize_continuous=False, random_state=2020)
    n_columns = X.shape[1]

    attributions = []
    for row in X:
        explanation = lime_explainer.explain_instance(
            row,
            model.predict_proba,
            num_features=num_features,
            num_samples=num_samples,
        )
        weights = [0] * n_columns
        # The feature identifier is parsed with int(), so this relies on LIME
        # naming features by their column index.
        # NOTE(review): presumably holds because no feature_names are given
        # and discretization is off — confirm against the LIME version in use.
        for feature_name, weight in explanation.as_list():
            weights[int(feature_name)] = weight
        attributions.append(weights)

    return np.array(attributions)

def run_shap(X, model):
    """Compute Kernel SHAP values for every row of ``X``.

    The explained function is column 1 of ``model.predict_proba``. ``X`` is
    handed to ``KernelExplainer`` as its data argument and is also the set of
    rows being explained.

    Parameters
    ----------
    X : array
        Passed both to the ``KernelExplainer`` constructor and to
        ``shap_values``.
    model : object exposing ``predict_proba``
        Classifier whose column-1 probability is explained.

    Returns
    -------
    Whatever ``KernelExplainer.shap_values`` returns for ``X``.
    """
    def positive_class_proba(samples):
        # Keep only the second probability column.
        return model.predict_proba(samples)[:, 1]

    kernel_explainer = KernelExplainer(positive_class_proba, X)
    # NOTE(review): confirm that the installed shap version actually honours
    # the `random_state` keyword in `shap_values`.
    return kernel_explainer.shap_values(X, random_state=2020)

## Generate dataset

In [4]:
# Four 2-D Gaussian blobs with identity covariance, 250 points each, one per
# quadrant. Diagonally opposite blobs share a label, so the classes form an
# XOR-like layout.
np.random.seed(2020)

# (blob centre, class label), in the order the samples are drawn. The order
# matters: every draw consumes the global NumPy random stream.
blob_specs = [
    ([-1.5, -1.5], 1),
    ([-1.5, 1.5], 0),
    ([1.5, 1.5], 1),
    ([1.5, -1.5], 0),
]

blob_points = []
blob_labels = []
for center, label in blob_specs:
    blob_points.append(np.random.multivariate_normal(center, [[1, 0], [0, 1]], size=250))
    blob_labels.append(np.repeat(label, 250))

# Keep the per-blob names available alongside the stacked dataset.
X1, X2, X3, X4 = blob_points
y1, y2, y3, y4 = blob_labels

X = np.concatenate(blob_points)
y = np.concatenate(blob_labels)

## Train models and compute explanations

In [5]:
# Fit the MLP on the full dataset (no hold-out split is made here).
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 64),
    batch_size=16,
    random_state=2020,
    max_iter=1000,
).fit(X, y)

# Column 1 of predict_proba, kept both flat (`predictions`) and as a column
# vector (`function_mlp`). `predictions` is reassigned by the random-forest
# cell, so prefer `function_mlp` when the MLP values are needed later.
predictions = mlp.predict_proba(X)[:, 1]
function_mlp = predictions[:, np.newaxis]

# LIME first, then SHAP, for every row of X.
exp_lime_mlp, exp_shap_mlp = run_lime(X, mlp), run_shap(X, mlp)

Using 1000 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
100%|██████████| 1000/1000 [02:42<00:00,  6.17it/s]


In [6]:
# Fit the random forest on the full dataset (no hold-out split is made here).
rf = RandomForestClassifier(random_state=2020).fit(X, y)

# Column 1 of predict_proba, kept both flat (`predictions`) and as a column
# vector (`function_rf`). This reassigns the `predictions` name that the MLP
# cell also sets.
predictions = rf.predict_proba(X)[:, 1]
function_rf = predictions[:, np.newaxis]

# LIME first, then SHAP, for every row of X.
exp_lime_rf, exp_shap_rf = run_lime(X, rf), run_shap(X, rf)

Using 1000 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
100%|██████████| 1000/1000 [02:37<00:00,  6.34it/s]


## Creating the mapper outputs

In [7]:
# Candidate grids handed to bootstrap_mapper_params.
resolutions = [5, 10, 15, 20, 25]
gains = [0.3, 0.4]

# For each mapper: (point cloud, lens). The point cloud is either the raw
# features or an explanation matrix; the lens is the matching model's
# column-vector of predicted probabilities. Insertion order is the order in
# which the bootstrap runs are executed.
bootstrap_inputs = {
    'Original_MLP': (X, function_mlp),
    'LIME_MLP':     (exp_lime_mlp, function_mlp),
    'SHAP_MLP':     (exp_shap_mlp, function_mlp),
    'Original_RF':  (X, function_rf),
    'LIME_RF':      (exp_lime_rf, function_rf),
    'SHAP_RF':      (exp_shap_rf, function_rf),
}

# One bootstrap run per entry, each with a fresh DBSCAN instance. A previous
# unused `function` array built from the global `predictions` was dropped: it
# was never read and depended on whichever model cell had run last.
params_boots = {
    name: bootstrap_mapper_params(points, lens, resolutions, gains, distances=[0.1],
                                  clusterer=DBSCAN(), ci=0.95)
    for name, (points, lens) in bootstrap_inputs.items()
}

In [8]:
# Report the parameters stored for each mapper, one line per entry.
for mode, selected_params in params_boots.items():
    print(f"Params {mode}: {selected_params}")

Params Original_MLP: {'stability': 0.21766294292707877, 'components': 5, 'resolution': 5, 'gain': 0.4, 'distance_threshold': 0.1}
Params LIME_MLP: {'stability': 0.0024446653670870955, 'components': 1, 'resolution': 20, 'gain': 0.4, 'distance_threshold': 0.1}
Params SHAP_MLP: {'stability': 0.0024446653670870955, 'components': 1, 'resolution': 20, 'gain': 0.4, 'distance_threshold': 0.1}
Params Original_RF: {'stability': 0.1117499999999998, 'components': 13, 'resolution': 10, 'gain': 0.4, 'distance_threshold': 0.1}
Params LIME_RF: {'stability': 0.007034974560314086, 'components': 1, 'resolution': 5, 'gain': 0.4, 'distance_threshold': 0.1}
Params SHAP_RF: {'stability': 0.007034974560314086, 'components': 1, 'resolution': 5, 'gain': 0.4, 'distance_threshold': 0.1}


In [15]:
# (mapper name, point cloud, lens, params_boots entry supplying resolution/gain)
# The *_fix variants rebuild the RF mappers with the parameters stored for
# LIME_MLP instead of their own.
mapper_specs = [
    ('Original_MLP',    X,            function_mlp, 'Original_MLP'),
    ('LIME_MLP',        exp_lime_mlp, function_mlp, 'LIME_MLP'),
    ('SHAP_MLP',        exp_shap_mlp, function_mlp, 'SHAP_MLP'),
    ('Original_RF',     X,            function_rf,  'Original_RF'),
    ('LIME_RF',         exp_lime_rf,  function_rf,  'LIME_RF'),
    ('SHAP_RF',         exp_shap_rf,  function_rf,  'SHAP_RF'),
    ('Original_RF_fix', X,            function_rf,  'LIME_MLP'),
    ('LIME_RF_fix',     exp_lime_rf,  function_rf,  'LIME_MLP'),
    ('SHAP_RF_fix',     exp_shap_rf,  function_rf,  'LIME_MLP'),
]

# Build every mapper in the order listed, each with its own DBSCAN instance
# and a fixed distance threshold of 0.1.
mappers = {
    name: create_mapper(points, lens,
                        resolution=params_boots[params_key]['resolution'],
                        gain=params_boots[params_key]['gain'],
                        dist_thresh=0.1, clusterer=DBSCAN())
    for name, points, lens, params_key in mapper_specs
}

## Visualize

In [16]:
from IPython.display import HTML, display

# Inject a CSS rule that forces the `.container` element to 1920px wide.
notebook_width_css = "<style>.container { width:1920px !important; }</style>"
display(HTML(notebook_width_css))

In [17]:
# Mapper outputs in display order: MLP, RF, then the RF "_fix" variants.
mapper_outputs = [mappers['Original_MLP'], mappers['LIME_MLP'], mappers['SHAP_MLP'],
                  mappers['Original_RF'], mappers['LIME_RF'], mappers['SHAP_RF'],
                  mappers['Original_RF_fix'], mappers['LIME_RF_fix'], mappers['SHAP_RF_fix']]

# Point clouds matching mapper_outputs position by position, converted to
# plain nested lists.
explanation_vectors = [X, exp_lime_mlp, exp_shap_mlp,
                       X, exp_lime_rf, exp_shap_rf,
                       X, exp_lime_rf, exp_shap_rf]
explanation_list = [expl.tolist() for expl in explanation_vectors]

# Display labels, again aligned position by position with mapper_outputs.
expl_labels = ['Features_MLP', 'LIME_MLP', 'SHAP_MLP',
               'Features_RF', 'LIME_RF', 'SHAP_RF',
               'Features_RF_fix', 'LIME_RF_fix', 'SHAP_RF_fix']
class_labels = {1: 'Red', 0: "Blue"}

# Values used for the default colouring (the lens, i.e. predicted
# probabilities). These are taken explicitly from the random-forest lens
# rather than the global `predictions`, which holds whichever model cell ran
# last. On a clean top-to-bottom run that is the random forest, so the values
# are unchanged.
# NOTE(review): the first three mappers are MLP-based; confirm that colouring
# everything by the RF probabilities is intended.
predicted_prob = function_rf.ravel().copy()

# Column names of the feature dataframe.
column_names = pd.Series(["Feat1", "Feat2"])

In [18]:
# Launch the Mountaineer view over all mappers. A second, never-used
# `Mountaineer()` instance (`test`) that preceded this one has been removed.
# NOTE(review): the original trailing comment mentioned a projection method
# ('UMAP' or 'TSNE'), but no such argument is passed here — confirm against
# Mountaineer.visualize whether one should be.
mnt = Mountaineer()
mnt.visualize(X, y, predicted_prob, explanation_list, mapper_outputs, column_names, expl_labels, class_labels)
