In [1]:
import numpy as np
import pandas as pd

from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_circles
from sklearn.metrics import accuracy_score, f1_score

from src.baseline_experiments import *

from mountaineer import Mountaineer
from gale import create_mapper, bootstrap_mapper_params

## Explanation functions

In [2]:
from lime.lime_tabular import LimeTabularExplainer
from shap import KernelExplainer

def run_lime(X, model, num_features=4, num_samples=100):
    """Compute LIME feature weights for every row of ``X``.

    A single ``LimeTabularExplainer`` is fitted on ``X`` (continuous features
    are not discretized, seed fixed to 2020) and then asked to explain each
    row of ``X`` against ``model.predict_proba``.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array of samples; used both to fit the explainer and as the set
        of instances to explain.
    model : object
        Any object exposing ``predict_proba``.
    num_features : int, default 4
        Forwarded to ``explain_instance``; features that LIME does not report
        keep a weight of 0.
    num_samples : int, default 100
        Forwarded to ``explain_instance``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X), X.shape[1])`` where entry ``[i, j]`` is the
        weight LIME assigned to feature ``j`` for sample ``i`` (label 1).
    """
    n_features = X.shape[1]
    explainer = LimeTabularExplainer(X, discretize_continuous=False, random_state=2020)

    # Preallocate so unreported features stay at 0 and the result is always 2-D.
    lime_exp = np.zeros((len(X), n_features))
    for row, x in enumerate(X):
        exp = explainer.explain_instance(
            x, model.predict_proba, num_features=num_features, num_samples=num_samples
        )
        # as_map() exposes (feature_index, weight) pairs keyed by label, so the
        # column is taken from the index itself instead of parsing it back out
        # of the feature's display name (which only works while the names
        # happen to be "0", "1", ...). Label 1 matches as_list()'s default.
        for feature_idx, weight in exp.as_map()[1]:
            lime_exp[row, feature_idx] = weight
    return lime_exp

def run_shap(X, model):
    """Compute Kernel SHAP values for the class-1 probability of ``model``.

    ``X`` is handed to ``KernelExplainer`` as its data argument and is also
    the set of samples whose SHAP values are returned.

    Parameters
    ----------
    X : array-like
        Samples to explain (also used to build the explainer).
    model : object
        Any object exposing ``predict_proba`` whose output has a column 1.

    Returns
    -------
    Whatever ``KernelExplainer.shap_values`` returns for ``X``.
    """
    def positive_class_probability(batch):
        # Only column 1 of the probability matrix is explained.
        class_probabilities = model.predict_proba(batch)
        return class_probabilities[:, 1]

    kernel_explainer = KernelExplainer(positive_class_probability, X)
    # NOTE(review): `random_state` is forwarded as an extra keyword; whether
    # shap_values() actually honours it depends on the installed shap
    # version — confirm before relying on it for reproducibility.
    return kernel_explainer.shap_values(X, random_state=2020)

## Generate dataset

In [3]:
# One seed drives both NumPy's global RNG and the dataset generator so the
# notebook produces the same points on every run.
SEED = 2020
np.random.seed(SEED)

# 1000 noisy samples from scikit-learn's "circles" toy generator.
X, y = make_circles(
    n_samples=1000,
    factor=0.5,
    noise=0.3,
    random_state=SEED,
)

In [11]:
# Display the generated feature matrix as a quick sanity check.
X

array([[-0.53817828,  0.18327441],
       [ 0.18124665, -0.55110768],
       [-0.25353628, -0.54940582],
       ...,
       [-0.46927203, -0.84022608],
       [ 0.27095977, -1.19966353],
       [-0.37505863, -1.13471659]])

## Train model and compute explanations

In [4]:
# Fit a two-hidden-layer MLP (64 units each) on the whole dataset.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 64),
    batch_size=16,
    max_iter=1000,
    random_state=2020,
).fit(X, y)

# Class-1 probability per sample: once as a flat vector, once as a single
# column (the form passed to the mapper helpers further down).
predictions = mlp.predict_proba(X)[:, 1]
function_mlp = predictions.reshape(-1, 1)

# Per-sample explanations for the fitted model. LIME runs before SHAP; keep
# this order, since SHAP may draw from the global NumPy RNG.
exp_lime_mlp = run_lime(X, mlp)
exp_shap_mlp = run_shap(X, mlp)

Using 1000 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
100%|██████████| 1000/1000 [02:50<00:00,  5.88it/s]


In [5]:
# Predict once and reuse the labels for both metrics instead of running a
# second, identical forward pass over X.
# NOTE: X, y are the same samples the model was fitted on earlier in the
# notebook, so these are in-sample scores rather than held-out estimates.
y_pred = mlp.predict(X)
print(f"Accuracy: {accuracy_score(y, y_pred)}")
print(f"F1: {f1_score(y, y_pred)}")

Accuracy: 0.785
F1: 0.7785787847579814


## Creating the mapper outputs

In [6]:
# Candidate grids handed to the bootstrap parameter search.
resolutions = list(range(5, 30, 5))  # 5, 10, 15, 20, 25
gains = [0.1, 0.2, 0.3, 0.4, 0.5]
distances = [0.1, 0.2, 0.3, 0.4, 0.5]

# Flat copy of the predicted probabilities (not referenced again in this cell).
function = np.array([np.squeeze(prob) for prob in predictions])

# Search the grids on the original feature space, using the class-1
# probability column as the function.
params_boots = bootstrap_mapper_params(
    X,
    function_mlp,
    resolutions,
    gains,
    distances,
    ci=0.95,
    n=100,
)
print(f"Params for original feature space: {params_boots}")

Params for original feature space: {'stability': 0.0025369670342185735, 'components': 1, 'resolution': 25, 'gain': 0.3, 'distance_threshold': 0.3}


In [16]:
# Build one mapper per (resolution, gain, distance-threshold) combination,
# all on the LIME explanations. A feature-space mapper built from
# `params_boots` (labelled "Feature Space", with X as its vectors) used to
# seed these three lists; it is currently disabled, so they start empty.
grid_resolutions = (5, 10, 20)
grid_gains = (0.2, 0.3, 0.4)
grid_distances = (0.2, 0.3, 0.4)

mapper_outputs, explanation_vectors, labels = [], [], []

for res in grid_resolutions:
    for gain in grid_gains:
        for dist in grid_distances:
            # The three lists are kept index-aligned: label, vectors, mapper.
            labels.append(f"R{res}-G{gain}-D{dist}")
            explanation_vectors.append(exp_lime_mlp)
            mapper_outputs.append(
                create_mapper(
                    exp_lime_mlp,
                    function_mlp,
                    resolution=res,
                    gain=gain,
                    dist_thresh=dist,
                )
            )

## Visualize

In [17]:
from IPython.display import HTML, display

# Inject CSS that forces elements with class `container` to 1920px wide, so
# the visualization below has room to render.
wide_container_css = "<style>.container { width:1920px !important; }</style>"
display(HTML(wide_container_css))

In [18]:
# Convert every explanation matrix to plain nested lists.
explanation_list = [vectors.tolist() for vectors in explanation_vectors]

# Flat array of predicted class-1 probabilities.
predicted_prob = np.array([np.squeeze(prob) for prob in predictions])

# Column names for the two features. X is a plain array here, so the names
# are spelled out rather than read from X.columns.
column_names = pd.Series(["Feat1", "Feat2"])


In [19]:
#visualize
mnt = Mountaineer()
mnt.visualize(X, y, predicted_prob, explanation_list, mapper_outputs, column_names, labels) #Projection method- 'UMAP' or 'TSNE'