In [1]:
import sklearn, os
import numpy as np
import pandas as pd
import shap
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

In [2]:
def returnModel(model):
	"""Build an unfitted scikit-learn classifier from a short model name.

	Parameters
	----------
	model : str
		One of "KNN", "SVM", "RF" or "NN".

	Returns
	-------
	estimator
		An unfitted KNeighborsClassifier, SVC, RandomForestClassifier or
		MLPClassifier, configured with the hyper-parameters below.

	Raises
	------
	ValueError
		If ``model`` is not one of the supported names.
	"""
	# Every estimator is imported from its own submodule inside its branch.
	# Attribute access such as `sklearn.svm.SVC` is not guaranteed to work after
	# a bare `import sklearn`; it depends on the submodule having been loaded
	# by some other import.
	if model == "KNN":
		from sklearn.neighbors import KNeighborsClassifier
		return KNeighborsClassifier()
	if model == "SVM":
		from sklearn.svm import SVC
		# probability=True so that predict_proba is available on the fitted model.
		return SVC(kernel='linear', probability=True)
	if model == "RF":
		from sklearn.ensemble import RandomForestClassifier
		return RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0)
	if model == "NN":
		from sklearn.neural_network import MLPClassifier
		return MLPClassifier(solver='lbfgs', alpha=1e-1, hidden_layer_sizes=(5, 2), random_state=0)
	# Fail with a clear message instead of an UnboundLocalError on `return m`.
	raise ValueError("Unknown model %r; expected one of 'KNN', 'SVM', 'RF', 'NN'" % (model,))

In [3]:
def returnDataset(dataset):
	"""Load a named dataset and split it into train and test parts.

	Parameters
	----------
	dataset : str
		Name of the dataset. Only 'IRIS' is supported.

	Returns
	-------
	tuple
		``(X_train, X_test, Y_train, Y_test)`` from an 80/20 split with
		``random_state=0``, so the split is the same on every call.

	Raises
	------
	ValueError
		If ``dataset`` is not a supported name.
	"""
	# Fail with a clear message instead of an UnboundLocalError at the return.
	if dataset != 'IRIS':
		raise ValueError("Unknown dataset %r; expected 'IRIS'" % (dataset,))
	# shap.datasets.iris() yields the (features, labels) pair that is split below.
	X_train,X_test,Y_train,Y_test = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0)
	return X_train,X_test,Y_train,Y_test

In [4]:
def makePrediction(dataset, model, datapoint, random = True):
	"""Train ``model`` on ``dataset`` and return the predicted class of one sample.

	Parameters
	----------
	dataset : str
		Dataset name understood by ``returnDataset``.
	model : str
		Model name understood by ``returnModel``.
	datapoint : int or sequence of float
		If ``random`` is true, the integer position of a row in the test split.
		Otherwise the feature values of the sample itself, in training-column
		order.
	random : bool, default True
		Selects how ``datapoint`` is interpreted (see above).

	Returns
	-------
	int
		The predicted class label.
	"""
	X_train,X_test,Y_train,Y_test = returnDataset(dataset)
	algo = returnModel(model)
	algo.fit(X_train, Y_train)
	if random:
		# Double brackets keep a one-row DataFrame, so the feature names survive.
		sample = X_test.iloc[[datapoint]]
	else:
		# Values are taken positionally (as pd.Series(datapoint) did) and labelled
		# with the training columns when the training data has any. This avoids
		# the feature-name mismatch warning from recent scikit-learn versions.
		values = np.asarray(pd.Series(datapoint)).reshape(1, -1)
		sample = pd.DataFrame(values, columns=getattr(X_train, "columns", None))
	# predict returns a length-1 array; index it explicitly, because calling
	# int() on a non-0-d array is deprecated in NumPy.
	return int(algo.predict(sample)[0])

In [5]:
def returnSHAP(dataset, model, datapoint, random = True, background_size = None):
	"""Train ``model`` on ``dataset`` and compute Kernel SHAP values for one sample.

	Parameters
	----------
	dataset : str
		Dataset name understood by ``returnDataset``.
	model : str
		Model name understood by ``returnModel``. The fitted model must expose
		``predict_proba``.
	datapoint : int or sequence of float
		If ``random`` is true, the integer position of a row in the test split.
		Otherwise the feature values of the sample itself.
	random : bool, default True
		Selects how ``datapoint`` is interpreted (see above).
	background_size : int or None, default None
		If None, the whole training split is used as the explainer background
		(the original behaviour, which makes shap warn about slow run times).
		If an int K, the background is summarised to K weighted samples with
		``shap.kmeans``, as that warning suggests. This is faster, but the
		resulting SHAP values are an approximation relative to the full
		background.

	Returns
	-------
	The value returned by ``explainer.shap_values`` for the sample. Its exact
	container type depends on the installed shap version.
	"""
	X_train,X_test,Y_train,Y_test = returnDataset(dataset)
	algo = returnModel(model)
	algo.fit(X_train, Y_train)
	if background_size is None:
		background = X_train
	else:
		background = shap.kmeans(X_train, background_size)
	explainer = shap.KernelExplainer(algo.predict_proba, background)
	if random:
		sample = X_test.iloc[datapoint,:]
	else:
		sample = pd.Series(datapoint)
	return explainer.shap_values(sample)

In [6]:
# Explain the "NN" model's output for the test-split row at position 26.
sv_random = returnSHAP("IRIS", "NN", datapoint=26, random=True)

Using 120 background data samples could cause slower run times. Consider using shap.kmeans(data, K) to summarize the background as K weighted samples.


In [7]:
# Display the SHAP values computed for test row 26.
sv_random

In [8]:
# Explain the "NN" model's output for a hand-picked sample given by its raw
# feature values (random=False means datapoint is the sample, not a row index).
sv_specific = returnSHAP("IRIS", "NN", datapoint=[6.0, 3.3, 6.0, 1.5], random=False)

Using 120 background data samples could cause slower run times. Consider using shap.kmeans(data, K) to summarize the background as K weighted samples.


In [9]:
# Display the SHAP values computed for the hand-picked sample.
sv_specific

In [10]:
# A Series built from a bare list gets a default integer index (0..3),
# not the dataset's feature names.
pd.Series([6.0, 3.3, 6.0, 1.5])

0    6.0
1    3.3
2    6.0
3    1.5
dtype: float64

In [11]:
# Materialise the split at top level so the cells below can use X_train / X_test.
X_train,X_test,Y_train,Y_test = returnDataset("IRIS")
# The same hand-picked sample that was passed to returnSHAP with random=False,
# now labelled with the dataset's feature names.
pd.Series([6.0, 3.3, 6.0, 1.5], index=X_test.columns)

sepal length (cm)    6.0
sepal width (cm)     3.3
petal length (cm)    6.0
petal width (cm)     1.5
dtype: float64

In [23]:
# First row of the test split as a Series: indexed by feature name and named
# after the row's original label in the dataset.
X_test.iloc[0,:]

sepal length (cm)    5.8
sepal width (cm)     2.8
petal length (cm)    5.1
petal width (cm)     2.4
Name: 114, dtype: float64

In [27]:
# Hand-built Series reproducing X_test.iloc[0, :]: same values, same
# feature-name index, and the same name (114).
pd.Series([5.8, 2.8, 5.1, 2.4], index=X_test.columns,name=114)

sepal length (cm)    5.8
sepal width (cm)     2.8
petal length (cm)    5.1
petal width (cm)     2.4
Name: 114, dtype: float64

In [None]:
# FIXME(review): `n` is not defined in any visible cell, and this cell has no
# execution count. Unless `n` is defined elsewhere, it raises NameError on
# Restart & Run All. Define `n` or delete the cell.
pd.Series(n)

In [15]:
# Fit the "NN" model once at top level (fit returns the estimator itself) and
# wrap its predict_proba in a KernelExplainer backed by the full training split,
# so that later cells can call explainer.shap_values directly.
algo = returnModel("NN").fit(X_train, Y_train)
explainer = shap.KernelExplainer(algo.predict_proba, X_train)

Using 120 background data samples could cause slower run times. Consider using shap.kmeans(data, K) to summarize the background as K weighted samples.


In [32]:
# SHAP values for a hand-built Series carrying the same values and feature-name
# index as X_test.iloc[0, :] (the Series name is omitted here).
explainer.shap_values(pd.Series([5.8, 2.8, 5.1, 2.4], index=X_test.columns))

In [31]:
# SHAP values for the first test row taken directly from X_test, for comparison
# with the hand-built Series version of the same row.
explainer.shap_values(X_test.iloc[0,:])