# Explaining XGBoost decisions on Boston Housing with SHAP TreeExplainer

* TreeExplainer Reference: https://arxiv.org/abs/1905.04610



In [None]:
import xgboost
import shap
import seaborn as sns

# load the JavaScript visualization code into the notebook so that
# SHAP's interactive plots can be rendered
shap.initjs()

In [None]:
# load the Boston Housing features (X) and target values (y) bundled with SHAP
# (no model is trained in this cell)
# NOTE(review): newer SHAP releases appear to have dropped this dataset —
# confirm the installed version still provides it
X,y = shap.datasets.boston(display=True)
# dimensions of the feature table
X.shape

In [None]:
# preview the first rows of the feature table
X.head()

CRIM - per capita crime rate by town

ZN - proportion of residential land zoned for lots over 25,000 sq.ft.

INDUS - proportion of non-retail business acres per town.

CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)

NOX - nitric oxides concentration (parts per 10 million)

RM - average number of rooms per dwelling

AGE - proportion of owner-occupied units built prior to 1940

DIS - weighted distances to five employment centres

RAD - index of accessibility to radial highways

TAX - full-value property-tax rate per $10,000

PTRATIO - pupil-teacher ratio by town

B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town

LSTAT - % lower status of the population

In [None]:
# first five target values
y[:5]

In [None]:
# distribution of the target values in y (density histogram with a KDE overlay);
# histplot is the documented replacement for the deprecated sns.distplot
sns.histplot(y, kde=True, stat="density", kde_kws={"cut": 3})

MEDV - Median value of owner-occupied homes in $1000s (the target `y` plotted above)

In [None]:
%%time
model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 100)

## Explainer for XGBoost Model

In [None]:
# build a SHAP explainer for the trained tree model
# (same syntax works for LightGBM, CatBoost, scikit-learn and spark models)
explainer = shap.TreeExplainer(model)

In [None]:
# the expected value of the model output: the baseline that each sample's
# SHAP values are measured against
explainer.expected_value

In [None]:
# estimate the SHAP values for every sample in X
shap_values = explainer.shap_values(X)
# each row sums to the difference between the model output for that sample
# and the expected value of the model output
# (indexed below as [sample, feature])
shap_values.shape

In [None]:
 X.head(1)

In [None]:
# target value of the first sample (row 0)
y[0]

In [None]:
# model predictions for the first five samples
model.predict(xgboost.DMatrix(X.head(5)))

In [None]:
# visualize the explanation of a single prediction: the sample at row index 5,
# i.e. the sixth row (use matplotlib=True to avoid Javascript)
# NOTE(review): the preceding cells inspect row 0, not row 5 — confirm which
# sample is meant to be explained here
shap.force_plot(explainer.expected_value, shap_values[5,:], X.iloc[5,:])

In [None]:
# distribution of the LSTAT feature (density histogram with a KDE overlay);
# histplot is the documented replacement for the deprecated sns.distplot
sns.histplot(X["LSTAT"], kde=True, stat="density", kde_kws={"cut": 3})

In [None]:
# visualize the explanations for all training set predictions at once
shap.force_plot(explainer.expected_value, shap_values, X)

In [None]:
# create a SHAP dependence plot to show the effect of a single feature
# (here "RM") across the whole dataset
shap.dependence_plot("RM", shap_values, X)

In [None]:
# summarize the effects of all the features across every sample in X
shap.summary_plot(shap_values, X)

In [None]:
# the same feature summary, drawn as a bar chart
shap.summary_plot(shap_values, X, plot_type="bar")