In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

from shapkit_nbdev.shapley_values import ShapleyValues
from shapkit_nbdev.inspector import inspector
from shapkit_nbdev.monte_carlo_shapley import MonteCarloShapley
from shapkit_nbdev.sgd_shapley import SGDshapley

%load_ext autoreload
%autoreload 2

# Load dataset

In [2]:
df = pd.read_csv("/home/sgrah/Documents/shapkit/dataset/bike/hour.csv")
df = df.drop(columns=["instant", "dteday", "yr"])
cat_features = ["season", "mnth", "hr", "holiday", "weekday", "workingday", "weathersit"]
df[cat_features] = df[cat_features].astype("str")
df.head(3)

Unnamed: 0,season,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,1,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,1,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32


In [10]:
target_names = 'cnt'
columns = [col for col in df.columns if col != target_names]
X = df[columns].copy()
y = df[target_names].copy()


# Train a ML model

In [12]:
cat_features_index = [i for i, col in enumerate(X.columns) if col in cat_features]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.33,
                                                  random_state=42)
test_data = catboost_pool = Pool(X_test, y_test,
                                 cat_features=cat_features_index)
val_data = catboost_pool = Pool(X_val, y_val,
                                cat_features=cat_features_index)
model = CatBoostRegressor(iterations=500, depth=3,
                           learning_rate=1,
                           loss_function='RMSE',
                           verbose=100)
# train the model
model.fit(X_train, y_train,
          cat_features=cat_features_index,
          eval_set=val_data)

# make the prediction using the resulting model
preds = model.predict(test_data)
rmse = np.sqrt(np.mean((preds - y_test)**2))
print("Test RMSE: {0:.2}".format(rmse))

0:	learn: 72.0079582	test: 68.8890183	best: 68.8890183 (0)	total: 6.88ms	remaining: 3.43s
100:	learn: 3.3868955	test: 4.0279568	best: 3.9496225 (95)	total: 279ms	remaining: 1.1s
200:	learn: 2.2990130	test: 3.3573326	best: 3.3573326 (200)	total: 539ms	remaining: 802ms
300:	learn: 1.8800424	test: 3.2616370	best: 3.2345658 (291)	total: 731ms	remaining: 484ms
400:	learn: 1.6002957	test: 3.1736670	best: 3.1580785 (393)	total: 956ms	remaining: 236ms
499:	learn: 1.4203956	test: 3.1616702	best: 3.1245929 (474)	total: 1.14s	remaining: 0us

bestTest = 3.124592859
bestIteration = 474

Shrink model to first 475 iterations.
Test RMSE: 3.0


# Define the game

In [13]:
d = X_train.shape[1]
n = 2**d - 2
d, n

(13, 8190)

In [26]:
idx_r, idx_x = np.random.choice(np.arange(len(X_test)), size=2, replace=False)
r = X_test.iloc[idx_r,:]
x = X_test.iloc[idx_x,:]

In [27]:
print(r)
print()
print("Prediction for r: {0:.0f}".format(model.predict(r.values)))
print("Real number of bike sharing for r: {0:.0f}".format(y_test.iloc[idx_r]))

season             1
mnth               3
hr                 8
holiday            0
weekday            6
workingday         0
weathersit         2
temp             0.3
atemp         0.2727
hum                1
windspeed     0.2985
casual            11
registered        34
Name: 1441, dtype: object

Prediction for r: 44
Real number of bike sharing for r: 45


In [28]:
print(x)
print()
print("Prediction for x: {0:.0f}".format(model.predict(x.values)))
print("Real number of bike sharing for x: {0:.0f}".format(y_test.iloc[idx_x]))

season             2
mnth               6
hr                19
holiday            0
weekday            1
workingday         1
weathersit         1
temp            0.72
atemp         0.6667
hum             0.54
windspeed      0.194
casual            64
registered       343
Name: 3669, dtype: object

Prediction for x: 409
Real number of bike sharing for x: 407


# Exact Shapley Values

In [29]:
# Parameters
fc = lambda x: model.predict(x)
# Computation
true_shap = ShapleyValues(x=x, fc=fc, r=r)

100%|██████████| 13/13 [00:56<00:00,  4.34s/it]


In [31]:
true_shap

season         -0.454902
mnth            0.011578
hr             -0.136489
holiday         0.000000
weekday         0.100299
workingday      0.000000
weathersit      0.059661
temp            0.725175
atemp           0.019816
hum            -0.695102
windspeed      -0.091746
casual         53.739848
registered    311.802916
dtype: float64

# Approximation methods

## Monte Carlo 

In [33]:
mc_shap = MonteCarloShapley(x=x, fc=fc, r=r, n_iter=100)
mc_shap

100%|██████████| 100/100 [00:01<00:00, 89.11it/s]


season         -0.382437
mnth            0.026674
hr             -0.107627
holiday         0.000000
weekday         0.088629
workingday      0.000000
weathersit      0.091351
temp            0.588402
atemp           0.011428
hum            -0.635469
windspeed      -0.096385
casual         53.645497
registered    311.850992
dtype: float64

## SGD

In [56]:
sgd_est = SGDshapley(d, C=y.max())
sgd_shap = sgd_est.sgd(x=x, fc=fc, r=r, n_iter=5000, step=.1, step_type="sqrt")
sgd_shap

100%|██████████| 5000/5000 [00:05<00:00, 843.71it/s]


season          1.139998
mnth            1.207532
hr              1.286116
holiday        -0.433410
weekday        -0.485653
workingday      0.257571
weathersit      1.584520
temp            0.581246
atemp           1.293513
hum            -0.338280
windspeed      -1.095424
casual         53.079079
registered    307.004248
dtype: float64