In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from shapkit.shapley_values import ShapleyValues
from shapkit.inspector import inspector
from shapkit.monte_carlo_shapley import MonteCarloShapley
from shapkit.sgd_shapley import SGDshapley

%load_ext autoreload
%autoreload 2

# Load dataset

In [2]:
# Restrict the study to two newsgroups so the task is a binary classification.
categories = ['rec.autos', 'sci.med']

# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:", categories, sep="\n")

# Headers, footers and quoted replies are removed from each post at load time.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
print("%d documents" % len(newsgroups_train.filenames))
print("%d categories" % len(newsgroups_train.target_names))
print()

Loading 20 newsgroups dataset for categories:
['rec.autos', 'sci.med']
1188 documents
2 categories



In [3]:
# Fit the bag-of-words vocabulary on the training posts and keep the raw term counts.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(newsgroups_train.data)

# vocabulary_ maps each word to its column index; ordering the words by that
# index yields the column labels of the count matrix.
word_columns = sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)

# Dense DataFrame of counts with one named column per vocabulary word.
X_train = pd.DataFrame(train_counts.toarray(), columns=word_columns)
y_train = newsgroups_train.target
X_train.head(3)

Unnamed: 0,00,000,0000,00014,000mi,000miles,0010,0033,004021809,00500,...,zimmerman,zinc,zip,zoloft,zonal,zooid,zubkoff,zx,zz,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Load the test split with the same categories and the same stripping of
# headers, footers and quotes as the training split.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
# Encode with the vocabulary already fitted on the training posts (transform only).
test_counts = count_vect.transform(newsgroups_test.data)
X_test = pd.DataFrame(test_counts.toarray(), columns=word_columns)
y_test = newsgroups_test.target


# Train an ML model

In [5]:
# TF-IDF reweighting of the raw counts followed by a multinomial naive Bayes
# classifier; fit() returns the fitted pipeline itself, so it can be chained.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB(alpha=.01)),
    ]
).fit(X_train, y_train)

# Macro-averaged F1 of the predictions on the test set (displayed as cell output).
pred = pipeline.predict(X_test)
metrics.f1_score(y_true=y_test, y_pred=pred, average='macro')

0.9418658088235294

# Define the game

In [6]:
# d: number of feature columns (one per vocabulary word).
d = len(X_train.columns)
# n: 2^d - 2 -- presumably the number of coalitions other than the empty and
# the full one; TODO confirm against how n is consumed later.
n = (1 << d) - 2

In [7]:
def fc(x):
    """Predict the class label of a single document with the fitted pipeline.

    Parameters
    ----------
    x : numpy.ndarray
        One feature vector; it is reshaped to a single-row 2-D array before
        being passed to ``pipeline.predict``.

    Returns
    -------
    int
        The label predicted for that single row.
    """
    # predict() returns a length-1 array here: take its element explicitly,
    # because calling int() on a 1-D array is deprecated since NumPy 1.25.
    return int(pipeline.predict(x.reshape(1, -1))[0])

In [8]:
# Draw random pairs of distinct test documents until the model assigns them
# different classes: r is the reference, x the document to explain.
# NOTE(review): no random seed is set in the visible cells, so the selected
# pair (and everything derived from it) changes from run to run.
while True:
    idx_r, idx_x = np.random.choice(np.arange(len(X_test)), size=2, replace=False)
    r = X_test.iloc[idx_r, :]
    x = X_test.iloc[idx_x, :]
    r_class = fc(r.values)
    x_class = fc(x.values)
    if x_class != r_class:
        break


def fc_class(x):
    """Return 1 when the model predicts the same class as for the explained document, else 0."""
    # x_class is read from the notebook namespace at call time.
    if int(fc(x)) == int(x_class):
        return 1
    return 0

In [9]:
# Show the reference document r: its count vector, the model's prediction,
# the ground-truth label and, as the cell output, the raw post.
print(r, end="\n\n")
print("Class Prediction for r: {0}".format(newsgroups_train.target_names[fc(r.values)]))
print("Real class for r: {0}".format(newsgroups_train.target_names[y_test[idx_r]]), end="\n\n")
print("RAW TEXT")
newsgroups_test.data[idx_r]

00         0
000        0
0000       0
00014      0
000mi      0
          ..
zooid      0
zubkoff    0
zx         0
zz         0
zzz        0
Name: 79, Length: 17863, dtype: int64

Class Prediction for r: rec.autos
Real class for r: rec.autos

RAW TEXT


"\nI am the original owner of the seats and the original poster. \nI take VERY serious offence in your statement. \nI see a lot of computers advertized on the net, and my friend just had been\nreleived of his machine = all the net-computer ads are for stolen computers?\nWhere did you learn logic?\n\nAs for the seats, they were replaced by a much harder (literally) Celica GTS\nseats due to my back problem. That is why I had to reuse the MR2 brackets\nand that's why the MR2 seats I sell are attached to Celica brackets."

In [10]:
# Show the explained document x: its count vector, the model's prediction,
# the ground-truth label and, as the cell output, the raw post.
print(x, end="\n\n")
print("Class Prediction for x: {0}".format(newsgroups_train.target_names[fc(x.values)]))
print("Real class for x: {0}".format(newsgroups_train.target_names[y_test[idx_x]]))
print("RAW TEXT")
newsgroups_test.data[idx_x]

00         0
000        0
0000       0
00014      0
000mi      0
          ..
zooid      0
zubkoff    0
zx         0
zz         0
zzz        0
Name: 659, Length: 17863, dtype: int64

Class Prediction for x: sci.med
Real class for x: sci.med
RAW TEXT


'hi all, Ive applied for the class of 93 at quite a number of schools (20)\nand have gotten 13 rejects, 4 interviews and 3 no responses.\nAny one know when the heck these people send out their acceptance letters?\nAccording to the med school admissions book theyre supposed to send out\nthe number of their class in acceptances by mid March. Whats going on... I\nam losing my sanity checking my mailbox every day.\n\nAlso does anyone have some useful alternatives in case i dont get in, i\nkind of looked into Chiropractic and Podiatry but they really dont\ninterest me. Thanks.\n'

# Approximation methods

## Monte Carlo 

In [11]:
# Monte Carlo approximation of the attributions of x relative to the
# reference r for the reward fc_class, using 100 iterations.
mc_shap = MonteCarloShapley(
    x=x,
    fc=fc_class,
    ref=r,
    n_iter=100,
)
mc_shap

  0%|          | 0/100 [00:00<?, ?it/s]

new dimension 111


100%|██████████| 100/100 [00:56<00:00,  1.77it/s]


00         0.0
000        0.0
0000       0.0
00014      0.0
000mi      0.0
          ... 
zooid      0.0
zubkoff    0.0
zx         0.0
zz         0.0
zzz        0.0
Length: 17863, dtype: float64

In [12]:
# Keep only the words with a non-zero attribution, largest contribution first.
mc_shap.loc[mc_shap.ne(0)].sort_values(ascending=False)

seats           0.33
mr2             0.27
celica          0.13
podiatry        0.09
sell            0.09
stolen          0.08
med             0.07
losing          0.07
replaced        0.06
chiropractic    0.06
march           0.06
acceptance      0.05
statement       0.05
owner           0.04
schools         0.04
gotten          0.03
admissions      0.02
applied         0.02
interest        0.01
back            0.01
letters         0.01
13              0.01
alternatives    0.01
lot             0.01
case            0.01
rejects         0.01
school          0.01
book            0.01
any            -0.01
also           -0.01
checking       -0.01
literally      -0.01
due            -0.01
going          -0.01
net            -0.01
out            -0.01
send           -0.01
class          -0.02
poster         -0.02
93             -0.02
attached       -0.03
reuse          -0.04
logic          -0.07
mid            -0.07
whats          -0.07
computers      -0.10
dont           -0.13
dtype: float64

In [13]:
# Label every feature as "<word> = <count of that word in x>" so each
# attribution can be read next to the value the word takes in the explained
# document.
# Series.items() is used instead of Series.iteritems(): the latter was
# deprecated in pandas 1.5 and removed in pandas 2.0, while items() is
# available in both old and new versions.
x_attributes = [
    index + " = " + str(val)
    for index, val in x[mc_shap.index].items()
]

# Same attribution values, re-indexed by the descriptive labels; the cell
# output lists the non-zero ones in decreasing order.
mc_shap_attr = pd.Series(mc_shap.values, index=x_attributes)
mc_shap_attr[mc_shap_attr != 0].sort_values(ascending=False)

seats = 0           0.33
mr2 = 0             0.27
celica = 0          0.13
podiatry = 1        0.09
sell = 0            0.09
stolen = 0          0.08
med = 1             0.07
losing = 1          0.07
replaced = 0        0.06
chiropractic = 1    0.06
march = 1           0.06
acceptance = 1      0.05
statement = 0       0.05
owner = 0           0.04
schools = 1         0.04
gotten = 1          0.03
admissions = 1      0.02
applied = 1         0.02
interest = 1        0.01
back = 0            0.01
letters = 1         0.01
13 = 1              0.01
alternatives = 1    0.01
lot = 0             0.01
case = 1            0.01
rejects = 1         0.01
school = 1          0.01
book = 1            0.01
any = 1            -0.01
also = 1           -0.01
checking = 1       -0.01
literally = 0      -0.01
due = 0            -0.01
going = 1          -0.01
net = 0            -0.01
out = 2            -0.01
send = 2           -0.01
class = 2          -0.02
poster = 0         -0.02
93 = 1             -0.02
