In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from shapkit_nbdev.shapley_values import ShapleyValues
from shapkit_nbdev.inspector import inspector
from shapkit_nbdev.monte_carlo_shapley import MonteCarloShapley
from shapkit_nbdev.sgd_shapley import SGDshapley

%load_ext autoreload
%autoreload 2

# Load dataset

In [2]:
categories = [
    'rec.autos',
    'sci.med',
]

# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
print("%d documents" % len(newsgroups_train.filenames))
print("%d categories" % len(newsgroups_train.target_names))
print()

Loading 20 newsgroups dataset for categories:
['rec.autos', 'sci.med']
1188 documents
2 categories



In [3]:
count_vect = CountVectorizer()
X_train = np.array(count_vect.fit_transform(newsgroups_train.data).todense())
word_columns = ['']*len(count_vect.vocabulary_)
for word, idx in count_vect.vocabulary_.items():
    word_columns[idx] = word
X_train = pd.DataFrame(X_train, columns=word_columns)
y_train = newsgroups_train.target
X_train.head(3)

Unnamed: 0,00,000,0000,00014,000mi,000miles,0010,0033,004021809,00500,...,zimmerman,zinc,zip,zoloft,zonal,zooid,zubkoff,zx,zz,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))
X_test = np.array(count_vect.transform(newsgroups_test.data).todense())
X_test = pd.DataFrame(X_test, columns=word_columns)
y_test = newsgroups_test.target


# Train a ML model

In [5]:
pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=.01)),
])
pipeline.fit(X_train, y_train)

pred = pipeline.predict(X_test)
metrics.f1_score(y_test, pred, average='macro')

0.9418658088235294

# Define the game

In [6]:
d = X_train.shape[1]
n = 2**d - 2

In [7]:
fc = lambda x: int(pipeline.predict(x.reshape(1,-1)))

In [8]:
r_class, x_class = 0, 0
while x_class == r_class:
    idx_r, idx_x = np.random.choice(np.arange(len(X_test)), size=2, replace=False)
    r = X_test.iloc[idx_r,:]
    x = X_test.iloc[idx_x,:]
    r_class = fc(r.values)
    x_class = fc(x.values)
fc_class = lambda x: 1 if int(fc(x)) == int(x_class) else 0

In [9]:
print(r)
print()
print("Class Prediction for r: {0}".format(newsgroups_train.target_names[fc(r.values)]))
print("Real class for r: {0}".format(newsgroups_train.target_names[y_test[idx_r]]))
print()
print("RAW TEXT")
newsgroups_test.data[idx_r]

00         0
000        0
0000       0
00014      0
000mi      0
          ..
zooid      0
zubkoff    0
zx         0
zz         0
zzz        0
Name: 549, Length: 17863, dtype: int64

Class Prediction for r: sci.med
Real class for r: sci.med

RAW TEXT


'BIOLOGICAL ALCHEMY\n                          \n                        ( ANOTHER Form of COLD FUSION )\n\n               ( ALTERNATIVE Heavy Element Creation in Universe ) \n\n               A very simple experiment can demonstrate (PROVE) the \n          FACT of "BIOLOGICAL TRANSMUTATIONS" (reactions like Mg + O \n          --> Ca, Si + C --> Ca, K + H --> Ca, N2 --> CO, etc.), as \n          described in the BOOK "Biological Transmutations" by Louis \n          Kervran, [1972 Edition is BEST.], and in Chapter 17 of the \n          book "THE SECRET LIFE OF PLANTS" by Peter Tompkins and \n          Christopher Bird, 1973: \n\n               (1) Obtain a good sample of plant seeds, all of the same \n                   kind.  [Some kinds might work better that others.]\n\n               (2) Divide the sample into two groups of equal weight \n                   and number.\n\n               (3) Sprout one group in distilled water on filter paper \n                   for three or four we

In [10]:
print(x)
print()
print("Class Prediction for x: {0}".format(newsgroups_train.target_names[fc(x.values)]))
print("Real class for x: {0}".format(newsgroups_train.target_names[y_test[idx_x]]))
print("RAW TEXT")
newsgroups_test.data[idx_x]

00         0
000        0
0000       0
00014      0
000mi      0
          ..
zooid      0
zubkoff    0
zx         0
zz         0
zzz        0
Name: 261, Length: 17863, dtype: int64

Class Prediction for x: rec.autos
Real class for x: rec.autos
RAW TEXT


'\n\n\n\nA dealer will make money off you in three ways, if you let him:\n\n\t1)  New car markup over his cost (remember his hold-back),\n\t2)  Arranging financing through the dealership, and\n\t3)  Screwing you on the trade.\n\nKeep the deal with the dealer simple by eliminating 2 & 3.  Buying a car at \n"dealer\'s cost" is meaningless if he makes $1000 on the trade and/or gets a \nkickback from the bank.\n\nBlue book (you need to know if you\'re talking average wholesale or average \nretail) is a good guide to value for a car.  If you are selling it yourself, \ntry to get average retail, and chances are you\'ll have done ok.\n\nBe careful selling to acquaintances if you ever want them to become friends.'

# Approximation methods

## Monte Carlo 

In [15]:
mc_shap = MonteCarloShapley(x=x, fc=fc_class, r=r, n_iter=100)
mc_shap

  0%|          | 0/100 [00:00<?, ?it/s]

new dimension 260


100%|██████████| 100/100 [00:59<00:00,  1.74it/s]


00         0.0
000        0.0
0000       0.0
00014      0.0
000mi      0.0
          ... 
zooid      0.0
zubkoff    0.0
zx         0.0
zz         0.0
zzz        0.0
Length: 17863, dtype: float64

In [16]:
mc_shap[mc_shap != 0].sort_values(ascending=False)

biological    0.15
mineral       0.07
dealer        0.05
healing       0.04
residue       0.04
              ... 
1972         -0.03
ground       -0.03
carbon       -0.04
nitrogen     -0.05
si           -0.05
Length: 84, dtype: float64

In [17]:
x_attributes = []
for index, val in x[mc_shap.index].iteritems():
    x_attributes.append(index + " = "+str(val))

mc_shap_attr = pd.Series(mc_shap.values, index=x_attributes)
mc_shap_attr[mc_shap_attr != 0].sort_values(ascending=False)

biological = 0    0.15
mineral = 0       0.07
dealer = 3        0.05
healing = 0       0.04
residue = 0       0.04
                  ... 
1972 = 0         -0.03
ground = 0       -0.03
carbon = 0       -0.04
nitrogen = 0     -0.05
si = 0           -0.05
Length: 84, dtype: float64