In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from shapkit_nbdev.shapley_values import ShapleyValues
from shapkit_nbdev.inspector import inspector
from shapkit_nbdev.monte_carlo_shapley import MonteCarloShapley
from shapkit_nbdev.sgd_shapley import SGDshapley

%load_ext autoreload
%autoreload 2

# Load dataset

In [2]:
# Restrict the corpus to two newsgroups, which makes the task a two-class problem.
categories = ['rec.autos', 'sci.med']

# Uncomment the following to do the analysis on all the categories
#categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

# Fetch the training split with headers, footers and quoted replies removed.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Report the size of what was loaded.
n_documents = len(newsgroups_train.filenames)
n_categories = len(newsgroups_train.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)
print()

Loading 20 newsgroups dataset for categories:
['rec.autos', 'sci.med']
1188 documents
2 categories



In [3]:
# Learn the vocabulary on the training documents and count word occurrences.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(newsgroups_train.data)

# vocabulary_ maps each word to its column index; ordering the words by that
# index yields the column labels in matrix order.
word_columns = sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)

# Densify the sparse count matrix so every word becomes a named DataFrame column.
X_train = pd.DataFrame(train_counts.toarray(), columns=word_columns)
y_train = newsgroups_train.target
X_train.head(3)

Unnamed: 0,00,000,0000,00014,000mi,000miles,0010,0033,004021809,00500,...,zimmerman,zinc,zip,zoloft,zonal,zooid,zubkoff,zx,zz,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Fetch the test split with the same categories and the same text stripping
# as the training split.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Only transform (no fit): the test documents are encoded with the vocabulary
# learned on the training set, so the columns line up with word_columns.
test_counts = count_vect.transform(newsgroups_test.data)
X_test = pd.DataFrame(test_counts.toarray(), columns=word_columns)
y_test = newsgroups_test.target


# Train an ML model

In [5]:
# Word counts -> TF-IDF weights -> multinomial Naive Bayes (alpha is the
# classifier's smoothing parameter).
pipeline = Pipeline(steps=[
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.01)),
])
pipeline.fit(X_train, y_train)

# Evaluate on the held-out test split with the macro-averaged F1 score.
pred = pipeline.predict(X_test)
metrics.f1_score(y_true=y_test, y_pred=pred, average='macro')

0.9418658088235294

# Define the game

In [7]:
# d: number of feature columns (one per vocabulary word).
d = len(X_train.columns)
# n: 2**d - 2, i.e. the number of subsets of d features minus two
# (presumably the empty and the full coalition -- confirm against shapkit).
# NOTE(review): n is not used in the visible cells.
n = (1 << d) - 2

In [8]:
def fc(x):
    """Return the class index the fitted pipeline predicts for one sample.

    Parameters
    ----------
    x : numpy.ndarray
        Feature vector of a single document; it is reshaped to one row
        before being passed to ``pipeline.predict``.

    Returns
    -------
    int
        The predicted class index.
    """
    # predict() is called on a single row, so take its only element
    # explicitly: calling int() directly on an array with ndim > 0 is
    # deprecated since NumPy 1.25.
    return int(pipeline.predict(x.reshape(1, -1))[0])

In [9]:
# Draw two distinct test documents until the model assigns them different
# classes: r is the reference, x is the sample to explain.
# NOTE(review): no random seed is set in the visible cells, so a re-run
# selects a different pair of documents.
while True:
    idx_r, idx_x = np.random.choice(np.arange(len(X_test)), size=2, replace=False)
    r = X_test.iloc[idx_r]
    x = X_test.iloc[idx_x]
    r_class = fc(r.values)
    x_class = fc(x.values)
    if r_class != x_class:
        break


def fc_class(x):
    """Return 1 if the model predicts the same class as it did for the selected x, else 0."""
    return int(int(fc(x)) == int(x_class))

In [10]:
# Show the reference document r: its word-count vector, the predicted and
# the true newsgroup, and finally the raw text as the cell output.
print(r)
print()
r_predicted_name = newsgroups_train.target_names[fc(r.values)]
print("Class Prediction for r: {0}".format(r_predicted_name))
r_true_name = newsgroups_train.target_names[y_test[idx_r]]
print("Real class for r: {0}".format(r_true_name))
print()
print("RAW TEXT")
newsgroups_test.data[idx_r]

00         0
000        0
0000       0
00014      0
000mi      0
          ..
zooid      0
zubkoff    0
zx         0
zz         0
zzz        0
Name: 286, Length: 17863, dtype: int64

Class Prediction for r: sci.med
Real class for r: sci.med

RAW TEXT


':>Sounds to me like someone was pulling your leg.  There is only one way for\n:>pregnancy to occur: intercourse.  These days however there is also\n:>artificial insemination and implantation techniques, but we\'re speaking of\n:>"natural" acts here.  It is possible for pregnancy to occur if semen is\n:>deposited just outside of the vagina (i.e. coitus interruptus), but that\'s\n:>about at far as you can get.  Through clothes -- no way.  Better go talk\n:>to your biology teacher.\n:\n: what is the likely hood of conception if sperm is deposited just outside\n:the vagina?  ie.  __% chance.\n: -------------------------------------------------------------------------\n\nHmmm.... I really don\'t know.  Probably quite low overall.  Why don\'t we\nget a couple hundred willing couples together and find out ;->\n'

In [11]:
# Show the document x to explain: its word-count vector, the predicted and
# the true newsgroup, and finally the raw text as the cell output.
print(x)
print()
x_predicted_name = newsgroups_train.target_names[fc(x.values)]
print("Class Prediction for x: {0}".format(x_predicted_name))
x_true_name = newsgroups_train.target_names[y_test[idx_x]]
print("Real class for x: {0}".format(x_true_name))
print("RAW TEXT")
newsgroups_test.data[idx_x]

00         0
000        0
0000       0
00014      0
000mi      0
          ..
zooid      0
zubkoff    0
zx         0
zz         0
zzz        0
Name: 680, Length: 17863, dtype: int64

Class Prediction for x: rec.autos
Real class for x: rec.autos
RAW TEXT


'Is it ok to take the car out of gear without using the clutch\n(while the car is turned off)?\n\nThanks in advance.\n\nPlease reply by mail.'

# Approximation methods

## Monte Carlo 

In [12]:
# Monte Carlo approximation of the Shapley values of x relative to the
# reference r, for the binary reward fc_class, using 1000 iterations.
# In the recorded run this took about 1m42s and returned a float Series
# with one entry per word (95 entries).
mc_shap = MonteCarloShapley(
    x=x,
    r=r,
    fc=fc_class,
    n_iter=1000,
)
mc_shap

  0%|          | 1/1000 [00:00<02:00,  8.28it/s]

new dimension 95


100%|██████████| 1000/1000 [01:42<00:00,  7.69it/s]


__        -0.079
about     -0.001
acts      -0.045
advance    0.011
also      -0.002
           ...  
why        0.006
willing   -0.039
without   -0.011
you       -0.004
your      -0.001
Length: 95, dtype: float64

In [13]:
# Keep only the words with a non-zero Shapley value, largest contribution first.
mc_shap.loc[mc_shap.ne(0)].sort_values(ascending=False)

vagina        0.277
pregnancy     0.253
car           0.164
clutch        0.140
artificial    0.119
              ...  
acts         -0.045
clothes      -0.069
__           -0.079
hood         -0.095
pulling      -0.126
Length: 87, dtype: float64

In [25]:
# Label every Shapley value with the word and its count in x ("word = count").
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
x_attributes = [
    "{0} = {1}".format(word, count) for word, count in x[mc_shap.index].items()
]

# Same values as mc_shap, re-indexed with the descriptive labels; show the
# non-zero contributions from largest to smallest.
mc_shap_attr = pd.Series(mc_shap.values, index=x_attributes)
mc_shap_attr[mc_shap_attr != 0].sort_values(ascending=False)

vagina = 0        0.277
pregnancy = 0     0.253
car = 2           0.164
clutch = 1        0.140
artificial = 0    0.119
                  ...  
acts = 0         -0.045
clothes = 0      -0.069
__ = 0           -0.079
hood = 0         -0.095
pulling = 0      -0.126
Length: 87, dtype: float64