In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
fetch_subset = lambda subset: fetch_20newsgroups(
    subset=subset, categories=categories,
    shuffle=True, random_state=42,
    remove=('headers', 'footers', 'quotes'))
train = fetch_subset('train')
test = fetch_subset('test')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 4))
clf = SGDClassifier(n_jobs=-1)
pipeline = Pipeline([('vec', vec), ('clf', clf)])
pipeline.fit(train['data'], train['target'])

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 4), norm='l2', preprocessor=None, smooth_idf=True,
...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [3]:
from eli5.sklearn import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles

print(format_as_text(explain_weights(clf, vec, target_names=train['target_names'])))

Explained as: linear model

Features with largest coefficients per class.
Caveats:
1. Be careful with features which are not
   independent - weights don't show their importance.
2. If scale of input features is different then scale of coefficients
   will also be different, making direct comparison between coefficient values
   incorrect.
3. Depending on regularization, rare features sometimes may have high
   coefficients; this doesn't mean they contribute much to the
   classification result for most examples.

y='alt.atheism' top features
--------------
  +2.617  heis
  +2.202  eist
  +2.133  eis 
  +1.865  nat 
  +1.843  ░ath
  +1.765  thei
  +1.747  ░pos
  +1.678  hei 
  +1.658  athe
  +1.612  ish░
  +1.583  sla 
  +1.523  rna 
  +1.488  ░is 
  +1.486  post
  +1.446  pos 
  +1.418  slam
       …  (20165 more positive features)
       …  (31026 more negative features)
  -1.703  ░*░ 
  -1.675  pac 
  -1.558  ░us 
  -1.536  ░his

y='comp.graphics' top features
--------------
  +2.12

In [4]:
from IPython.core.display import display, HTML
show_html = lambda html: display(HTML(html))
show_html_expl = lambda expl, **kwargs: show_html(format_as_html(expl, include_styles=False, **kwargs))
show_html(format_html_styles())

In [5]:
show_html_expl(explain_weights(clf, vec, target_names=train['target_names'], top=100))

Weight,Feature
+2.617,heis
+2.202,eist
+2.133,eis
+1.865,nat
+1.843,ath
+1.765,thei
+1.747,pos
+1.678,hei
+1.658,athe
+1.612,ish

Weight,Feature
+2.129,3d
+2.008,mag
+1.968,file
+1.930,ima
+1.918,mage
+1.836,phi
+1.834,gra
+1.800,imag
+1.779,fil
+1.718,raph

Weight,Feature
+3.394,spac
+3.358,pace
+2.917,spa
+2.719,spa
+2.637,pac
+2.005,nas
+1.945,ace
+1.895,sp
+1.853,nas
+1.808,orb

Weight,Feature
+2.494,*
+1.785,us
+1.764,rist
+1.621,ian
+1.603,he
+1.478,sa
+1.469,ian
+1.435,and
+1.391,and
+1.390,mor


In [6]:
show_html_expl(explain_prediction(clf, test['data'][7], vec, target_names=train['target_names'], top=50), force_weights=True)

Weight,Feature
+0.208,:
+0.056,be
+0.042,tin
+0.034,wh
+0.033,is
+0.033,ill
+0.030,up
+0.030,of
+0.026,st
+0.026,of

Weight,Feature
… 478 more positive …,… 478 more positive …
… 577 more negative …,… 577 more negative …
-0.384,Highlighted in text (sum)
-0.948,<BIAS>

Weight,Feature
+0.080,:
+0.041,fi
+0.034,li
+0.032,it
+0.032,co
+0.031,ile
+0.030,ha
+0.029,is
… 476 more positive …,… 476 more positive …
… 563 more negative …,… 563 more negative …

Weight,Feature
… 476 more positive …,… 476 more positive …
… 563 more negative …,… 563 more negative …
-1.022,<BIAS>
-1.579,Highlighted in text (sum)

Weight,Feature
+0.112,pac
+0.105,spac
+0.103,pace
+0.102,igh
+0.099,astr
+0.085,spa
+0.085,spa
+0.073,ht
+0.072,ght
+0.071,ight

Weight,Feature
+1.881,Highlighted in text (sum)
… 528 more positive …,… 528 more positive …
… 532 more negative …,… 532 more negative …
-1.018,<BIAS>

Weight,Feature
+0.104,th
+0.045,sa
+0.043,is
+0.043,br
+0.040,eld
+0.038,he
+0.037,of
+0.037,fra
+0.034,der
+0.033,serv

Weight,Feature
+0.141,Highlighted in text (sum)
… 473 more positive …,… 473 more positive …
… 581 more negative …,… 581 more negative …
-0.995,<BIAS>


In [7]:
show_html_expl(explain_prediction(clf, test['data'][1], vec, target_names=train['target_names']))

Weight,Feature
+0.120,mad
+0.101,atic
+0.085,mad
+0.073,vat
+0.062,ican
+0.054,is
+0.050,ble.
+0.050,an
… 55 more positive …,… 55 more positive …
… 101 more negative …,… 101 more negative …

Weight,Feature
… 55 more positive …,… 55 more positive …
… 101 more negative …,… 101 more negative …
-0.106,Highlighted in text (sum)
-0.948,<BIAS>

Weight,Feature
+0.112,ftp
+0.109,ft
+0.097,lib
+0.096,ftp
+0.092,tp
+0.090,ftp
+0.089,lib
+0.088,hel
+0.079,site
+0.076,help

Weight,Feature
+1.405,Highlighted in text (sum)
… 95 more positive …,… 95 more positive …
… 61 more negative …,… 61 more negative …
-1.022,<BIAS>

Weight,Feature
+0.067,ndin
+0.059,ndi
+0.054,ry
+0.049,lle
… 87 more positive …,… 87 more positive …
… 69 more negative …,… 69 more negative …
-0.041,he
-0.042,fin
-0.043,find
-0.045,ican

Weight,Feature
… 87 more positive …,… 87 more positive …
… 69 more negative …,… 69 more negative …
-0.567,Highlighted in text (sum)
-1.018,<BIAS>

Weight,Feature
+0.070,vati
+0.069,indi
+0.064,th
+0.060,is
+0.048,ndi
+0.048,us.
+0.047,ecen
… 68 more positive …,… 68 more positive …
… 88 more negative …,… 88 more negative …
-0.046,olle

Weight,Feature
… 68 more positive …,… 68 more positive …
… 88 more negative …,… 88 more negative …
-0.283,Highlighted in text (sum)
-0.995,<BIAS>


In [8]:
import numpy as np
for doc in test['data'][:10]:
    expl = explain_prediction(clf, doc, vec, target_names=train['target_names'])
    # haaack - leave only the winner
    max_class_idx = np.argmax([cl['score'] for cl in expl['classes']])
    expl['classes'] = [expl['classes'][max_class_idx]]
    show_html_expl(expl, force_weights=False)

Weight,Feature
+1.314,Highlighted in text (sum)
… 20 more positive …,… 20 more positive …
… 12 more negative …,… 12 more negative …
-1.018,<BIAS>


Weight,Feature
+1.405,Highlighted in text (sum)
… 95 more positive …,… 95 more positive …
… 61 more negative …,… 61 more negative …
-1.022,<BIAS>


Weight,Feature
+1.105,Highlighted in text (sum)
… 186 more positive …,… 186 more positive …
… 123 more negative …,… 123 more negative …
-1.022,<BIAS>


Weight,Feature
+0.117,Highlighted in text (sum)
… 885 more positive …,… 885 more positive …
… 756 more negative …,… 756 more negative …
-1.022,<BIAS>


Weight,Feature
+1.207,Highlighted in text (sum)
… 92 more positive …,… 92 more positive …
… 98 more negative …,… 98 more negative …
-1.022,<BIAS>


Weight,Feature
+1.153,Highlighted in text (sum)
… 141 more positive …,… 141 more positive …
… 111 more negative …,… 111 more negative …
-1.022,<BIAS>


Weight,Feature
+0.029,Highlighted in text (sum)
… 70 more positive …,… 70 more positive …
… 46 more negative …,… 46 more negative …
-1.022,<BIAS>


Weight,Feature
+0.892,Highlighted in text (sum)
… 555 more positive …,… 555 more positive …
… 535 more negative …,… 535 more negative …
-1.018,<BIAS>


Weight,Feature
+1.980,Highlighted in text (sum)
… 150 more positive …,… 150 more positive …
… 134 more negative …,… 134 more negative …
-0.948,<BIAS>


Weight,Feature
+0.397,Highlighted in text (sum)
… 402 more positive …,… 402 more positive …
… 347 more negative …,… 347 more negative …
-1.018,<BIAS>
