In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
fetch_subset = lambda subset: fetch_20newsgroups(
    subset=subset, categories=categories,
    shuffle=True, random_state=42,
    remove=('headers', 'footers', 'quotes'))
train = fetch_subset('train')
test = fetch_subset('test')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 4))
clf = SGDClassifier(n_jobs=-1)
pipeline = Pipeline([('vec', vec), ('clf', clf)])
pipeline.fit(train['data'], train['target'])

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 4), norm='l2', preprocessor=None, smooth_idf=True,
...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

In [9]:
from eli5 import explain_weights, explain_prediction
from eli5 import format_as_html, format_as_text, format_html_styles

print(format_as_text(explain_weights(clf, vec, target_names=train['target_names'])))

Explained as: linear model

Features with largest coefficients per class.
Caveats:
1. Be careful with features which are not
   independent - weights don't show their importance.
2. If scale of input features is different then scale of coefficients
   will also be different, making direct comparison between coefficient values
   incorrect.
3. Depending on regularization, rare features sometimes may have high
   coefficients; this doesn't mean they contribute much to the
   classification result for most examples.

y='alt.atheism' top features
Weight  Feature
------  -------
+2.761  heis   
+2.240  eis    
+2.136  eist   
+1.953  ░ath   
+1.915  thei   
+1.881  ░pos   
+1.872  hei    
+1.821  nat    
+1.748  sla    
+1.709  post   
+1.686  slam   
+1.656  ish░   
+1.646  rna    
+1.633  athe   
+1.596  lam    
+1.548  it░    
+1.519  ░is    
… 20221 more positive …
… 31994 more negative …
-1.519  pac    
-1.522  ░*░    
-1.539  ░us    

y='comp.graphics' top features
Weight  Feature
---

In [4]:
from IPython.core.display import display, HTML
show_html = lambda html: display(HTML(html))
show_html_expl = lambda expl, **kwargs: show_html(format_as_html(expl, include_styles=False, **kwargs))
show_html(format_html_styles())

In [5]:
show_html_expl(explain_weights(clf, vec, target_names=train['target_names'], top=100))

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+2.761,heis,,
+2.240,eis,,
+2.136,eist,,
+1.953,ath,,
+1.915,thei,,
+1.881,pos,,
+1.872,hei,,
+1.821,nat,,
+1.748,sla,,
+1.709,post,,

Weight?,Feature
+2.761,heis
+2.240,eis
+2.136,eist
+1.953,ath
+1.915,thei
+1.881,pos
+1.872,hei
+1.821,nat
+1.748,sla
+1.709,post

Weight?,Feature
+2.089,file
+1.947,3d
+1.936,phi
+1.783,gra
+1.749,raph
+1.744,fil
+1.734,mage
+1.725,ima
+1.696,mag
+1.670,hics

Weight?,Feature
+3.213,spac
+3.136,pace
+2.723,spa
+2.533,pac
+2.470,spa
+1.960,orb
+1.866,ace
+1.862,rbit
+1.839,rbi
+1.830,orbi

Weight?,Feature
+2.022,*
+1.799,he
+1.690,ian
+1.673,us
+1.564,ian
+1.503,de
+1.466,fbi
+1.466,rist
+1.393,fbi
+1.353,fbi


In [6]:
show_html_expl(explain_prediction(clf, test['data'][7], vec, target_names=train['target_names'], top=50), force_weights=True)

Contribution?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Contribution?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Contribution?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Contribution?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+0.294,:,,
+0.059,be,,
+0.048,tin,,
+0.048,the,,
+0.034,is,,
+0.033,up,,
+0.032,ill,,
+0.030,ing,,
+0.029,wh,,
+0.027,ght,,

Contribution?,Feature
+0.294,:
+0.059,be
+0.048,tin
+0.048,the
+0.034,is
+0.033,up
+0.032,ill
+0.030,ing
+0.029,wh
+0.027,ght

Contribution?,Feature
+0.294,:
+0.038,fi
+0.033,ile
+0.031,bri
+0.030,co
+0.030,ase
+0.028,it
… 462 more positive …,… 462 more positive …
… 584 more negative …,… 584 more negative …
-0.028,ill

Contribution?,Feature
+0.108,pac
+0.099,spac
+0.096,pace
+0.093,astr
+0.087,th
+0.080,spa
+0.077,spa
+0.075,orb
+0.073,igh
+0.071,orbi

Contribution?,Feature
+0.089,:
+0.059,is
+0.049,th
+0.047,fra
+0.040,sa
+0.035,his
+0.035,as
+0.032,fra
+0.030,my
+0.027,serv

Contribution?,Feature
… 466 more positive …,… 466 more positive …
… 591 more negative …,… 591 more negative …
-0.243,Highlighted in text (sum)
-0.961,<BIAS>

Contribution?,Feature
… 462 more positive …,… 462 more positive …
… 584 more negative …,… 584 more negative …
-0.964,<BIAS>
-1.373,Highlighted in text (sum)

Contribution?,Feature
+1.968,Highlighted in text (sum)
… 543 more positive …,… 543 more positive …
… 511 more negative …,… 511 more negative …
-0.998,<BIAS>

Contribution?,Feature
… 509 more positive …,… 509 more positive …
… 545 more negative …,… 545 more negative …
-0.258,Highlighted in text (sum)
-0.977,<BIAS>


In [7]:
show_html_expl(explain_prediction(clf, test['data'][1], vec, target_names=train['target_names']))

Contribution?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Contribution?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Contribution?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Contribution?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+0.109,mad,,
+0.094,vat,,
+0.073,atic,,
+0.070,mad,,
+0.066,ican,,
+0.056,is,,
+0.052,ble.,,
+0.047,ade,,
+0.043,le.,,
+0.041,an,,

Contribution?,Feature
0.109,mad
0.094,vat
0.073,atic
0.07,mad
0.066,ican
0.056,is
0.052,ble.
0.047,ade
0.043,le.
0.041,an

Contribution?,Feature
0.117,ftp
0.114,ft
0.096,ftp
0.086,help
0.084,elp
0.082,tp
0.082,hel
0.08,ftp
0.079,lib
0.078,lib

Contribution?,Feature
0.083,ndin
0.067,ndi
0.056,ry
0.054,th
0.051,lle
0.048,tou
0.044,oll
0.043,col
0.04,a
0.038,coll

Contribution?,Feature
0.083,is
0.06,me
0.057,us.
0.053,indi
0.051,me
0.048,he
0.045,ecen
0.044,ite
0.044,tou
0.043,his

Contribution?,Feature
-0.961,<BIAS>
-1.542,Highlighted in text (sum)

Contribution?,Feature
2.698,Highlighted in text (sum)
-0.964,<BIAS>

Contribution?,Feature
-0.002,Highlighted in text (sum)
-0.998,<BIAS>

Contribution?,Feature
-0.977,<BIAS>
-1.135,Highlighted in text (sum)


In [8]:
import numpy as np
for doc in test['data'][:10]:
    expl = explain_prediction(clf, doc, vec, target_names=train['target_names'], top_targets=1)
    show_html_expl(expl, force_weights=False)

Contribution?,Feature
1.273,Highlighted in text (sum)
-0.998,<BIAS>


Contribution?,Feature
2.698,Highlighted in text (sum)
-0.964,<BIAS>


Contribution?,Feature
2.619,Highlighted in text (sum)
-0.964,<BIAS>


Contribution?,Feature
1.431,Highlighted in text (sum)
-0.964,<BIAS>


Contribution?,Feature
1.457,Highlighted in text (sum)
-0.964,<BIAS>


Contribution?,Feature
1.475,Highlighted in text (sum)
-0.964,<BIAS>


Contribution?,Feature
0.351,Highlighted in text (sum)
-0.964,<BIAS>


Contribution?,Feature
2.86,Highlighted in text (sum)
-0.998,<BIAS>


Contribution?,Feature
2.839,Highlighted in text (sum)
-0.961,<BIAS>


Contribution?,Feature
0.95,Highlighted in text (sum)
-0.998,<BIAS>
