In [16]:
from sklearn.datasets import fetch_20newsgroups
# The four newsgroups this notebook classifies between.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


def fetch_subset(subset):
    """Fetch one split of 20 Newsgroups limited to ``categories``.

    ``subset`` is forwarded unchanged to ``fetch_20newsgroups`` (this
    notebook uses 'train' and 'test'). Shuffling uses a fixed seed so the
    document order is reproducible, and headers, footers and quotes are
    requested to be removed via ``remove``.
    """
    return fetch_20newsgroups(
        subset=subset,
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=('headers', 'footers', 'quotes'),
    )


# Training split first, then the held-out split (same order as before).
train, test = fetch_subset('train'), fetch_subset('test')

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer

# Baseline classifier: TF-IDF features into a cross-validated logistic
# regression. `vec` and `clf` keep their own names because the explanation
# cells pass them to eli5 individually rather than through the pipeline.
vec, clf = TfidfVectorizer(), LogisticRegressionCV()
pipeline = Pipeline(steps=[('vec', vec), ('clf', clf)])
# Left as the last expression so the fitted pipeline's repr is displayed.
pipeline.fit(train['data'], train['target'])

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

In [18]:
from eli5.sklearn import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles

# Plain-text view of the per-class feature weights of the fitted `clf`,
# with feature names resolved through `vec`.
print(
    format_as_text(
        explain_weights(
            clf,
            vec,
            target_names=train['target_names'],
        )
    )
)

Explained as: linear model

Features with largest coefficients per class.
Caveats:
1. Be careful with features which are not
   independent - weights don't show their importance.
2. If scale of input features is different then scale of coefficients
   will also be different, making direct comparison between coefficient values
   incorrect.
3. Depending on regularization, rare features sometimes may have high
   coefficients; this doesn't mean they contribute much to the
   classification result for most examples.

y='alt.atheism' top features
----------------------
 +18.067  atheism     
 +16.546  atheists    
 +14.379  religion    
 +14.378  bobby       
 +14.321  matthew     
 +13.380  motto       
 +13.209  atheist     
 +13.021  islam       
 +12.787  nanci       
 +12.247  enviroleague
 +12.100  loans       
 +11.656  satan       
 +11.482  posting     
 +11.189  enlightening
 +11.092  natural     
       …  (6379 more positive features)
       …  (20481 more negative features)
 -

In [19]:
# `IPython.display` is the public import location; importing these names from
# `IPython.core.display` is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML


def show_html(html):
    """Render a raw HTML string as rich output in the notebook."""
    return display(HTML(html))


def show_html_expl(expl, **kwargs):
    """Format an eli5 explanation as HTML and display it.

    Per-explanation styles are switched off (``include_styles=False``)
    because this cell emits the stylesheet once, right after these
    definitions. Any extra keyword arguments (for example ``force_weights``)
    are forwarded unchanged to ``format_as_html``.
    """
    return show_html(format_as_html(expl, include_styles=False, **kwargs))


# Emit the eli5 stylesheet a single time; later explanations rely on it.
show_html(format_html_styles())

In [20]:
# HTML rendering of the per-class feature weights for the fitted `clf`.
show_html_expl(
    explain_weights(
        clf,
        vec,
        target_names=train['target_names'],
    )
)

Weight,Feature
+18.067,atheism
+16.546,atheists
+14.379,religion
+14.378,bobby
+14.321,matthew
+13.380,motto
+13.209,atheist
+13.021,islam
+12.787,nanci
+12.247,enviroleague

Weight,Feature
+26.182,graphics
+19.102,image
+17.399,computer
+16.913,3d
+16.262,file
+14.144,points
+13.288,sgi
+13.188,42
+12.454,hi
+11.824,3do

Weight,Feature
+35.829,space
+17.687,orbit
+15.236,nasa
+15.157,launch
+13.201,spacecraft
+12.886,mars
+12.326,nick
+12.148,allen
+11.852,moon
+11.763,shuttle

Weight,Feature
+19.206,christian
+16.642,blood
+14.916,fbi
+14.187,christians
+12.780,hudson
+12.761,order
+12.350,christ
+12.105,ekr
+11.964,terrorist
+11.622,koresh


In [21]:
# Explain the prediction for the third test document and request the
# weight tables explicitly via force_weights=True.
show_html_expl(
    explain_prediction(
        clf,
        test['data'][2],
        vec,
        target_names=train['target_names'],
    ),
    force_weights=True,
)

Weight,Feature
+0.886,some
+0.537,much
… 6 more positive …,… 6 more positive …
… 15 more negative …,… 15 more negative …
-0.405,he
-0.407,how
-0.432,am
-0.461,from
-0.509,features
-0.607,where

Weight,Feature
+3.154,graphics
+2.690,software
+1.710,hi
+1.187,looking
+1.138,buy
+0.905,features
+0.846,pc
+0.679,help
+0.537,any
+0.521,it

Weight,Feature
+0.864,costs
+0.651,buy
+0.610,software
+0.497,most
+0.385,the
… 14 more positive …,… 14 more positive …
… 7 more negative …,… 7 more negative …
-0.293,help
-0.311,am
-0.321,trying

Weight,Feature
+2.188,he
+0.527,my
+0.481,more
… 7 more positive …,… 7 more positive …
… 14 more negative …,… 14 more negative …
-0.395,there
-0.399,costs
-0.472,pc
-0.499,much
-0.513,looking


We can hide the weights by passing ``force_weights=False`` (they will still be shown if it is impossible to highlight the text).

In [22]:
# Same kind of explanation for the fifth test document, this time with
# force_weights=False.
show_html_expl(
    explain_prediction(
        clf,
        test['data'][4],
        vec,
        target_names=train['target_names'],
    ),
    force_weights=False,
)

Show explanations for the winning class for the first 10 documents from the test data.

In [23]:
import numpy as np
for doc in test['data'][:10]:
    expl = explain_prediction(
        clf,
        doc,
        vec,
        target_names=train['target_names'],
    )
    # Workaround: the explanation is edited in place so that only the class
    # with the highest 'proba' entry remains before it is rendered.
    max_class_idx = np.argmax(
        [class_expl['proba'] for class_expl in expl['classes']]
    )
    expl['classes'] = [expl['classes'][max_class_idx]]
    show_html_expl(expl, force_weights=False)

Now use a vectorizer that skips stopwords

In [24]:
# Same model as the baseline pipeline, except that the vectorizer is told to
# use the built-in English stop-word list.
vec_stop, clf_stop = TfidfVectorizer(stop_words='english'), LogisticRegressionCV()
pipeline_stop = Pipeline(steps=[('vec', vec_stop), ('clf', clf_stop)])
# Left as the last expression so the fitted pipeline's repr is displayed.
pipeline_stop.fit(train['data'], train['target'])

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

Words such as "the", "in", and "of" are not used as features and are therefore not highlighted.

In [25]:
# Explain the fifth test document again, now with the stop-word-filtered
# vectorizer and the classifier trained on its features.
show_html_expl(
    explain_prediction(
        clf_stop,
        test['data'][4],
        vec_stop,
        target_names=train['target_names'],
    ),
    force_weights=False,
)