In [2]:
from sklearn.datasets import fetch_20newsgroups
# Newsgroups this notebook restricts the dataset to.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


def fetch_subset(subset):
    """Fetch one split of 20 newsgroups, limited to ``categories``.

    The call is shuffled with a fixed ``random_state`` so repeated runs
    return the documents in the same order, and it asks the loader to
    remove headers, footers and quotes from each post.

    Parameters
    ----------
    subset : str
        Split name forwarded to ``fetch_20newsgroups`` (this notebook
        uses ``'train'`` and ``'test'``).

    Returns
    -------
    The object returned by ``fetch_20newsgroups``; later cells index it
    with ``'data'``, ``'target'`` and ``'target_names'``.
    """
    return fetch_20newsgroups(
        subset=subset, categories=categories,
        shuffle=True, random_state=42,
        remove=('headers', 'footers', 'quotes'))


train = fetch_subset('train')
test = fetch_subset('test')

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep the two stages bound to their own names: later cells pass `clf` and
# `vec` directly to the explanation functions.
vec, clf = TfidfVectorizer(), LogisticRegressionCV()
pipeline = Pipeline(steps=[('vec', vec), ('clf', clf)])
# Fit on the training texts and labels; as the cell's last expression the
# returned pipeline is what gets displayed.
pipeline.fit(train['data'], train['target'])

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

In [21]:
from eli5.sklearn import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles

# Build the weight explanation for the fitted classifier (the vectorizer is
# passed alongside it; class labels come from the training set), then print
# its plain-text rendering.
weights_expl = explain_weights(clf, vec, target_names=train['target_names'])
print(format_as_text(weights_expl))

Explained as: linear model

Features with largest coefficients per class.
Caveats:
1. Be careful with features which are not
   independent - weights don't show their importance.
2. If scale of input features is different then scale of coefficients
   will also be different, making direct comparison between coefficient values
   incorrect.
3. Depending on regularization, rare features sometimes may have high
   coefficients; this doesn't mean they contribute much to the
   classification result for most examples.

y='alt.atheism' top features
----------------------
     atheism  +18.067
    atheists  +16.546
    religion  +14.379
       bobby  +14.378
     matthew  +14.321
       motto  +13.380
     atheist  +13.209
       islam  +13.021
       nanci  +12.787
enviroleague  +12.247
       loans  +12.100
       satan  +11.656
     posting  +11.482
enlightening  +11.189
     natural  +11.092
         ...   (6379 more positive features)
         ...   (20481 more negative features)
       

In [18]:
# `IPython.display` is the public home of `display` and `HTML`; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML


def show_html(html):
    """Display the given HTML string as rich output in the notebook."""
    display(HTML(html))


def show_html_expl(expl):
    """Display an explanation object as HTML.

    The explanation is formatted with ``include_styles=False``, so the
    markup shown here does not carry the formatter's styles itself.
    """
    show_html(format_as_html(expl, include_styles=False))


# Emit the formatter's styles once up front; show_html_expl omits them,
# presumably relying on this call for styling.
show_html(format_html_styles())

In [22]:
# Same weight explanation as the text report, rendered as HTML tables.
class_names = train['target_names']
show_html_expl(explain_weights(clf, vec, target_names=class_names))

Feature,Weight,Unnamed: 2
atheism,+18.067,
atheists,+16.546,
religion,+14.379,
bobby,+14.378,
matthew,+14.321,
motto,+13.380,
atheist,+13.209,
islam,+13.021,
nanci,+12.787,
enviroleague,+12.247,

Feature,Weight,Unnamed: 2
graphics,+26.182,
image,+19.102,
computer,+17.399,
3d,+16.913,
file,+16.262,
points,+14.144,
sgi,+13.288,
42,+13.188,
hi,+12.454,
3do,+11.824,

Feature,Weight,Unnamed: 2
space,+35.829,
orbit,+17.687,
nasa,+15.236,
launch,+15.157,
spacecraft,+13.201,
mars,+12.886,
nick,+12.326,
allen,+12.148,
moon,+11.852,
shuttle,+11.763,

Feature,Weight,Unnamed: 2
christian,+19.206,
blood,+16.642,
fbi,+14.916,
christians,+14.187,
hudson,+12.780,
order,+12.761,
christ,+12.350,
ekr,+12.105,
terrorist,+11.964,
koresh,+11.622,


In [24]:
# Explain the classifier's prediction for one held-out document
# (index 1 of the test split) and render it as HTML.
sample_doc = test['data'][1]
prediction_expl = explain_prediction(
    clf, sample_doc, vec, target_names=train['target_names'])
show_html_expl(prediction_expl)

Feature,Weight,Unnamed: 2
vatican,+1.989,
made,+1.117,
is,+0.156,
…,1 more positive,1 more positive
…,1 more negative,
this,-0.078,
finding,-0.137,
can,-0.142,
me,-0.143,
us,-0.174,

Feature,Weight,Unnamed: 2
site,+2.295,
library,+2.066,
ftp,+1.600,
anyone,+1.326,
help,+1.115,
available,+0.822,
where,+0.569,
collection,+0.441,
finding,+0.326,
can,+0.284,

Feature,Weight,Unnamed: 2
collection,+0.715,
tour,+0.484,
the,+0.421,
us,+0.162,
finding,+0.150,
this,+0.105,
made,+0.055,
…,1 more positive,1 more positive
…,1 more negative,
where,-0.095,

Feature,Weight,Unnamed: 2
recently,+1.246,
us,+1.143,
me,+0.852,
…,2 more positive,2 more positive
help,-0.092,
tour,-0.120,
the,-0.224,
is,-0.338,
where,-0.357,
collection,-0.405,
