In [1]:
from sklearn.datasets import fetch_20newsgroups
# The four newsgroups this notebook restricts the dataset to.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


def fetch_subset(subset):
    """Load one split of the 20 newsgroups data, limited to ``categories``.

    The request strips headers, footers and quoted replies, and shuffles the
    documents with a fixed ``random_state`` so that repeated runs see them in
    the same order.

    :param subset: split name forwarded to ``fetch_20newsgroups``
        (this notebook uses ``'train'`` and ``'test'``).
    :return: the object returned by ``fetch_20newsgroups`` for that split.
    """
    return fetch_20newsgroups(
        subset=subset,
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=('headers', 'footers', 'quotes'),
    )


train = fetch_subset('train')
test = fetch_subset('test')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep the vectorizer and the classifier under their own names: both are
# passed individually to the explanation calls further down.
vec, clf = TfidfVectorizer(), LogisticRegressionCV()
pipeline = Pipeline(steps=[('vec', vec), ('clf', clf)])
# Last expression of the cell, so the fitted pipeline is echoed as output.
pipeline.fit(train['data'], train['target'])

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

In [3]:
from eli5 import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles, fields

# Build the weights explanation first, then print its plain-text rendering.
weights_explanation = explain_weights(
    clf, vec, target_names=train['target_names'])
print(format_as_text(weights_explanation))

Explained as: linear model

Features with largest coefficients per class.
Caveats:
1. Be careful with features which are not
   independent - weights don't show their importance.
2. If scale of input features is different then scale of coefficients
   will also be different, making direct comparison between coefficient values
   incorrect.
3. Depending on regularization, rare features sometimes may have high
   coefficients; this doesn't mean they contribute much to the
   classification result for most examples.

y='alt.atheism' top features
----------------------
 +18.161  atheism     
 +16.664  atheists    
 +14.504  religion    
 +14.493  bobby       
 +14.350  matthew     
 +13.392  motto       
 +13.322  atheist     
 +13.071  islam       
 +12.770  nanci       
 +12.251  enviroleague
 +12.059  loans       
 +11.700  satan       
 +11.495  posting     
 +11.199  enlightening
 +11.101  natural     
       …  (6377 more positive features)
       …  (20483 more negative features)
 -

In [4]:
# `IPython.display` is the public import location for `display` and `HTML`.
from IPython.display import display, HTML


def show_html(html):
    """Render a raw HTML string as rich output in the notebook."""
    return display(HTML(html))


def show_html_expl(expl, **kwargs):
    """Format an explanation as HTML and display it.

    ``include_styles=False`` is passed for each explanation because the
    styles are emitted separately, once, at the end of this cell. Any extra
    keyword arguments are forwarded unchanged to ``format_as_html``.
    """
    return show_html(format_as_html(expl, include_styles=False, **kwargs))


# Emit the eli5 styles once for the whole notebook.
show_html(format_html_styles())

In [5]:
# Left as a bare final expression so the notebook renders the returned
# explanation object itself rather than a printed string.
explain_weights(
    clf,
    vec,
    target_names=train['target_names'],
)

y=alt.atheism  top features,y=alt.atheism  top features,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
y=comp.graphics  top features,y=comp.graphics  top features,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
y=sci.space  top features,y=sci.space  top features,Unnamed: 2_level_4,Unnamed: 3_level_4
Weight,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5
y=talk.religion.misc  top features,y=talk.religion.misc  top features,Unnamed: 2_level_6,Unnamed: 3_level_6
Weight,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7
+18.161,atheism,,
+16.664,atheists,,
+14.504,religion,,
+14.493,bobby,,
+14.350,matthew,,
+13.392,motto,,
+13.322,atheist,,
+13.071,islam,,
+12.770,nanci,,
+12.251,enviroleague,,

y=alt.atheism  top features,y=alt.atheism  top features
Weight,Feature
+18.161,atheism
+16.664,atheists
+14.504,religion
+14.493,bobby
+14.350,matthew
+13.392,motto
+13.322,atheist
+13.071,islam
+12.770,nanci
+12.251,enviroleague

y=comp.graphics  top features,y=comp.graphics  top features
Weight,Feature
+26.146,graphics
+19.048,image
+17.396,computer
+16.901,3d
+16.243,file
+14.122,points
+13.240,sgi
+13.201,42
+12.432,hi
+11.843,3do

y=sci.space  top features,y=sci.space  top features
Weight,Feature
+35.805,space
+17.691,orbit
+15.216,nasa
+15.149,launch
+13.202,spacecraft
+12.879,mars
+12.329,nick
+12.132,allen
+11.880,moon
+11.761,shuttle

y=talk.religion.misc  top features,y=talk.religion.misc  top features
Weight,Feature
+19.192,christian
+16.648,blood
+14.927,fbi
+14.200,christians
+12.769,order
+12.760,hudson
+12.369,christ
+12.102,ekr
+11.959,terrorist
+11.653,koresh


In [6]:
# Explain the prediction for the third test document, forcing the weight
# tables on and switching the horizontal layout off.
vertical_expl = explain_prediction(
    clf, test['data'][2], vec, target_names=train['target_names'])
show_html_expl(vertical_expl, force_weights=True, horizontal_layout=False)

Weight,Feature
+0.888,some
+0.536,much
… 6 more positive …,… 6 more positive …
… 15 more negative …,… 15 more negative …
-0.399,he
-0.406,how
-0.439,am
-0.463,from
-0.512,features
-0.609,where

Weight,Feature
… 6 more positive …,… 6 more positive …
… 15 more negative …,… 15 more negative …
-1.394,<BIAS>
-13.014,Highlighted in text (sum)

Weight,Feature
+3.150,graphics
+2.690,software
+1.707,hi
+1.185,looking
+1.127,buy
+0.905,features
+0.850,pc
+0.680,help
+0.538,any
+0.523,it

Weight,Feature
+10.045,Highlighted in text (sum)
… 9 more positive …,… 9 more positive …
… 12 more negative …,… 12 more negative …
-1.015,<BIAS>

Weight,Feature
+0.864,costs
+0.649,buy
+0.604,software
+0.496,most
+0.392,the
… 14 more positive …,… 14 more positive …
… 7 more negative …,… 7 more negative …
-0.293,help
-0.311,am
-0.321,trying

Weight,Feature
… 14 more positive …,… 14 more positive …
… 7 more negative …,… 7 more negative …
-1.015,<BIAS>
-7.271,Highlighted in text (sum)

Weight,Feature
+2.188,he
+0.526,my
+0.481,more
… 7 more positive …,… 7 more positive …
… 14 more negative …,… 14 more negative …
-0.397,there
-0.398,costs
-0.471,pc
-0.500,much
-0.510,looking

Weight,Feature
… 7 more positive …,… 7 more positive …
… 14 more negative …,… 14 more negative …
-1.019,<BIAS>
-10.512,Highlighted in text (sum)


``dense_multitarget=True`` is supported for prediction explanations too, and shows highlighting for just the top prediction. (The next cell does not pass a layout option explicitly; it relies on the default layout.)

In [7]:
# Same document as in the previous example, but no layout option is passed,
# so the formatter's default layout is used.
default_layout_expl = explain_prediction(
    clf, test['data'][2], vec, target_names=train['target_names'])
show_html_expl(default_layout_expl, force_weights=True)

"y=alt.atheism  (probability 0.000, score -16.212) top features","y=alt.atheism  (probability 0.000, score -16.212) top features",Unnamed: 2_level_0,Unnamed: 3_level_0
Weight,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
"y=comp.graphics  (probability 0.999, score 8.690) top features","y=comp.graphics  (probability 0.999, score 8.690) top features",Unnamed: 2_level_2,Unnamed: 3_level_2
Weight,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
"y=sci.space  (probability 0.001, score -6.819) top features","y=sci.space  (probability 0.001, score -6.819) top features",Unnamed: 2_level_4,Unnamed: 3_level_4
Weight,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5
"y=talk.religion.misc  (probability 0.000, score -11.869) top features","y=talk.religion.misc  (probability 0.000, score -11.869) top features",Unnamed: 2_level_6,Unnamed: 3_level_6
Weight,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7
+0.888,some,,
+0.536,much,,
… 6 more positive …,… 6 more positive …,,
… 15 more negative …,… 15 more negative …,,
-0.399,he,,
-0.406,how,,
-0.439,am,,
-0.463,from,,
-0.512,features,,
-0.609,where,,

"y=alt.atheism  (probability 0.000, score -16.212) top features","y=alt.atheism  (probability 0.000, score -16.212) top features"
Weight,Feature
+0.888,some
+0.536,much
… 6 more positive …,… 6 more positive …
… 15 more negative …,… 15 more negative …
-0.399,he
-0.406,how
-0.439,am
-0.463,from
-0.512,features
-0.609,where

"y=comp.graphics  (probability 0.999, score 8.690) top features","y=comp.graphics  (probability 0.999, score 8.690) top features"
Weight,Feature
+3.150,graphics
+2.690,software
+1.707,hi
+1.185,looking
+1.127,buy
+0.905,features
+0.850,pc
+0.680,help
+0.538,any
+0.523,it

"y=sci.space  (probability 0.001, score -6.819) top features","y=sci.space  (probability 0.001, score -6.819) top features"
Weight,Feature
+0.864,costs
+0.649,buy
+0.604,software
+0.496,most
+0.392,the
… 14 more positive …,… 14 more positive …
… 7 more negative …,… 7 more negative …
-0.293,help
-0.311,am
-0.321,trying

"y=talk.religion.misc  (probability 0.000, score -11.869) top features","y=talk.religion.misc  (probability 0.000, score -11.869) top features"
Weight,Feature
+2.188,he
+0.526,my
+0.481,more
… 7 more positive …,… 7 more positive …
… 14 more negative …,… 14 more negative …
-0.397,there
-0.398,costs
-0.471,pc
-0.500,much
-0.510,looking

Weight,Feature
… 6 more positive …,… 6 more positive …
… 15 more negative …,… 15 more negative …
-1.394,<BIAS>
-13.014,Highlighted in text (sum)

Weight,Feature
+10.045,Highlighted in text (sum)
… 9 more positive …,… 9 more positive …
… 12 more negative …,… 12 more negative …
-1.015,<BIAS>

Weight,Feature
… 14 more positive …,… 14 more positive …
… 7 more negative …,… 7 more negative …
-1.015,<BIAS>
-7.271,Highlighted in text (sum)

Weight,Feature
… 7 more positive …,… 7 more positive …
… 14 more negative …,… 14 more negative …
-1.019,<BIAS>
-10.512,Highlighted in text (sum)


We can hide weights by passing ``force_weights=False`` (they will still be shown if it's impossible to highlight the text).

In [8]:
# Explain the fifth test document without forcing the weight tables.
no_weights_expl = explain_prediction(
    clf, test['data'][4], vec, target_names=train['target_names'])
show_html_expl(no_weights_expl, force_weights=False)

Weight,Feature
… 5 more positive …,… 5 more positive …
… 5 more negative …,… 5 more negative …
-1.394,<BIAS>
-6.044,Highlighted in text (sum)

Weight,Feature
+7.625,Highlighted in text (sum)
… 5 more positive …,… 5 more positive …
… 5 more negative …,… 5 more negative …
-1.015,<BIAS>

Weight,Feature
… 6 more positive …,… 6 more positive …
… 4 more negative …,… 4 more negative …
-1.015,<BIAS>
-8.702,Highlighted in text (sum)

Weight,Feature
… 2 more positive …,… 2 more positive …
… 8 more negative …,… 8 more negative …
-1.019,<BIAS>
-9.832,Highlighted in text (sum)


Show explanations for the winning class for the first 10 documents from the test data.

In [9]:
import numpy as np
# Walk through the first ten test documents and display one explanation per
# document, without forcing the weight tables.
class_names = train['target_names']
for doc in test['data'][:10]:
    expl = explain_prediction(clf, doc, vec, target_names=class_names)
    show_html_expl(expl, force_weights=False)

Weight,Feature
-1.394,<BIAS>
-3.078,Highlighted in text (sum)

Weight,Feature
-1.015,<BIAS>
-3.688,Highlighted in text (sum)

Weight,Feature
6.059,Highlighted in text (sum)
-1.015,<BIAS>

Weight,Feature
-1.019,<BIAS>
-6.486,Highlighted in text (sum)


Weight,Feature
… 1 more positive …,… 1 more positive …
… 1 more negative …,… 1 more negative …
-1.394,<BIAS>
-6.268,Highlighted in text (sum)

Weight,Feature
+7.279,Highlighted in text (sum)
… 1 more positive …,… 1 more positive …
… 1 more negative …,… 1 more negative …
-1.015,<BIAS>

Weight,Feature
… 1 more positive …,… 1 more positive …
… 1 more negative …,… 1 more negative …
-1.015,<BIAS>
-6.291,Highlighted in text (sum)

Weight,Feature
… 2 more positive …,… 2 more positive …
-1.019,<BIAS>
-8.049,Highlighted in text (sum)


Weight,Feature
… 6 more positive …,… 6 more positive …
… 15 more negative …,… 15 more negative …
-1.394,<BIAS>
-13.014,Highlighted in text (sum)

Weight,Feature
+10.045,Highlighted in text (sum)
… 9 more positive …,… 9 more positive …
… 12 more negative …,… 12 more negative …
-1.015,<BIAS>

Weight,Feature
… 14 more positive …,… 14 more positive …
… 7 more negative …,… 7 more negative …
-1.015,<BIAS>
-7.271,Highlighted in text (sum)

Weight,Feature
… 7 more positive …,… 7 more positive …
… 14 more negative …,… 14 more negative …
-1.019,<BIAS>
-10.512,Highlighted in text (sum)


Weight,Feature
… 58 more positive …,… 58 more positive …
… 119 more negative …,… 119 more negative …
-1.394,<BIAS>
-4.125,Highlighted in text (sum)

Weight,Feature
+4.109,Highlighted in text (sum)
… 83 more positive …,… 83 more positive …
… 94 more negative …,… 94 more negative …
-1.015,<BIAS>

Weight,Feature
… 66 more positive …,… 66 more positive …
… 111 more negative …,… 111 more negative …
-0.429,Highlighted in text (sum)
-1.015,<BIAS>

Weight,Feature
… 58 more positive …,… 58 more positive …
… 119 more negative …,… 119 more negative …
-1.019,<BIAS>
-1.853,Highlighted in text (sum)


Weight,Feature
… 5 more positive …,… 5 more positive …
… 5 more negative …,… 5 more negative …
-1.394,<BIAS>
-6.044,Highlighted in text (sum)

Weight,Feature
+7.625,Highlighted in text (sum)
… 5 more positive …,… 5 more positive …
… 5 more negative …,… 5 more negative …
-1.015,<BIAS>

Weight,Feature
… 6 more positive …,… 6 more positive …
… 4 more negative …,… 4 more negative …
-1.015,<BIAS>
-8.702,Highlighted in text (sum)

Weight,Feature
… 2 more positive …,… 2 more positive …
… 8 more negative …,… 8 more negative …
-1.019,<BIAS>
-9.832,Highlighted in text (sum)


Weight,Feature
… 6 more positive …,… 6 more positive …
… 10 more negative …,… 10 more negative …
-1.394,<BIAS>
-9.924,Highlighted in text (sum)

Weight,Feature
+1.281,Highlighted in text (sum)
… 9 more positive …,… 9 more positive …
… 7 more negative …,… 7 more negative …
-1.015,<BIAS>

Weight,Feature
+0.113,Highlighted in text (sum)
… 11 more positive …,… 11 more positive …
… 5 more negative …,… 5 more negative …
-1.015,<BIAS>

Weight,Feature
… 5 more positive …,… 5 more positive …
… 11 more negative …,… 11 more negative …
-1.019,<BIAS>
-8.194,Highlighted in text (sum)


Weight,Feature
-1.394,<BIAS>
-2.57,Highlighted in text (sum)

Weight,Feature
-0.003,Highlighted in text (sum)
-1.015,<BIAS>

Weight,Feature
-0.523,Highlighted in text (sum)
-1.015,<BIAS>

Weight,Feature
-0.438,Highlighted in text (sum)
-1.019,<BIAS>


Weight,Feature
… 47 more positive …,… 47 more positive …
… 68 more negative …,… 68 more negative …
-1.394,<BIAS>
-7.123,Highlighted in text (sum)

Weight,Feature
… 36 more positive …,… 36 more positive …
… 79 more negative …,… 79 more negative …
-1.015,<BIAS>
-8.056,Highlighted in text (sum)

Weight,Feature
+11.074,Highlighted in text (sum)
… 56 more positive …,… 56 more positive …
… 59 more negative …,… 59 more negative …
-1.015,<BIAS>

Weight,Feature
… 41 more positive …,… 41 more positive …
… 74 more negative …,… 74 more negative …
-1.019,<BIAS>
-2.920,Highlighted in text (sum)


Weight,Feature
+9.383,Highlighted in text (sum)
… 14 more positive …,… 14 more positive …
… 3 more negative …,… 3 more negative …
-1.394,<BIAS>

Weight,Feature
… 4 more positive …,… 4 more positive …
… 13 more negative …,… 13 more negative …
-1.015,<BIAS>
-6.294,Highlighted in text (sum)

Weight,Feature
… 9 more positive …,… 9 more positive …
… 8 more negative …,… 8 more negative …
-1.015,<BIAS>
-3.429,Highlighted in text (sum)

Weight,Feature
… 6 more positive …,… 6 more positive …
… 11 more negative …,… 11 more negative …
-1.019,<BIAS>
-6.381,Highlighted in text (sum)


Weight,Feature
… 23 more positive …,… 23 more positive …
… 44 more negative …,… 44 more negative …
-1.394,<BIAS>
-2.313,Highlighted in text (sum)

Weight,Feature
… 20 more positive …,… 20 more positive …
… 47 more negative …,… 47 more negative …
-1.015,<BIAS>
-5.475,Highlighted in text (sum)

Weight,Feature
… 42 more positive …,… 42 more positive …
… 25 more negative …,… 25 more negative …
-0.114,Highlighted in text (sum)
-1.015,<BIAS>

Weight,Feature
+0.149,Highlighted in text (sum)
… 21 more positive …,… 21 more positive …
… 46 more negative …,… 46 more negative …
-1.019,<BIAS>


Now use a vectorizer that skips stop words.

In [10]:
# Second pipeline: identical in shape to the first one, except that the
# vectorizer is configured with the built-in English stop-word list.
vec_stop, clf_stop = (
    TfidfVectorizer(stop_words='english'),
    LogisticRegressionCV(),
)
pipeline_stop = Pipeline(steps=[('vec', vec_stop), ('clf', clf_stop)])
# Last expression of the cell, so the fitted pipeline is echoed as output.
pipeline_stop.fit(train['data'], train['target'])

Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
   ...2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])

Words such as "the", "in", "of" are no longer used as features and are not highlighted.

In [11]:
# Explain the fifth test document again, this time with the stop-word-aware
# vectorizer and the classifier trained on its features.
stop_expl = explain_prediction(
    clf_stop, test['data'][4], vec_stop, target_names=train['target_names'])
show_html_expl(stop_expl, force_weights=False)

Weight,Feature
-1.397,<BIAS>
-6.387,Highlighted in text (sum)

Weight,Feature
6.991,Highlighted in text (sum)
-1.018,<BIAS>

Weight,Feature
-1.016,<BIAS>
-6.67,Highlighted in text (sum)

Weight,Feature
-1.071,<BIAS>
-9.29,Highlighted in text (sum)
