Merge pull request #12 from TeamHG-Memex/hashing-explain
Explain hashing vectorizer
lopuhin committed Sep 26, 2016
2 parents 504d1e4 + 6da8bb3 commit 26c6538
Showing 6 changed files with 113 additions and 44 deletions.
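Taken together, these changes let explain_prediction work directly with a HashingVectorizer: the vectorizer is wrapped in an InvertableHashingVectorizer fit on the document being explained, so the output shows recovered terms instead of opaque hashed feature names. A minimal sketch of the usage this enables (the 20-newsgroups setup is illustrative, mirroring the tests below):

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression

from eli5.sklearn import explain_prediction
from eli5.formatters import format_as_text

train = fetch_20newsgroups(subset='train',
                           categories=['alt.atheism', 'comp.graphics'])
docs, y = train.data, train.target

# A HashingVectorizer maps terms to columns with a hash function and keeps
# no vocabulary, so on its own it cannot say which term a column stands for.
vec = HashingVectorizer()
clf = LogisticRegression(random_state=42)
clf.fit(vec.fit_transform(docs), y)

# With this commit, explain_prediction detects the HashingVectorizer and
# inverts the hashing for the document being explained.
res = explain_prediction(clf, docs[0], vec=vec,
                         target_names=train.target_names, top=20)
print(format_as_text(res))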
13 changes: 10 additions & 3 deletions eli5/sklearn/explain_prediction.py
@@ -3,6 +3,7 @@

import numpy as np
import scipy.sparse as sp
+from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import (
ElasticNet,
Lars,
@@ -17,6 +18,7 @@
)
from sklearn.svm import LinearSVC, LinearSVR

+from eli5.sklearn.unhashing import InvertableHashingVectorizer, handle_hashing_vec
from eli5.sklearn.utils import (
get_feature_names,
get_coef,
@@ -35,7 +37,7 @@

@singledispatch
def explain_prediction(clf, doc, vec=None, top=_TOP, target_names=None,
-                       feature_names=None, vectorized=False):
+                       feature_names=None, vectorized=False, coef_scale=None):
""" Return an explanation of an estimator """
return {
"estimator": repr(clf),
@@ -51,8 +53,13 @@ def explain_prediction(clf, doc, vec=None, top=_TOP, target_names=None,
@explain_prediction.register(LinearSVC)
def explain_prediction_linear_classifier(
clf, doc, vec=None, top=_TOP, target_names=None,
-        feature_names=None, vectorized=False):
+        feature_names=None, vectorized=False, coef_scale=None):
""" Explain prediction of a linear classifier. """
+    if isinstance(vec, HashingVectorizer) and not vectorized:
+        vec = InvertableHashingVectorizer(vec)
+        vec.fit([doc])
+    feature_names, coef_scale = handle_hashing_vec(vec, feature_names,
+                                                   coef_scale)
feature_names = get_feature_names(clf, vec, feature_names=feature_names)
X = _get_X(doc, vec=vec, vectorized=vectorized)

@@ -73,7 +80,7 @@ def explain_prediction_linear_classifier(
}

def _weights(label_id):
-        coef = get_coef(clf, label_id)
+        coef = get_coef(clf, label_id, scale=coef_scale)
scores = _multiply(x, coef)
return get_top_features_dict(feature_names, scores, top)

25 changes: 6 additions & 19 deletions eli5/sklearn/explain_weights.py
@@ -28,7 +28,7 @@

from eli5._feature_weights import get_top_features_dict
from eli5.utils import argsort_k_largest
-from eli5.sklearn.unhashing import InvertableHashingVectorizer
+from eli5.sklearn.unhashing import handle_hashing_vec, is_invhashing
from eli5.sklearn.utils import (
get_coef,
is_multiclass_classifier,
@@ -148,10 +148,10 @@ def explain_linear_classifier_weights(clf, vec=None, top=_TOP, target_names=None
To print it use utilities from eli5.formatters.
"""
-    feature_names, coef_scale = _handle_hashing_vec(vec, feature_names,
-                                                     coef_scale)
+    feature_names, coef_scale = handle_hashing_vec(vec, feature_names,
+                                                   coef_scale)
feature_names = get_feature_names(clf, vec, feature_names=feature_names)
-    _extra_caveats = "\n" + HASHING_CAVEATS if _is_invhashing(vec) else ''
+    _extra_caveats = "\n" + HASHING_CAVEATS if is_invhashing(vec) else ''

def _features(label_id):
coef = get_coef(clf, label_id, scale=coef_scale)
@@ -306,10 +306,10 @@ def explain_linear_regressor_weights(clf, vec=None, feature_names=None,
To print it use utilities from eli5.formatters.
"""
-    feature_names, coef_scale = _handle_hashing_vec(vec, feature_names,
+    feature_names, coef_scale = handle_hashing_vec(vec, feature_names,
                                                     coef_scale)
feature_names = get_feature_names(clf, vec, feature_names=feature_names)
-    _extra_caveats = "\n" + HASHING_CAVEATS if _is_invhashing(vec) else ''
+    _extra_caveats = "\n" + HASHING_CAVEATS if is_invhashing(vec) else ''

def _features(target_id):
coef = get_coef(clf, target_id, scale=coef_scale)
@@ -343,16 +343,3 @@ def _label(target_id, target):
'estimator': repr(clf),
'method': 'linear model',
}


-def _handle_hashing_vec(vec, feature_names, coef_scale):
-    if _is_invhashing(vec):
-        if feature_names is None:
-            feature_names = vec.get_feature_names(always_signed=False)
-        if coef_scale is None:
-            coef_scale = vec.column_signs_
-    return feature_names, coef_scale
-
-
-def _is_invhashing(vec):
-    return isinstance(vec, InvertableHashingVectorizer)
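
In other words, explain_weights keeps the same behavior but now delegates to shared helpers that moved to eli5.sklearn.unhashing. Unlike explain_prediction, it has no document to invert the hashing with, so the caller fits an InvertableHashingVectorizer on a sample of the corpus first. Roughly (a sketch, assuming explain_weights is importable from eli5.sklearn like explain_prediction is in the tests):

from eli5.sklearn import explain_weights
from eli5.sklearn.unhashing import InvertableHashingVectorizer

# vec, clf, docs fit as in the earlier sketch
ivec = InvertableHashingVectorizer(vec)
ivec.fit(docs[:100])  # enough documents to recover the interesting terms

# handle_hashing_vec pulls feature names and coef_scale from ivec, and the
# HASHING_CAVEATS note is appended to the explanation.
res = explain_weights(clf, ivec, top=20)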
13 changes: 13 additions & 0 deletions eli5/sklearn/unhashing.py
@@ -230,3 +230,16 @@ def _format_name(names, signs, sep=" | ", always_signed=False):
if not always_signed and len(set(signs)) == 1:
return sep.join(names)
return sep.join(_signed(n, s) for n, s in zip(names, signs))


+def handle_hashing_vec(vec, feature_names, coef_scale):
+    if is_invhashing(vec):
+        if feature_names is None:
+            feature_names = vec.get_feature_names(always_signed=False)
+        if coef_scale is None:
+            coef_scale = vec.column_signs_
+    return feature_names, coef_scale
+
+
+def is_invhashing(vec):
+    return isinstance(vec, InvertableHashingVectorizer)
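
The point of coef_scale is that the hashing trick multiplies each term's counts by a pseudo-random +/-1 sign to keep collisions unbiased, so a learned coefficient can have its sign flipped relative to the term it mostly represents; column_signs_ records a sign per column so get_coef can flip the weights back. A small sketch of what the helper hands back, assuming ivec was fit as above:

feature_names, coef_scale = handle_hashing_vec(ivec, None, None)

# feature_names: recovered terms per column, e.g. 'sport', or a combined
# name such as 'sport | court' when several seen terms collide in one
# column (per _format_name above, always_signed=False omits the signs
# whenever all colliding terms agree on one).
# coef_scale: ivec.column_signs_, an array of +/-1 by which get_coef
# rescales coefficients so that a positive weight reads as support.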
1 change: 1 addition & 0 deletions eli5/sklearn/utils.py
@@ -91,6 +91,7 @@ def get_coef(clf, label_id, scale=None):
))
# print("shape is ok")
not_nan = ~np.isnan(scale)
+        coef = coef.copy()
coef[not_nan] *= scale[not_nan]

if not has_intercept(clf):
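
This one-line change is easy to miss but is what makes explanations repeatable: get_coef returns a row of clf.coef_, typically a view into the estimator's own weights, and scaling that view in place would corrupt the model a little more on every call. The new assert res == get_res() checks in the tests below guard exactly this. A plain-NumPy illustration of the aliasing bug the copy avoids:

import numpy as np

coef = np.array([1.0, 2.0, 3.0])    # stands in for clf.coef_[label_id]
scale = np.array([-1.0, 1.0, -1.0])

view = coef                 # without .copy(): same underlying buffer
view *= scale               # in-place multiply writes through ...
assert coef[0] == -1.0      # ... and the "model" itself has changed

safe = coef.copy()          # with the fix: scaling happens on a private copy
safe *= scale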
89 changes: 69 additions & 20 deletions tests/test_sklearn_explain_prediction.py
@@ -3,7 +3,8 @@
from pprint import pprint

from sklearn.datasets import make_regression
-from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.feature_extraction.text import (
+    CountVectorizer, TfidfVectorizer, HashingVectorizer)
from sklearn.feature_extraction.dict_vectorizer import DictVectorizer
from sklearn.linear_model import (
ElasticNet,
@@ -22,20 +23,20 @@
from sklearn.base import BaseEstimator
import pytest

-from eli5.sklearn import explain_prediction
+from eli5.sklearn import explain_prediction, InvertableHashingVectorizer
from eli5.formatters import format_as_text


@pytest.mark.parametrize(['clf'], [
-    [LogisticRegression()],
-    [LogisticRegression(multi_class='multinomial', solver='lbfgs')],
-    [LogisticRegression(fit_intercept=False)],
-    [LogisticRegressionCV()],
+    [LogisticRegression(random_state=42)],
+    [LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs')],
+    [LogisticRegression(random_state=42, fit_intercept=False)],
+    [LogisticRegressionCV(random_state=42)],
    [SGDClassifier(random_state=42)],
    [SGDClassifier(loss='log', random_state=42)],
-    [PassiveAggressiveClassifier()],
-    [Perceptron()],
-    [LinearSVC()],
+    [PassiveAggressiveClassifier(random_state=42)],
+    [Perceptron(random_state=42)],
+    [LinearSVC(random_state=42)],
])
def test_explain_linear(newsgroups_train, clf):
docs, y, target_names = newsgroups_train
@@ -44,7 +45,9 @@ def test_explain_linear(newsgroups_train, clf):
X = vec.fit_transform(docs)
clf.fit(X, y)

-    res = explain_prediction(clf, docs[0], vec=vec, target_names=target_names, top=20)
+    get_res = lambda: explain_prediction(
+        clf, docs[0], vec=vec, target_names=target_names, top=20)
+    res = get_res()
expl = format_as_text(res)
print(expl)
pprint(res)
@@ -59,32 +62,74 @@
assert str(label) in expl
assert 'file' in expl

+    assert res == get_res()

-def test_explain_linear_binary(newsgroups_train_binary):
-    docs, y, target_names = newsgroups_train_binary
-    vec = TfidfVectorizer()
-    clf = LogisticRegression()
-    X = vec.fit_transform(docs)
-    clf.fit(X, y)
-
-    res = explain_prediction(clf, docs[0], vec, target_names=target_names, top=20)
+def check_explain_linear_binary(res):
expl = format_as_text(res)
print(expl)
pprint(res)

assert len(res['classes']) == 1
e = res['classes'][0]
assert e['class'] == 'comp.graphics'
neg = {name for name, value in e['feature_weights']['neg']}
assert 'freedom' in neg
assert 'objective' in neg
assert 'comp.graphics' in expl
assert 'freedom' in expl
assert 'objective' in expl


+@pytest.mark.parametrize(['vec'], [
+    [CountVectorizer()],
+    [HashingVectorizer()],
+])
+def test_explain_linear_binary(vec, newsgroups_train_binary):
+    docs, y, target_names = newsgroups_train_binary
+    clf = LogisticRegression(random_state=42)
+    X = vec.fit_transform(docs)
+    clf.fit(X, y)
+
+    get_res = lambda: explain_prediction(
+        clf, docs[0], vec, target_names=target_names, top=20)
+    res = get_res()
+    check_explain_linear_binary(res)
+    assert res == get_res()
+    res_vectorized = explain_prediction(
+        clf, vec.transform([docs[0]])[0], vec, target_names=target_names,
+        top=20, vectorized=True)
+    if isinstance(vec, HashingVectorizer):
+        # InvertableHashingVectorizer must be passed with vectorized=True
+        neg_vectorized = {name for name, value in
+                          res_vectorized['classes'][0]['feature_weights']['neg']}
+        assert all(name.startswith('x') for name in neg_vectorized)
+    else:
+        assert res_vectorized == res


+def test_explain_hashing_vectorizer(newsgroups_train_binary):
+    # test that we can pass InvertableHashingVectorizer explicitly
+    vec = HashingVectorizer()
+    ivec = InvertableHashingVectorizer(vec)
+    clf = LogisticRegression(random_state=42)
+    docs, y, target_names = newsgroups_train_binary
+    ivec.fit([docs[0]])
+    X = vec.fit_transform(docs)
+    clf.fit(X, y)
+
+    get_res = lambda **kwargs: explain_prediction(
+        clf, docs[0], ivec, target_names=target_names, top=20, **kwargs)
+    res = get_res()
+    check_explain_linear_binary(res)
+    assert res == get_res()
+    res_vectorized = explain_prediction(
+        clf, vec.transform([docs[0]])[0], ivec, target_names=target_names,
+        top=20, vectorized=True)
+    pprint(res_vectorized)
+    assert res_vectorized == res
+
+    assert res == get_res(
+        feature_names=ivec.get_feature_names(always_signed=False),
+        coef_scale=ivec.column_signs_)


def test_explain_linear_dense():
clf = LogisticRegression()
Expand Down Expand Up @@ -146,6 +191,8 @@ def test_explain_linear_regression(boston_train, clf):
assert '<BIAS>' in expl
assert "'y'" in expl

+    assert res == explain_prediction(clf, X[0])


@pytest.mark.parametrize(['clf'], [
[ElasticNet(random_state=42)],
Expand Down Expand Up @@ -174,3 +221,5 @@ def test_explain_linear_regression_multitarget(clf):
assert 'x8' in expl
assert '<BIAS>' in expl
assert "'y2'" in expl

+    assert res == explain_prediction(clf, X[0])
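
One caveat these tests pin down: with vectorized=True there is no raw text to invert, so a plain HashingVectorizer yields only opaque x0, x1, ... feature names, and a pre-fit InvertableHashingVectorizer has to be passed instead. Roughly, continuing the earlier sketch:

row = vec.transform([docs[0]])[0]

explain_prediction(clf, row, vec, vectorized=True)   # names stay x0, x1, ...
explain_prediction(clf, row, ivec, vectorized=True)  # recovered terms again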
16 changes: 14 additions & 2 deletions tests/test_sklearn_explain_weights.py
@@ -37,7 +37,9 @@


def check_newsgroups_explanation_linear(clf, vec, target_names):
-    res = explain_weights(clf, vec, target_names=target_names, top=20)
+    get_res = lambda: explain_weights(
+        clf, vec, target_names=target_names, top=20)
+    res = get_res()
expl = format_as_text(res)
print(expl)

@@ -58,6 +60,8 @@ def check_newsgroups_explanation_linear(clf, vec, target_names):
for label in target_names:
assert str(label) in expl

+    assert res == get_res()


@pytest.mark.parametrize(['clf'], [
[LogisticRegression()],
@@ -149,12 +153,16 @@ def test_explain_random_forest(newsgroups_train, clf):
X = vec.fit_transform(docs)
clf.fit(X.toarray(), y)

-    res = explain_weights(clf, vec, target_names=target_names, top=30)
+    get_res = lambda: explain_weights(
+        clf, vec, target_names=target_names, top=30)
+    res = get_res()
expl = format_as_text(res)
print(expl)
assert 'feature importances' in expl
assert 'that 0.' in expl # high-ranked feature

+    assert res == get_res()


def test_explain_empty(newsgroups_train):
clf = LogisticRegression(C=0.01, penalty='l1')
@@ -203,6 +211,8 @@ def test_explain_linear_regression(boston_train, clf):
assert 'x9' in neg or 'x9' in pos
assert '<BIAS>' in neg or '<BIAS>' in pos

+    assert res == explain_weights(clf)


@pytest.mark.parametrize(['clf'], [
[ElasticNet(random_state=42)],
@@ -225,3 +235,5 @@ def test_explain_linear_regression_multitarget(clf):
pos, neg = top_pos_neg(res['targets'], 'target', 'y2')
assert 'x9' in neg or 'x9' in pos
assert '<BIAS>' in neg or '<BIAS>' in pos

+    assert res == explain_weights(clf)
