Merge pull request #12 from TeamHG-Memex/hashing-explain
Explain hashing vectorizer
lopuhin committed Sep 26, 2016
2 parents 504d1e4 + 6da8bb3 commit 26c6538
Showing 6 changed files with 113 additions and 44 deletions.
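Taken together, these changes let explain_prediction work directly with a HashingVectorizer: the vectorizer is wrapped in an InvertableHashingVectorizer fit on the document being explained, so the output shows recovered terms instead of opaque hashed feature names. A minimal sketch of the usage this enables (the 20-newsgroups setup is illustrative, mirroring the tests below):

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression

from eli5.sklearn import explain_prediction
from eli5.formatters import format_as_text

train = fetch_20newsgroups(subset='train',
                           categories=['alt.atheism', 'comp.graphics'])
docs, y = train.data, train.target

# A HashingVectorizer maps terms to columns with a hash function and keeps
# no vocabulary, so on its own it cannot say which term a column stands for.
vec = HashingVectorizer()
clf = LogisticRegression(random_state=42)
clf.fit(vec.fit_transform(docs), y)

# With this commit, explain_prediction detects the HashingVectorizer and
# inverts the hashing for the document being explained.
res = explain_prediction(clf, docs[0], vec=vec,
                         target_names=train.target_names, top=20)
print(format_as_text(res))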
13 changes: 10 additions & 3 deletions eli5/sklearn/explain_prediction.py
@@ -3,6 +3,7 @@

import numpy as np
import scipy.sparse as sp
+from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import (
ElasticNet,
Lars,
@@ -17,6 +18,7 @@
)
from sklearn.svm import LinearSVC, LinearSVR

+from eli5.sklearn.unhashing import InvertableHashingVectorizer, handle_hashing_vec
from eli5.sklearn.utils import (
get_feature_names,
get_coef,
@@ -35,7 +37,7 @@

@singledispatch
def explain_prediction(clf, doc, vec=None, top=_TOP, target_names=None,
-                       feature_names=None, vectorized=False):
+                       feature_names=None, vectorized=False, coef_scale=None):
""" Return an explanation of an estimator """
return {
"estimator": repr(clf),
@@ -51,8 +53,13 @@ def explain_prediction(clf, doc, vec=None, top=_TOP, target_names=None,
@explain_prediction.register(LinearSVC)
def explain_prediction_linear_classifier(
clf, doc, vec=None, top=_TOP, target_names=None,
-        feature_names=None, vectorized=False):
+        feature_names=None, vectorized=False, coef_scale=None):
""" Explain prediction of a linear classifier. """
+    if isinstance(vec, HashingVectorizer) and not vectorized:
+        vec = InvertableHashingVectorizer(vec)
+        vec.fit([doc])
+    feature_names, coef_scale = handle_hashing_vec(vec, feature_names,
+                                                   coef_scale)
feature_names = get_feature_names(clf, vec, feature_names=feature_names)
X = _get_X(doc, vec=vec, vectorized=vectorized)

@@ -73,7 +80,7 @@ def explain_prediction_linear_classifier(
}

def _weights(label_id):
-        coef = get_coef(clf, label_id)
+        coef = get_coef(clf, label_id, scale=coef_scale)
scores = _multiply(x, coef)
return get_top_features_dict(feature_names, scores, top)

25 changes: 6 additions & 19 deletions eli5/sklearn/explain_weights.py
@@ -28,7 +28,7 @@

from eli5._feature_weights import get_top_features_dict
from eli5.utils import argsort_k_largest
-from eli5.sklearn.unhashing import InvertableHashingVectorizer
+from eli5.sklearn.unhashing import handle_hashing_vec, is_invhashing
from eli5.sklearn.utils import (
get_coef,
is_multiclass_classifier,
@@ -148,10 +148,10 @@ def explain_linear_classifier_weights(clf, vec=None, top=_TOP, target_names=None
To print it use utilities from eli5.formatters.
"""
-    feature_names, coef_scale = _handle_hashing_vec(vec, feature_names,
-                                                     coef_scale)
+    feature_names, coef_scale = handle_hashing_vec(vec, feature_names,
+                                                   coef_scale)
feature_names = get_feature_names(clf, vec, feature_names=feature_names)
-    _extra_caveats = "\n" + HASHING_CAVEATS if _is_invhashing(vec) else ''
+    _extra_caveats = "\n" + HASHING_CAVEATS if is_invhashing(vec) else ''

def _features(label_id):
coef = get_coef(clf, label_id, scale=coef_scale)
@@ -306,10 +306,10 @@ def explain_linear_regressor_weights(clf, vec=None, feature_names=None,
To print it use utilities from eli5.formatters.
"""
-    feature_names, coef_scale = _handle_hashing_vec(vec, feature_names,
+    feature_names, coef_scale = handle_hashing_vec(vec, feature_names,
                                                     coef_scale)
feature_names = get_feature_names(clf, vec, feature_names=feature_names)
-    _extra_caveats = "\n" + HASHING_CAVEATS if _is_invhashing(vec) else ''
+    _extra_caveats = "\n" + HASHING_CAVEATS if is_invhashing(vec) else ''

def _features(target_id):
coef = get_coef(clf, target_id, scale=coef_scale)
@@ -343,16 +343,3 @@ def _label(target_id, target):
'estimator': repr(clf),
'method': 'linear model',
}


-def _handle_hashing_vec(vec, feature_names, coef_scale):
-    if _is_invhashing(vec):
-        if feature_names is None:
-            feature_names = vec.get_feature_names(always_signed=False)
-        if coef_scale is None:
-            coef_scale = vec.column_signs_
-    return feature_names, coef_scale
-
-
-def _is_invhashing(vec):
-    return isinstance(vec, InvertableHashingVectorizer)
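
In other words, explain_weights keeps the same behavior but now delegates to shared helpers that moved to eli5.sklearn.unhashing. Unlike explain_prediction, it has no document to invert the hashing with, so the caller fits an InvertableHashingVectorizer on a sample of the corpus first. Roughly (a sketch, assuming explain_weights is importable from eli5.sklearn like explain_prediction is in the tests):

from eli5.sklearn import explain_weights
from eli5.sklearn.unhashing import InvertableHashingVectorizer

# vec, clf, docs fit as in the earlier sketch
ivec = InvertableHashingVectorizer(vec)
ivec.fit(docs[:100])  # enough documents to recover the interesting terms

# handle_hashing_vec pulls feature names and coef_scale from ivec, and the
# HASHING_CAVEATS note is appended to the explanation.
res = explain_weights(clf, ivec, top=20)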
13 changes: 13 additions & 0 deletions eli5/sklearn/unhashing.py
@@ -230,3 +230,16 @@ def _format_name(names, signs, sep=" | ", always_signed=False):
if not always_signed and len(set(signs)) == 1:
return sep.join(names)
return sep.join(_signed(n, s) for n, s in zip(names, signs))


+def handle_hashing_vec(vec, feature_names, coef_scale):
+    if is_invhashing(vec):
+        if feature_names is None:
+            feature_names = vec.get_feature_names(always_signed=False)
+        if coef_scale is None:
+            coef_scale = vec.column_signs_
+    return feature_names, coef_scale
+
+
+def is_invhashing(vec):
+    return isinstance(vec, InvertableHashingVectorizer)
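
The point of coef_scale is that the hashing trick multiplies each term's counts by a pseudo-random +/-1 sign to keep collisions unbiased, so a learned coefficient can have its sign flipped relative to the term it mostly represents; column_signs_ records a sign per column so get_coef can flip the weights back. A small sketch of what the helper hands back, assuming ivec was fit as above:

feature_names, coef_scale = handle_hashing_vec(ivec, None, None)

# feature_names: recovered terms per column, e.g. 'sport', or a combined
# name such as 'sport | court' when several seen terms collide in one
# column (per _format_name above, always_signed=False omits the signs
# whenever all colliding terms agree on one).
# coef_scale: ivec.column_signs_, an array of +/-1 by which get_coef
# rescales coefficients so that a positive weight reads as support.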
1 change: 1 addition & 0 deletions eli5/sklearn/utils.py
@@ -91,6 +91,7 @@ def get_coef(clf, label_id, scale=None):
))
# print("shape is ok")
not_nan = ~np.isnan(scale)
+        coef = coef.copy()
coef[not_nan] *= scale[not_nan]

if not has_intercept(clf):
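
This one-line change is easy to miss but is what makes explanations repeatable: get_coef returns a row of clf.coef_, typically a view into the estimator's own weights, and scaling that view in place would corrupt the model a little more on every call. The new assert res == get_res() checks in the tests below guard exactly this. A plain-NumPy illustration of the aliasing bug the copy avoids:

import numpy as np

coef = np.array([1.0, 2.0, 3.0])    # stands in for clf.coef_[label_id]
scale = np.array([-1.0, 1.0, -1.0])

view = coef                 # without .copy(): same underlying buffer
view *= scale               # in-place multiply writes through ...
assert coef[0] == -1.0      # ... and the "model" itself has changed

safe = coef.copy()          # with the fix: scaling happens on a private copy
safe *= scale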
89 changes: 69 additions & 20 deletions tests/test_sklearn_explain_prediction.py
@@ -3,7 +3,8 @@
from pprint import pprint

from sklearn.datasets import make_regression
-from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
+from sklearn.feature_extraction.text import (
+    CountVectorizer, TfidfVectorizer, HashingVectorizer)
from sklearn.feature_extraction.dict_vectorizer import DictVectorizer
from sklearn.linear_model import (
ElasticNet,
@@ -22,20 +23,20 @@
from sklearn.base import BaseEstimator
import pytest

-from eli5.sklearn import explain_prediction
+from eli5.sklearn import explain_prediction, InvertableHashingVectorizer
from eli5.formatters import format_as_text


@pytest.mark.parametrize(['clf'], [
-    [LogisticRegression()],
-    [LogisticRegression(multi_class='multinomial', solver='lbfgs')],
-    [LogisticRegression(fit_intercept=False)],
-    [LogisticRegressionCV()],
+    [LogisticRegression(random_state=42)],
+    [LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs')],
+    [LogisticRegression(random_state=42, fit_intercept=False)],
+    [LogisticRegressionCV(random_state=42)],
    [SGDClassifier(random_state=42)],
    [SGDClassifier(loss='log', random_state=42)],
-    [PassiveAggressiveClassifier()],
-    [Perceptron()],
-    [LinearSVC()],
+    [PassiveAggressiveClassifier(random_state=42)],
+    [Perceptron(random_state=42)],
+    [LinearSVC(random_state=42)],
])
def test_explain_linear(newsgroups_train, clf):
docs, y, target_names = newsgroups_train
@@ -44,7 +45,9 @@ def test_explain_linear(newsgroups_train, clf):
X = vec.fit_transform(docs)
clf.fit(X, y)

-    res = explain_prediction(clf, docs[0], vec=vec, target_names=target_names, top=20)
+    get_res = lambda: explain_prediction(
+        clf, docs[0], vec=vec, target_names=target_names, top=20)
+    res = get_res()
expl = format_as_text(res)
print(expl)
pprint(res)
@@ -59,32 +62,74 @@
assert str(label) in expl
assert 'file' in expl

+    assert res == get_res()

-def test_explain_linear_binary(newsgroups_train_binary):
-    docs, y, target_names = newsgroups_train_binary
-    vec = TfidfVectorizer()
-    clf = LogisticRegression()
-    X = vec.fit_transform(docs)
-    clf.fit(X, y)
-
-    res = explain_prediction(clf, docs[0], vec, target_names=target_names, top=20)
+def check_explain_linear_binary(res):
expl = format_as_text(res)
print(expl)
pprint(res)

assert len(res['classes']) == 1
e = res['classes'][0]
assert e['class'] == 'comp.graphics'
neg = {name for name, value in e['feature_weights']['neg']}
assert 'freedom' in neg
assert 'objective' in neg
assert 'comp.graphics' in expl
assert 'freedom' in expl
assert 'objective' in expl


+@pytest.mark.parametrize(['vec'], [
+    [CountVectorizer()],
+    [HashingVectorizer()],
+])
+def test_explain_linear_binary(vec, newsgroups_train_binary):
+    docs, y, target_names = newsgroups_train_binary
+    clf = LogisticRegression(random_state=42)
+    X = vec.fit_transform(docs)
+    clf.fit(X, y)
+
+    get_res = lambda: explain_prediction(
+        clf, docs[0], vec, target_names=target_names, top=20)
+    res = get_res()
+    check_explain_linear_binary(res)
+    assert res == get_res()
+    res_vectorized = explain_prediction(
+        clf, vec.transform([docs[0]])[0], vec, target_names=target_names,
+        top=20, vectorized=True)
+    if isinstance(vec, HashingVectorizer):
+        # InvertableHashingVectorizer must be passed with vectorized=True
+        neg_vectorized = {name for name, value in
+                          res_vectorized['classes'][0]['feature_weights']['neg']}
+        assert all(name.startswith('x') for name in neg_vectorized)
+    else:
+        assert res_vectorized == res


+def test_explain_hashing_vectorizer(newsgroups_train_binary):
+    # test that we can pass InvertableHashingVectorizer explicitly
+    vec = HashingVectorizer()
+    ivec = InvertableHashingVectorizer(vec)
+    clf = LogisticRegression(random_state=42)
+    docs, y, target_names = newsgroups_train_binary
+    ivec.fit([docs[0]])
+    X = vec.fit_transform(docs)
+    clf.fit(X, y)
+
+    get_res = lambda **kwargs: explain_prediction(
+        clf, docs[0], ivec, target_names=target_names, top=20, **kwargs)
+    res = get_res()
+    check_explain_linear_binary(res)
+    assert res == get_res()
+    res_vectorized = explain_prediction(
+        clf, vec.transform([docs[0]])[0], ivec, target_names=target_names,
+        top=20, vectorized=True)
+    pprint(res_vectorized)
+    assert res_vectorized == res
+
+    assert res == get_res(
+        feature_names=ivec.get_feature_names(always_signed=False),
+        coef_scale=ivec.column_signs_)


def test_explain_linear_dense():
clf = LogisticRegression()
Expand Down Expand Up @@ -146,6 +191,8 @@ def test_explain_linear_regression(boston_train, clf):
assert '<BIAS>' in expl
assert "'y'" in expl

+    assert res == explain_prediction(clf, X[0])


@pytest.mark.parametrize(['clf'], [
[ElasticNet(random_state=42)],
Expand Down Expand Up @@ -174,3 +221,5 @@ def test_explain_linear_regression_multitarget(clf):
assert 'x8' in expl
assert '<BIAS>' in expl
assert "'y2'" in expl

+    assert res == explain_prediction(clf, X[0])
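
One caveat these tests pin down: with vectorized=True there is no raw text to invert, so a plain HashingVectorizer yields only opaque x0, x1, ... feature names, and a pre-fit InvertableHashingVectorizer has to be passed instead. Roughly, continuing the earlier sketch:

row = vec.transform([docs[0]])[0]

explain_prediction(clf, row, vec, vectorized=True)   # names stay x0, x1, ...
explain_prediction(clf, row, ivec, vectorized=True)  # recovered terms again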
16 changes: 14 additions & 2 deletions tests/test_sklearn_explain_weights.py
@@ -37,7 +37,9 @@


def check_newsgroups_explanation_linear(clf, vec, target_names):
-    res = explain_weights(clf, vec, target_names=target_names, top=20)
+    get_res = lambda: explain_weights(
+        clf, vec, target_names=target_names, top=20)
+    res = get_res()
expl = format_as_text(res)
print(expl)

@@ -58,6 +60,8 @@ def check_newsgroups_explanation_linear(clf, vec, target_names):
for label in target_names:
assert str(label) in expl

+    assert res == get_res()


@pytest.mark.parametrize(['clf'], [
[LogisticRegression()],
@@ -149,12 +153,16 @@ def test_explain_random_forest(newsgroups_train, clf):
X = vec.fit_transform(docs)
clf.fit(X.toarray(), y)

-    res = explain_weights(clf, vec, target_names=target_names, top=30)
+    get_res = lambda: explain_weights(
+        clf, vec, target_names=target_names, top=30)
+    res = get_res()
expl = format_as_text(res)
print(expl)
assert 'feature importances' in expl
assert 'that 0.' in expl # high-ranked feature

+    assert res == get_res()


def test_explain_empty(newsgroups_train):
clf = LogisticRegression(C=0.01, penalty='l1')
@@ -203,6 +211,8 @@ def test_explain_linear_regression(boston_train, clf):
assert 'x9' in neg or 'x9' in pos
assert '<BIAS>' in neg or '<BIAS>' in pos

+    assert res == explain_weights(clf)


@pytest.mark.parametrize(['clf'], [
[ElasticNet(random_state=42)],
@@ -225,3 +235,5 @@ def test_explain_linear_regression_multitarget(clf):
pos, neg = top_pos_neg(res['targets'], 'target', 'y2')
assert 'x9' in neg or 'x9' in pos
assert '<BIAS>' in neg or '<BIAS>' in pos

+    assert res == explain_weights(clf)
