Skip to content

Commit

Permalink
Move code from evaluation.py to Form Type Detection notebook. See GH-7.
Browse files Browse the repository at this point in the history
  • Loading branch information
kmike committed Dec 18, 2015
1 parent 6bb55ff commit 5c127f8
Show file tree
Hide file tree
Showing 4 changed files with 617 additions and 938 deletions.
10 changes: 10 additions & 0 deletions formasaurus/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,16 @@ def get_annotation_folds(annotations, n_folds):
Return (train_indices, test_indices) folds iterator.
It is guaranteed that forms from the same website can't be in both the
train and test parts.
We must be careful when splitting the dataset into training and
evaluation parts: forms from the same domain should be in the same
"bin". There could be several pages from the same domain, and these
pages may have duplicate or similar forms (e.g. a search form on each
page). If we put one such form in training dataset and another in
evaluation dataset then the metrics will be too optimistic, and they
can lead us to choose the wrong features/models. For example,
train_test_split from scikit-learn shouldn't be used here. To fix it
LabelKFold from scikit-learn is used.
"""
return LabelKFold(
labels=[get_domain(ann.url) for ann in annotations],
Expand Down
57 changes: 1 addition & 56 deletions formasaurus/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,9 @@
This module provides helper functions for evaluating formasaurus quality.
"""
from __future__ import absolute_import, print_function
from distutils.version import LooseVersion
import sys

import six
import numpy as np
import sklearn
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix


SKLEARN_VERSION = LooseVersion(sklearn.__version__)
from sklearn.metrics import confusion_matrix


def print_sparsity(clf):
Expand Down Expand Up @@ -47,50 +39,3 @@ def print_confusion_matrix(y_test, y_pred, class_labels=None, ipython=False):
display(df)
else:
print(df)



def get_informative_features(vectorizers, clf, class_labels, N):
    """
    Return text with features with the highest absolute coefficient
    values, per class.

    ``vectorizers`` is an iterable of ``(name, vectorizer)`` pairs; each
    vectorizer must provide ``get_feature_names()``. Feature names are
    collected in the order the vectorizers are given and are looked up by
    the column index of ``clf.coef_``.

    ``clf`` is a fitted linear classifier exposing a 2D ``coef_``, and
    ``class_labels`` are the labels for the rows of ``clf.coef_``.

    ``N`` is the maximum number of positive and of negative features to
    report per class; ``N <= 0`` reports no features.

    The result is a list of ``(class_label, report)`` tuples, where
    ``report`` is a newline-joined string: positive coefficients (largest
    first), a `` ...`` separator, then negative coefficients.

    For a binary problem scikit-learn stores a single row of coefficients
    which describes the second class, so in this case a single entry
    for ``class_labels[1]`` is returned.
    """
    feature_names = [
        "%30s %s" % (vec_name, name)
        for vec_name, vec in vectorizers
        for name in vec.get_feature_names()
    ]

    # A negative N would turn the slices below into "all but |N|" slices.
    N = max(N, 0)

    class_labels = list(class_labels)
    if np.shape(clf.coef_)[0] == 1 and len(class_labels) == 2:
        # Binary case: the only coef_ row belongs to the positive class.
        rows = [(0, class_labels[1])]
    else:
        rows = list(enumerate(class_labels))

    features_by_class = []
    for i, class_label in rows:
        coefs = clf.coef_[i]
        order = np.argsort(coefs)  # ascending; sorted once, sliced twice
        bottomN = order[:N]
        # order[-N:] is the whole array for N == 0, so use an explicit start.
        topN = order[max(len(order) - N, 0):]
        res = []

        for j in reversed(topN):
            coef = coefs[j]
            if coef > 0:
                res.append("+%0.4f: %s" % (coef, feature_names[j]))

        if (len(topN) >= N) or (len(bottomN) >= N):
            res.append(" ...")

        for j in reversed(bottomN):
            coef = coefs[j]
            if coef < 0:
                res.append("%0.4f: %s" % (coef, feature_names[j]))

        features_by_class.append((class_label, '\n'.join(res)))
    return features_by_class


def print_informative_features(features, clf, top, classes=None, class_map=None):
    """
    Print the most informative features of ``clf`` for each class.

    ``features`` is an iterable of 3-tuples whose first and third items are
    a name and a vectorizer; the middle item is not used here. ``top`` is
    passed to ``get_informative_features`` as the per-class feature limit.

    If ``classes`` is given, only classes contained in it are printed.
    If ``class_map`` is given, ``class_map[cls]`` is printed as the heading
    instead of the raw class label.
    """
    named_vectorizers = [(name, vec) for name, _, vec in features]
    reports = get_informative_features(
        named_vectorizers, clf, clf.classes_, top
    )
    separator = "-" * 80
    for label, text in reports:
        if classes is None or label in classes:
            print(class_map[label] if class_map is not None else label)
            print(text)
            print(separator)
3 changes: 1 addition & 2 deletions formasaurus/formtype_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@

import numpy as np
from formasaurus.annotation import get_annotation_folds
from formasaurus.evaluation import print_confusion_matrix
from sklearn.cross_validation import cross_val_predict

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline, make_union
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
Expand Down

0 comments on commit 5c127f8

Please sign in to comment.