Move code from evaluation.py to Form Type Detection notebook. See GH-7.

TeamHG-Memex · Dec 18, 2015 · 5c127f8 · 5c127f8
1 parent 6bb55ff
commit 5c127f8
Show file tree

Hide file tree

Showing 4 changed files with 617 additions and 938 deletions.
diff --git a/formasaurus/annotation.py b/formasaurus/annotation.py
@@ -96,6 +96,16 @@ def get_annotation_folds(annotations, n_folds):
     Return (train_indices, test_indices) folds iterator.
     It is guaranteed forms from the same website can't be both in
     train and test parts.
+
+    We must be careful when splitting the dataset into training and
+    evaluation parts: forms from the same domain should be in the same
+    "bin". There could be several pages from the same domain, and these
+    pages may have duplicate or similar forms (e.g. a search form on each
+    page). If we put one such form in the training dataset and another in
+    the evaluation dataset, the metrics will be too optimistic and can
+    lead us to choose the wrong features/models. For this reason
+    train_test_split from scikit-learn shouldn't be used here; LabelKFold
+    from scikit-learn is used instead, with the domain as the label.
     """
     return LabelKFold(
         labels=[get_domain(ann.url) for ann in annotations],

diff --git a/formasaurus/evaluation.py b/formasaurus/evaluation.py
@@ -3,17 +3,9 @@
 This module provides helper functions for evaluating formasaurus quality.
 """
 from __future__ import absolute_import, print_function
-from distutils.version import LooseVersion
-import sys
 
-import six
 import numpy as np
-import sklearn
-from sklearn.cross_validation import cross_val_score
-from sklearn.metrics import classification_report, confusion_matrix
-
-
-SKLEARN_VERSION = LooseVersion(sklearn.__version__)
+from sklearn.metrics import confusion_matrix
 
 
 def print_sparsity(clf):
@@ -47,50 +39,3 @@ def print_confusion_matrix(y_test, y_pred, class_labels=None, ipython=False):
         display(df)
     else:
         print(df)
-
-
-
-def get_informative_features(vectorizers, clf, class_labels, N):
-    """
-    Return text with features with the highest absolute coefficient
-    values, per class.
-    """
-    feature_names = []
-    for vec_name, vec in vectorizers:
-        feature_names.extend(
-            "%30s  %s" % (vec_name, name) for name in vec.get_feature_names()
-        )
-    features_by_class = []
-    for i, class_label in enumerate(class_labels):
-        topN = np.argsort(clf.coef_[i])[-N:]
-        bottomN = np.argsort(clf.coef_[i])[:N]
-        res = []
-
-        for j in reversed(topN):
-            coef = clf.coef_[i][j]
-            if coef > 0:
-                res.append("+%0.4f: %s" % (coef, feature_names[j]))
-
-        if (len(topN) >= N) or (len(bottomN) >= N):
-            res.append("   ...")
-
-        for j in reversed(bottomN):
-            coef = clf.coef_[i][j]
-            if coef < 0:
-                res.append("%0.4f: %s" % (coef, feature_names[j]))
-        features_by_class.append((class_label, '\n'.join(res)))
-    return features_by_class
-
-
-def print_informative_features(features, clf, top, classes=None, class_map=None):
-    vectorizers = [(name, vec) for (name, fe, vec) in features]
-    feat_info = get_informative_features(vectorizers, clf, clf.classes_, top)
-    for cls, report in feat_info:
-        if classes is not None and cls not in classes:
-            continue
-        if class_map is not None:
-            print(class_map[cls])
-        else:
-            print(cls)
-        print(report)
-        print("-"*80)
diff --git a/formasaurus/formtype_model.py b/formasaurus/formtype_model.py
@@ -7,12 +7,11 @@
 
 import numpy as np
 from formasaurus.annotation import get_annotation_folds
-from formasaurus.evaluation import print_confusion_matrix
 from sklearn.cross_validation import cross_val_predict
 
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
-from sklearn.metrics import f1_score, classification_report, accuracy_score
+from sklearn.metrics import classification_report, accuracy_score
 from sklearn.pipeline import make_pipeline, make_union
 from sklearn.linear_model import SGDClassifier, LogisticRegression
 from sklearn.svm import LinearSVC