Commit
evaluation cleanup
kmike committed Dec 18, 2015
1 parent 111ad57 commit 1c72720
Showing 13 changed files with 173 additions and 160 deletions.
2 changes: 1 addition & 1 deletion docs/howitworks.rst
@@ -69,7 +69,7 @@ Based on a held-out dataset it looks like (1) produces better results.
We need noisy form type labels anyway, to check prediction quality.
To get these 'realistic' noisy form type labels we split the data into 10 folds,
and then for each fold we predict its labels using a form type detector
trained on the remaining 9 folds - see :func:`~.get_realistic_form_labels`.
trained on the remaining 9 folds.

.. _Conditional Random Field: https://en.wikipedia.org/wiki/Conditional_random_field
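
As an aside, the fold-and-predict idea described here is easy to sketch. A minimal illustration (not the library's exact code), assuming scikit-learn's cross_val_predict and folds grouped by website domain so a site never lands in both train and test parts:

    # Sketch: produce 'realistic' (noisy) form type labels by predicting
    # each fold with a model trained on the remaining folds.
    from sklearn.cross_validation import LabelKFold, cross_val_predict

    def realistic_labels(model, X, y, domains, n_folds=10):
        folds = LabelKFold(domains, n_folds=n_folds)  # folds grouped by domain
        return cross_val_predict(model, X, y, cv=folds)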

7 changes: 3 additions & 4 deletions docs/install.rst
@@ -22,9 +22,8 @@ its other dependencies run

pip install formasaurus[with-deps]

These packages may require extra steps to install, so the command above
may fail. In this case install dependencies manually, one by one
(follow their install instructions),
then run::
These packages may require extra steps to install, so the command above may
fail. In this case install dependencies manually, one by one (follow their
install instructions), then run::

pip install formasaurus
45 changes: 21 additions & 24 deletions formasaurus/__main__.py
@@ -7,14 +7,13 @@
formasaurus train <modelfile> [--data-folder <path>]
formasaurus run <modelfile> <url> [--threshold <probability>]
formasaurus check-data [--data-folder <path>]
formasaurus evaluate [--test-size <ratio>] [--cv <n_folds>] [--data-folder <path>]
formasaurus evaluate (forms|fields|all) [--cv <n_folds>] [--data-folder <path>]
formasaurus -h | --help
formasaurus --version
Options:
--data-folder <path> path to the data folder
--test-size <ratio> ratio of data to use for evaluation, from 0 to 1.0 [default: 0.25]
--cv <n_folds> use <n_folds> for cross-validation [default: 10]
--cv <n_folds> use <n_folds> for cross-validation [default: 20]
--threshold <probability> don't display predictions with probability below this threshold [default: 0.01]
To train an extractor for HTML form classification use "train" command.
@@ -23,11 +22,10 @@
To check the storage for consistency and print some stats use "check-data" command.
To check the expected quality of the default model trained on the
training data provided use "evaluate" command.
To check the estimated quality of the default form and form fields model
use "evaluate" command.
"""
from __future__ import absolute_import, print_function
import os
from collections import Counter

import docopt
@@ -40,15 +38,15 @@
from formasaurus.utils import download
from formasaurus.storage import Storage
from formasaurus.html import load_html
from formasaurus import evaluation, formtype_model
from formasaurus import formtype_model, fieldtype_model
from formasaurus.classifiers import DEFAULT_DATA_PATH


def main():
args = docopt.docopt(__doc__, version=formasaurus.__version__)

if args['--data-folder'] is None:
# by default, use 'data' folder relative to this file
args['--data-folder'] = os.path.join(os.path.dirname(__file__), 'data')
args['--data-folder'] = DEFAULT_DATA_PATH

if args['check-data']:
check_annotated_data(args['--data-folder'])
@@ -84,24 +82,23 @@ def main():

elif args['evaluate']:
n_folds = int(args["--cv"])
ratio = float(args['--test-size'])

store = Storage(args["--data-folder"])
schema = store.get_form_schema()
model = formtype_model.get_model()

annotations = store.iter_annotations(verbose=True, leave=True,
simplify_form_types=True)
X, y = zip(*((a.form, a.type) for a in annotations))
annotations = list(
store.iter_annotations(verbose=True, leave=True,
simplify_form_types=True,
simplify_field_types=True)
)

test_size = int(len(y) * ratio)
train_size = len(y) - test_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
if args['forms'] or args['all']:
print("Evaluating form classifier...")
formtype_model.print_classification_report(annotations,
n_folds=n_folds)
print("")

evaluation.print_metrics(model, X, y, X_train, X_test, y_train, y_test,
ipython=False, cv=n_folds, short_matrix=True,
class_map=schema.types_inv)
if args['fields'] or args['all']:
print("Evaluating form field classifier...")
fieldtype_model.print_classification_report(annotations,
n_folds=n_folds)


if __name__ == '__main__':
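With this change, both evaluations can be run in one go with an invocation along the lines of `formasaurus evaluate all --cv 20` (illustrative; per the usage string above, 20 folds is now the default, so --cv can be omitted).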
22 changes: 0 additions & 22 deletions formasaurus/annotation.py
@@ -30,14 +30,6 @@ def print_form_html(form):
print(get_cleaned_form_html(form))



def print_form_types(form_types):
print("\nAllowed form types and their shortcuts:")
for full_name, shortcuts in form_types.items():
print(" %s %s" % (shortcuts, full_name))
print("")


def get_annotation_folds(annotations, n_folds):
"""
Return (train_indices, test_indices) folds iterator.
@@ -48,17 +40,3 @@ def get_annotation_folds(annotations, n_folds):
labels=[get_domain(ann.url) for ann in annotations],
n_folds=n_folds
)


def get_annotation_train_test_indices(annotations, n_folds=4):
"""
Split annotations into train and test parts, return train and test indices.
The size of the test part is approximately ``len(annotations)/n_folds``.
It is guaranteed that forms from the same website can't be both
in train and test parts.
"""
for idx_train, idx_test in get_annotation_folds(annotations, n_folds):
break
return idx_train, idx_test
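
The grouping guarantee above is easy to verify. A small sanity-check sketch (illustrative only; assumes `annotations` as used above and that get_domain lives in formasaurus.utils, as this file's usage suggests):

    from formasaurus.utils import get_domain

    # No website may appear on both sides of any fold.
    for idx_train, idx_test in get_annotation_folds(annotations, n_folds=10):
        train_domains = {get_domain(annotations[i].url) for i in idx_train}
        test_domains = {get_domain(annotations[i].url) for i in idx_test}
        assert not (train_domains & test_domains)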


2 changes: 1 addition & 1 deletion formasaurus/classifiers.py
@@ -117,7 +117,7 @@ def train(self, annotations):
print("Training field type detector...")
self._field_model = fieldtype_model.train(
annotations=annotations,
use_precise_formtypes=True,
use_precise_form_types=True,
full_field_type_names=True,
full_form_type_names=self.form_classifier.full_type_names,
verbose=True,
53 changes: 0 additions & 53 deletions formasaurus/evaluation.py
@@ -27,39 +27,6 @@ def print_sparsity(clf):
print("Active features: %d out of possible %d" % (n_active, n_possible))


def print_cv_scores(model, X, y, cv=10):
""" Print cross-validation scores of a classifier """
scoring = 'f1' if SKLEARN_VERSION < '0.16' else 'f1_weighted'
cv_scores = cross_val_score(model, X, y, scoring=scoring, cv=cv,
verbose=False)
msg = (
u"%d-fold cross-validation F1: %0.3f (±%0.3f) "
u"min=%0.3f max=%0.3f" % (
cv, cv_scores.mean(), cv_scores.std() * 2,
cv_scores.min(), cv_scores.max()
)
)
if six.PY2:
encoding = getattr(sys.stdout, 'encoding', 'utf8') or 'ascii'
print(msg.encode(encoding, 'replace'))
else:
print(msg)


def fit_and_predict(model, X_train, X_test, y_train):
model.fit(X_train, y_train)
return model.predict(X_test)


def print_classification_report(y_train, y_test, y_pred, class_labels=None):
""" Print the classification report """
print(
"\nClassification report (%d training examples, %d testing "
"examples):\n" % (len(y_train), len(y_test))
)
print(classification_report(y_test, y_pred, target_names=class_labels))


def df_confusion_matrix(y_test, y_pred, class_labels=None):
"""
Return the confusion matrix as pandas.DataFrame.
@@ -82,26 +49,6 @@ def print_confusion_matrix(y_test, y_pred, class_labels=None, ipython=False):
print(df)


def print_metrics(model, X, y, X_train, X_test, y_train, y_test,
ipython=False, cv=10, short_matrix=False, class_map=None):
clf = model.steps[-1][1]
y_pred = fit_and_predict(model, X_train, X_test, y_train)

if class_map is not None:
class_labels = [class_map[c] for c in clf.classes_]
else:
class_labels = clf.classes_

print_classification_report(y_train, y_test, y_pred, class_labels)
print_sparsity(clf)

if short_matrix:
class_labels = clf.classes_
print_confusion_matrix(y_test, y_pred, class_labels, ipython=ipython)

print("\nRunning cross validation...")
print_cv_scores(model, X, y, cv=cv)


def get_informative_features(vectorizers, clf, class_labels, N):
"""
102 changes: 67 additions & 35 deletions formasaurus/fieldtype_model.py
@@ -22,7 +22,7 @@
We need noisy form type labels anyway, to check prediction quality.
To get these 'realistic' noisy form type labels we split the data into 10 folds,
and then for each fold we predict its labels using a form type detector
trained on the remaining 9 folds - see :func:`get_realistic_form_labels`.
trained on the remaining 9 folds.
"""
from __future__ import absolute_import, division
import warnings
@@ -33,7 +33,13 @@
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_predict
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import (
flat_f1_score,
flat_accuracy_score,
flat_classification_report,
sequence_accuracy_score
)
from sklearn_crfsuite.utils import flatten

from formasaurus import formtype_model
from formasaurus.html import get_fields_to_annotate, get_text_around_elems
@@ -42,16 +48,12 @@
from formasaurus.annotation import get_annotation_folds


scorer = make_scorer(flat_f1_score, average='weighted')
""" Default scorer for grid search. We're optimizing for F1. """


_PRECISE_C1_C2 = 0.1655, 0.0236 # values found by randomized search
_REALISTIC_C1_C2 = 0.247, 0.032 # values found by randomized search
scorer = make_scorer(flat_f1_score, average='micro')
""" Default scorer for grid search. We're optimizing for micro-averaged F1. """


def train(annotations,
use_precise_formtypes=True,
use_precise_form_types=True,
optimize_hyperparameters_iters=0,
full_form_type_names=False,
full_field_type_names=True,
@@ -64,22 +66,19 @@ def log(msg):
annotations = [a for a in annotations if a.fields_annotated]
log("Training on {} forms".format(len(annotations)))

if use_precise_formtypes:
if use_precise_form_types:
log("Using precise form types")
if full_form_type_names:
form_types = np.asarray([a.type_full for a in annotations])
else:
form_types = np.asarray([a.type for a in annotations])
# c1, c2 = 0.0223, 0.0033 # values found by randomized search
c1, c2 = _PRECISE_C1_C2
else:
log("Computing realistic form types")
form_types = get_realistic_form_labels(
form_types = formtype_model.get_realistic_form_labels(
annotations=annotations,
n_folds=10,
full_type_names=full_form_type_names
)
c1, c2 = _REALISTIC_C1_C2

log("Extracting features")
X, y = get_Xy(
@@ -88,7 +87,7 @@ def log(msg):
full_type_names=full_field_type_names,
)

crf = CRF(all_possible_transitions=True, max_iterations=100, c1=c1, c2=c2)
crf = get_model(use_precise_form_types)

if optimize_hyperparameters_iters != 0:
if optimize_hyperparameters_iters < 50:
@@ -136,26 +135,6 @@ def get_Xy(annotations, form_types, full_type_names=False):
return X, y


def get_realistic_form_labels(annotations, n_folds=10, model=None,
full_type_names=True):
"""
Return form type labels which form type detection model
is likely to produce.
"""
if model is None:
model = formtype_model.get_model()

X = [a.form for a in annotations]

if full_type_names:
y = np.asarray([a.type_full for a in annotations])
else:
y = np.asarray([a.type for a in annotations])

folds = get_annotation_folds(annotations, n_folds)
return cross_val_predict(model, X, y, cv=folds)


def get_form_features(form, form_type, field_elems=None):
"""
Return a list of feature dicts, a dict per visible submittable
@@ -227,3 +206,56 @@ def _elem_features(elem):

def _elem_attr(elem, attr):
return normalize(elem.get(attr, ''))


_PRECISE_C1_C2 = 0.1655, 0.0236 # values found by randomized search
_REALISTIC_C1_C2 = 0.247, 0.032 # values found by randomized search


def get_model(use_precise_form_types=True):
""" Return default CRF model """
c1, c2 = _PRECISE_C1_C2 if use_precise_form_types else _REALISTIC_C1_C2
return CRF(
all_possible_transitions=True,
max_iterations=100,
c1=c1,
c2=c2
)


def print_classification_report(annotations, n_folds=10, model=None):
""" Evaluate model, print classification report """
if model is None:
model = get_model(use_precise_form_types=True)

annotations = [a for a in annotations if a.fields_annotated]
form_types = formtype_model.get_realistic_form_labels(
annotations=annotations,
n_folds=n_folds,
full_type_names=False
)

X, y = get_Xy(
annotations=annotations,
form_types=form_types,
full_type_names=True,
)
cv = get_annotation_folds(annotations, n_folds=n_folds)
y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)

all_labels = list(annotations[0].field_schema.types.keys())
labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
print(flat_classification_report(y, y_pred, digits=2,
labels=labels, target_names=labels))

print(
"{:0.1f}% fields are classified correctly.".format(
flat_accuracy_score(y, y_pred) * 100
)
)
print(
"All fields are classified correctly in {:0.1f}% forms.".format(
sequence_accuracy_score(y, y_pred) * 100
)
)
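
Putting the pieces together, the new reports can also be driven from Python, mirroring the `evaluate` branch of __main__.py above (a sketch; the data path is illustrative):

    from formasaurus.storage import Storage
    from formasaurus import formtype_model, fieldtype_model

    store = Storage('formasaurus/data')  # illustrative path
    annotations = list(store.iter_annotations(simplify_form_types=True,
                                              simplify_field_types=True))

    formtype_model.print_classification_report(annotations, n_folds=20)
    fieldtype_model.print_classification_report(annotations, n_folds=20)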
