Commit
evaluation cleanup
kmike committed Dec 18, 2015
1 parent 111ad57 commit 1c72720
Showing 13 changed files with 173 additions and 160 deletions.
2 changes: 1 addition & 1 deletion docs/howitworks.rst
@@ -69,7 +69,7 @@ Based on a held-out dataset it looks like (1) produces better results.
We need noisy form type labels anyway, to check prediction quality.
To get these 'realistic' noisy form type labels we split the data into 10 folds,
and then for each fold we predict its labels using a form type detector
trained on the remaining 9 folds - see :func:`~.get_realistic_form_labels`.
trained on the remaining 9 folds.

.. _Conditional Random Field: https://en.wikipedia.org/wiki/Conditional_random_field
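
As an aside, the fold-and-predict idea described here is easy to sketch. A minimal illustration (not the library's exact code), assuming scikit-learn's cross_val_predict and folds grouped by website domain so a site never lands in both train and test parts:

    # Sketch: produce 'realistic' (noisy) form type labels by predicting
    # each fold with a model trained on the remaining folds.
    from sklearn.cross_validation import LabelKFold, cross_val_predict

    def realistic_labels(model, X, y, domains, n_folds=10):
        folds = LabelKFold(domains, n_folds=n_folds)  # folds grouped by domain
        return cross_val_predict(model, X, y, cv=folds)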

7 changes: 3 additions & 4 deletions docs/install.rst
@@ -22,9 +22,8 @@ its other dependencies run

pip install formasaurus[with-deps]

These packages may require extra steps to install, so the command above
may fail. In this case install dependencies manually, one by one
(follow their install instructions),
then run::
These packages may require extra steps to install, so the command above may
fail. In this case install dependencies manually, one by one (follow their
install instructions), then run::

pip install formasaurus
45 changes: 21 additions & 24 deletions formasaurus/__main__.py
@@ -7,14 +7,13 @@
formasaurus train <modelfile> [--data-folder <path>]
formasaurus run <modelfile> <url> [--threshold <probability>]
formasaurus check-data [--data-folder <path>]
formasaurus evaluate [--test-size <ratio>] [--cv <n_folds>] [--data-folder <path>]
formasaurus evaluate (forms|fields|all) [--cv <n_folds>] [--data-folder <path>]
formasaurus -h | --help
formasaurus --version
Options:
--data-folder <path> path to the data folder
--test-size <ratio> ratio of data to use for evaluation, from 0 to 1.0 [default: 0.25]
--cv <n_folds> use <n_folds> for cross-validation [default: 10]
--cv <n_folds> use <n_folds> for cross-validation [default: 20]
--threshold <probability> don't display predictions with probability below this threshold [default: 0.01]
To train an extractor for HTML form classification use "train" command.
@@ -23,11 +22,10 @@
To check the storage for consistency and print some stats use "check-data" command.
To check the expected quality of the default model trained on the
training data provided use "evaluate" command.
To check the estimated quality of the default form and form fields model
use "evaluate" command.
"""
from __future__ import absolute_import, print_function
import os
from collections import Counter

import docopt
@@ -40,15 +38,15 @@
from formasaurus.utils import download
from formasaurus.storage import Storage
from formasaurus.html import load_html
from formasaurus import evaluation, formtype_model
from formasaurus import formtype_model, fieldtype_model
from formasaurus.classifiers import DEFAULT_DATA_PATH


def main():
args = docopt.docopt(__doc__, version=formasaurus.__version__)

if args['--data-folder'] is None:
# by default, use 'data' folder relative to this file
args['--data-folder'] = os.path.join(os.path.dirname(__file__), 'data')
args['--data-folder'] = DEFAULT_DATA_PATH

if args['check-data']:
check_annotated_data(args['--data-folder'])
@@ -84,24 +82,23 @@ def main():

elif args['evaluate']:
n_folds = int(args["--cv"])
ratio = float(args['--test-size'])

store = Storage(args["--data-folder"])
schema = store.get_form_schema()
model = formtype_model.get_model()

annotations = store.iter_annotations(verbose=True, leave=True,
simplify_form_types=True)
X, y = zip(*((a.form, a.type) for a in annotations))
annotations = list(
store.iter_annotations(verbose=True, leave=True,
simplify_form_types=True,
simplify_field_types=True)
)

test_size = int(len(y) * ratio)
train_size = len(y) - test_size
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
if args['forms'] or args['all']:
print("Evaluating form classifier...")
formtype_model.print_classification_report(annotations,
n_folds=n_folds)
print("")

evaluation.print_metrics(model, X, y, X_train, X_test, y_train, y_test,
ipython=False, cv=n_folds, short_matrix=True,
class_map=schema.types_inv)
if args['fields'] or args['all']:
print("Evaluating form field classifier...")
fieldtype_model.print_classification_report(annotations,
n_folds=n_folds)


if __name__ == '__main__':
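With this change, both evaluations can be run in one go with an invocation along the lines of `formasaurus evaluate all --cv 20` (illustrative; per the usage string above, 20 folds is now the default, so --cv can be omitted).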
22 changes: 0 additions & 22 deletions formasaurus/annotation.py
@@ -30,14 +30,6 @@ def print_form_html(form):
print(get_cleaned_form_html(form))



def print_form_types(form_types):
print("\nAllowed form types and their shortcuts:")
for full_name, shortcuts in form_types.items():
print(" %s %s" % (shortcuts, full_name))
print("")


def get_annotation_folds(annotations, n_folds):
"""
Return (train_indices, test_indices) folds iterator.
@@ -48,17 +40,3 @@ def get_annotation_folds(annotations, n_folds):
labels=[get_domain(ann.url) for ann in annotations],
n_folds=n_folds
)


def get_annotation_train_test_indices(annotations, n_folds=4):
"""
Split annotations into train and test parts, return train and test indices.
The size of the test part is approximately ``len(annotations)/n_folds``.
It is guaranteed that forms from the same website can't be both
in train and test parts.
"""
for idx_train, idx_test in get_annotation_folds(annotations, n_folds):
break
return idx_train, idx_test
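
The grouping guarantee above is easy to verify. A small sanity-check sketch (illustrative only; assumes `annotations` as used above and that get_domain lives in formasaurus.utils, as this file's usage suggests):

    from formasaurus.utils import get_domain

    # No website may appear on both sides of any fold.
    for idx_train, idx_test in get_annotation_folds(annotations, n_folds=10):
        train_domains = {get_domain(annotations[i].url) for i in idx_train}
        test_domains = {get_domain(annotations[i].url) for i in idx_test}
        assert not (train_domains & test_domains)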


2 changes: 1 addition & 1 deletion formasaurus/classifiers.py
@@ -117,7 +117,7 @@ def train(self, annotations):
print("Training field type detector...")
self._field_model = fieldtype_model.train(
annotations=annotations,
use_precise_formtypes=True,
use_precise_form_types=True,
full_field_type_names=True,
full_form_type_names=self.form_classifier.full_type_names,
verbose=True,
53 changes: 0 additions & 53 deletions formasaurus/evaluation.py
@@ -27,39 +27,6 @@ def print_sparsity(clf):
print("Active features: %d out of possible %d" % (n_active, n_possible))


def print_cv_scores(model, X, y, cv=10):
""" Print cross-validation scores of a classifier """
scoring = 'f1' if SKLEARN_VERSION < '0.16' else 'f1_weighted'
cv_scores = cross_val_score(model, X, y, scoring=scoring, cv=cv,
verbose=False)
msg = (
u"%d-fold cross-validation F1: %0.3f (±%0.3f) "
u"min=%0.3f max=%0.3f" % (
cv, cv_scores.mean(), cv_scores.std() * 2,
cv_scores.min(), cv_scores.max()
)
)
if six.PY2:
encoding = getattr(sys.stdout, 'encoding', 'utf8') or 'ascii'
print(msg.encode(encoding, 'replace'))
else:
print(msg)


def fit_and_predict(model, X_train, X_test, y_train):
model.fit(X_train, y_train)
return model.predict(X_test)


def print_classification_report(y_train, y_test, y_pred, class_labels=None):
""" Print the classification report """
print(
"\nClassification report (%d training examples, %d testing "
"examples):\n" % (len(y_train), len(y_test))
)
print(classification_report(y_test, y_pred, target_names=class_labels))


def df_confusion_matrix(y_test, y_pred, class_labels=None):
"""
Return the confusion matrix as pandas.DataFrame.
@@ -82,26 +49,6 @@ def print_confusion_matrix(y_test, y_pred, class_labels=None, ipython=False):
print(df)


def print_metrics(model, X, y, X_train, X_test, y_train, y_test,
ipython=False, cv=10, short_matrix=False, class_map=None):
clf = model.steps[-1][1]
y_pred = fit_and_predict(model, X_train, X_test, y_train)

if class_map is not None:
class_labels = [class_map[c] for c in clf.classes_]
else:
class_labels = clf.classes_

print_classification_report(y_train, y_test, y_pred, class_labels)
print_sparsity(clf)

if short_matrix:
class_labels = clf.classes_
print_confusion_matrix(y_test, y_pred, class_labels, ipython=ipython)

print("\nRunning cross validation...")
print_cv_scores(model, X, y, cv=cv)


def get_informative_features(vectorizers, clf, class_labels, N):
"""
102 changes: 67 additions & 35 deletions formasaurus/fieldtype_model.py
@@ -22,7 +22,7 @@
We need noisy form type labels anyway, to check prediction quality.
To get these 'realistic' noisy form type labels we split the data into 10 folds,
and then for each fold we predict its labels using a form type detector
trained on the remaining 9 folds - see :func:`get_realistic_form_labels`.
trained on the remaining 9 folds.
"""
from __future__ import absolute_import, division
import warnings
@@ -33,7 +33,13 @@
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_predict
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import (
flat_f1_score,
flat_accuracy_score,
flat_classification_report,
sequence_accuracy_score
)
from sklearn_crfsuite.utils import flatten

from formasaurus import formtype_model
from formasaurus.html import get_fields_to_annotate, get_text_around_elems
@@ -42,16 +48,12 @@
from formasaurus.annotation import get_annotation_folds


scorer = make_scorer(flat_f1_score, average='weighted')
""" Default scorer for grid search. We're optimizing for F1. """


_PRECISE_C1_C2 = 0.1655, 0.0236 # values found by randomized search
_REALISTIC_C1_C2 = 0.247, 0.032 # values found by randomized search
scorer = make_scorer(flat_f1_score, average='micro')
""" Default scorer for grid search. We're optimizing for micro-averaged F1. """


def train(annotations,
use_precise_formtypes=True,
use_precise_form_types=True,
optimize_hyperparameters_iters=0,
full_form_type_names=False,
full_field_type_names=True,
@@ -64,22 +66,19 @@ def log(msg):
annotations = [a for a in annotations if a.fields_annotated]
log("Training on {} forms".format(len(annotations)))

if use_precise_formtypes:
if use_precise_form_types:
log("Using precise form types")
if full_form_type_names:
form_types = np.asarray([a.type_full for a in annotations])
else:
form_types = np.asarray([a.type for a in annotations])
# c1, c2 = 0.0223, 0.0033 # values found by randomized search
c1, c2 = _PRECISE_C1_C2
else:
log("Computing realistic form types")
form_types = get_realistic_form_labels(
form_types = formtype_model.get_realistic_form_labels(
annotations=annotations,
n_folds=10,
full_type_names=full_form_type_names
)
c1, c2 = _REALISTIC_C1_C2

log("Extracting features")
X, y = get_Xy(
@@ -88,7 +87,7 @@ def log(msg):
full_type_names=full_field_type_names,
)

crf = CRF(all_possible_transitions=True, max_iterations=100, c1=c1, c2=c2)
crf = get_model(use_precise_form_types)

if optimize_hyperparameters_iters != 0:
if optimize_hyperparameters_iters < 50:
@@ -136,26 +135,6 @@ def get_Xy(annotations, form_types, full_type_names=False):
return X, y


def get_realistic_form_labels(annotations, n_folds=10, model=None,
full_type_names=True):
"""
Return form type labels which form type detection model
is likely to produce.
"""
if model is None:
model = formtype_model.get_model()

X = [a.form for a in annotations]

if full_type_names:
y = np.asarray([a.type_full for a in annotations])
else:
y = np.asarray([a.type for a in annotations])

folds = get_annotation_folds(annotations, n_folds)
return cross_val_predict(model, X, y, cv=folds)


def get_form_features(form, form_type, field_elems=None):
"""
Return a list of feature dicts, a dict per visible submittable
@@ -227,3 +206,56 @@ def _elem_features(elem):

def _elem_attr(elem, attr):
return normalize(elem.get(attr, ''))


_PRECISE_C1_C2 = 0.1655, 0.0236 # values found by randomized search
_REALISTIC_C1_C2 = 0.247, 0.032 # values found by randomized search


def get_model(use_precise_form_types=True):
""" Return default CRF model """
c1, c2 = _PRECISE_C1_C2 if use_precise_form_types else _REALISTIC_C1_C2
return CRF(
all_possible_transitions=True,
max_iterations=100,
c1=c1,
c2=c2
)


def print_classification_report(annotations, n_folds=10, model=None):
""" Evaluate model, print classification report """
if model is None:
model = get_model(use_precise_form_types=True)

annotations = [a for a in annotations if a.fields_annotated]
form_types = formtype_model.get_realistic_form_labels(
annotations=annotations,
n_folds=n_folds,
full_type_names=False
)

X, y = get_Xy(
annotations=annotations,
form_types=form_types,
full_type_names=True,
)
cv = get_annotation_folds(annotations, n_folds=n_folds)
y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)

all_labels = list(annotations[0].field_schema.types.keys())
labels = sorted(set(flatten(y_pred)), key=lambda k: all_labels.index(k))
print(flat_classification_report(y, y_pred, digits=2,
labels=labels, target_names=labels))

print(
"{:0.1f}% fields are classified correctly.".format(
flat_accuracy_score(y, y_pred) * 100
)
)
print(
"All fields are classified correctly in {:0.1f}% forms.".format(
sequence_accuracy_score(y, y_pred) * 100
)
)
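
Putting the pieces together, the new reports can also be driven from Python, mirroring the `evaluate` branch of __main__.py above (a sketch; the data path is illustrative):

    from formasaurus.storage import Storage
    from formasaurus import formtype_model, fieldtype_model

    store = Storage('formasaurus/data')  # illustrative path
    annotations = list(store.iter_annotations(simplify_form_types=True,
                                              simplify_field_types=True))

    formtype_model.print_classification_report(annotations, n_folds=20)
    fieldtype_model.print_classification_report(annotations, n_folds=20)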
