Skip to content

Commit

Permalink
cleanup annotation.py
Browse files Browse the repository at this point in the history
  • Loading branch information
kmike committed Dec 18, 2015
1 parent 1c72720 commit dac3585
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 120 deletions.
34 changes: 18 additions & 16 deletions formasaurus/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,35 +26,38 @@
use "evaluate" command.
"""
from __future__ import absolute_import, print_function
import sys
from collections import Counter

import docopt

import formasaurus
from formasaurus.annotation import (
check_annotated_data,
print_form_html
)
from formasaurus.utils import download
from formasaurus.storage import Storage
from formasaurus.html import load_html
from formasaurus.html import load_html, get_cleaned_form_html
from formasaurus import formtype_model, fieldtype_model
from formasaurus.classifiers import DEFAULT_DATA_PATH


def main():
args = docopt.docopt(__doc__, version=formasaurus.__version__)

if args['--data-folder'] is None:
args['--data-folder'] = DEFAULT_DATA_PATH
data_folder = args['--data-folder']
if data_folder is None:
data_folder = DEFAULT_DATA_PATH

storage = Storage(data_folder)

if args['check-data']:
check_annotated_data(args['--data-folder'])
errors = storage.check()
storage.print_form_type_counts(simplify=False)
storage.print_form_type_counts(simplify=True)
print("Errors:", errors)
if errors:
sys.exit(1)

elif args['train']:
ex = formasaurus.FormFieldClassifier.trained_on(
data_folder=args["--data-folder"],
)
ex = formasaurus.FormFieldClassifier.trained_on(data_folder)
ex.save(args["<modelfile>"])

elif args['run']:
Expand All @@ -72,7 +75,7 @@ def main():

for form, probs in result:
print("-"*40)
print_form_html(form)
print(get_cleaned_form_html(form))
print("")
for tp, prob in Counter(probs).most_common():
tp_full = ex.form_types_inv[tp]
Expand All @@ -82,11 +85,10 @@ def main():

elif args['evaluate']:
n_folds = int(args["--cv"])
store = Storage(args["--data-folder"])
annotations = list(
store.iter_annotations(verbose=True, leave=True,
simplify_form_types=True,
simplify_field_types=True)
storage.iter_annotations(verbose=True, leave=True,
simplify_form_types=True,
simplify_field_types=True)
)

if args['forms'] or args['all']:
Expand Down
105 changes: 83 additions & 22 deletions formasaurus/annotation.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,94 @@
# -*- coding: utf-8 -*-
"""
HTML forms interactive annotation utilities.
"""
from __future__ import absolute_import, print_function
import sys
import collections

from sklearn.cross_validation import LabelKFold

from formasaurus.html import get_cleaned_form_html
from formasaurus.html import get_fields_to_annotate
from formasaurus.utils import get_domain
from formasaurus.storage import Storage


def check_annotated_data(data_folder):
    """
    Validate the annotated data stored in ``data_folder``.

    Runs the storage check, prints the form type counts (first with
    ``simplify=False``, then with ``simplify=True``) and the check
    result. The process exits with status code 1 when the check
    reports errors.
    """
    store = Storage(data_folder)
    problems = store.check()
    for simplified in (False, True):
        store.print_form_type_counts(simplify=simplified)
    print("Errors:", problems)
    if not problems:
        return
    sys.exit(1)


def print_form_html(form):
    """ Write the cleaned up HTML contents of a <form> to stdout """
    cleaned_html = get_cleaned_form_html(form)
    print(cleaned_html)
# Record describing a set of annotation labels.
# As far as the usage in FormAnnotation shows (assuming its form_schema and
# field_schema are AnnotationSchema instances -- TODO confirm):
#   types_inv -- mapping from a stored type value to its full type name
#   na_value  -- type value which marks a field as not annotated
# NOTE(review): types, skip_value and simplify_map are not used in the
# visible code; presumably `types` is the inverse of `types_inv` -- verify.
AnnotationSchema = collections.namedtuple(
    'AnnotationSchema',
    'types types_inv na_value skip_value simplify_map'
)


# Plain tuple layout of an annotated form; FormAnnotation below adds
# derived, read-only properties on top of it.
_FormAnnotation = collections.namedtuple(
    'FormAnnotation',
    'form type index info key form_schema field_schema'
)


class FormAnnotation(_FormAnnotation):
    """
    Annotated HTML form.

    Tuple fields:

    * ``form`` - the form element; it is passed to
      ``get_fields_to_annotate`` to find the fields to annotate;
    * ``type`` - form type, a key of ``form_schema.types_inv``;
    * ``index`` - position of this form's fields dict in
      ``info['visible_html_fields']``;
    * ``info`` - dict with (at least) ``'url'`` and
      ``'visible_html_fields'`` keys;
    * ``key`` - stored as is; this class only shows it in ``repr``;
    * ``form_schema``, ``field_schema`` - objects describing form and
      field types; ``types_inv`` is read from both of them, ``na_value``
      is read from ``field_schema``.
    """
    # All properties are derived from the tuple fields, so instances
    # need no __dict__ of their own; an empty __slots__ keeps them as
    # compact as the underlying namedtuple.
    __slots__ = ()

    @property
    def url(self):
        """ URL stored in the annotation info. """
        return self.info['url']

    @property
    def fields(self):
        """
        {"field name": "field type"} dict.
        """
        return self.info['visible_html_fields'][self.index]

    @property
    def fields_annotated(self):
        """ True if form has fields and all fields are annotated. """
        fields = self.fields
        if not fields:
            return False
        na_value = self.field_schema.na_value
        return all(tp != na_value for tp in fields.values())

    @property
    def fields_partially_annotated(self):
        """
        True when some fields are annotated and some are not annotated.
        """
        fields = self.fields
        if not fields:
            return False
        na_value = self.field_schema.na_value
        # Materialize once: the values are scanned twice below.
        values = list(fields.values())
        has_na = any(tp == na_value for tp in values)
        has_annotated = any(tp != na_value for tp in values)
        return has_na and has_annotated

    @property
    def field_elems(self):
        """
        Return a list of lxml Elements for the fields to annotate.
        Fields are returned in the order they appear in the form;
        only visible submittable fields are considered.
        """
        return get_fields_to_annotate(self.form)

    @property
    def field_types(self):
        """
        A list of field types, in the order they appear in the form.
        Only visible submittable fields are considered.
        """
        fields = self.fields
        return [fields[elem.name] for elem in self.field_elems]

    @property
    def field_types_full(self):
        """
        A list of long field type names, in the order they appear in
        the form. Only visible submittable fields are considered.
        """
        types_inv = self.field_schema.types_inv
        return [types_inv[tp] for tp in self.field_types]

    @property
    def type_full(self):
        """ Full form type name """
        return self.form_schema.types_inv[self.type]

    def __repr__(self):
        # The schemas and the whole info dict are left out on purpose:
        # only the url and this form's own fields are shown.
        return "FormAnnotation(form={!r}, type={!r}, index={!r}, url={!r}, key={!r}, fields={!r})".format(
            self.form, self.type, self.index, self.url, self.key, self.fields
        )


def get_annotation_folds(annotations, n_folds):
Expand Down
83 changes: 1 addition & 82 deletions formasaurus/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from tqdm import tqdm

from formasaurus.annotation import AnnotationSchema, FormAnnotation
from formasaurus.formhash import get_form_hash
from formasaurus.utils import get_domain, inverse_mapping
from formasaurus.html import (
Expand All @@ -20,88 +21,6 @@
get_field_names,
)

# Record describing a set of annotation labels.
# As far as the usage in FormAnnotation shows (assuming its form_schema and
# field_schema are AnnotationSchema instances -- TODO confirm):
#   types_inv -- mapping from a stored type value to its full type name
#   na_value  -- type value which marks a field as not annotated
# NOTE(review): types, skip_value and simplify_map are not used in the
# visible code; presumably `types` is the inverse of `types_inv` -- verify.
AnnotationSchema = collections.namedtuple(
    'AnnotationSchema',
    'types types_inv na_value skip_value simplify_map'
)


# Plain tuple layout of an annotated form; FormAnnotation below adds
# derived, read-only properties on top of it.
_FormAnnotation = collections.namedtuple(
    'FormAnnotation',
    'form type index info key form_schema field_schema'
)


class FormAnnotation(_FormAnnotation):
    """
    Annotated HTML form.

    Tuple fields:

    * ``form`` - the form element; it is passed to
      ``get_fields_to_annotate`` to find the fields to annotate;
    * ``type`` - form type, a key of ``form_schema.types_inv``;
    * ``index`` - position of this form's fields dict in
      ``info['visible_html_fields']``;
    * ``info`` - dict with (at least) ``'url'`` and
      ``'visible_html_fields'`` keys;
    * ``key`` - stored as is; this class only shows it in ``repr``;
    * ``form_schema``, ``field_schema`` - objects describing form and
      field types; ``types_inv`` is read from both of them, ``na_value``
      is read from ``field_schema``.
    """
    # All properties are derived from the tuple fields, so instances
    # need no __dict__ of their own; an empty __slots__ keeps them as
    # compact as the underlying namedtuple.
    __slots__ = ()

    @property
    def url(self):
        """ URL stored in the annotation info. """
        return self.info['url']

    @property
    def fields(self):
        """
        {"field name": "field type"} dict.
        """
        return self.info['visible_html_fields'][self.index]

    @property
    def fields_annotated(self):
        """ True if form has fields and all fields are annotated. """
        fields = self.fields
        if not fields:
            return False
        na_value = self.field_schema.na_value
        return all(tp != na_value for tp in fields.values())

    @property
    def fields_partially_annotated(self):
        """
        True when some fields are annotated and some are not annotated.
        """
        fields = self.fields
        if not fields:
            return False
        na_value = self.field_schema.na_value
        # Materialize once: the values are scanned twice below.
        values = list(fields.values())
        has_na = any(tp == na_value for tp in values)
        has_annotated = any(tp != na_value for tp in values)
        return has_na and has_annotated

    @property
    def field_elems(self):
        """
        Return a list of lxml Elements for the fields to annotate.
        Fields are returned in the order they appear in the form;
        only visible submittable fields are considered.
        """
        return get_fields_to_annotate(self.form)

    @property
    def field_types(self):
        """
        A list of field types, in the order they appear in the form.
        Only visible submittable fields are considered.
        """
        fields = self.fields
        return [fields[elem.name] for elem in self.field_elems]

    @property
    def field_types_full(self):
        """
        A list of long field type names, in the order they appear in
        the form. Only visible submittable fields are considered.
        """
        types_inv = self.field_schema.types_inv
        return [types_inv[tp] for tp in self.field_types]

    @property
    def type_full(self):
        """ Full form type name """
        return self.form_schema.types_inv[self.type]

    def __repr__(self):
        # The schemas and the whole info dict are left out on purpose:
        # only the url and this form's own fields are shown.
        return "FormAnnotation(form={!r}, type={!r}, index={!r}, url={!r}, key={!r}, fields={!r})".format(
            self.form, self.type, self.index, self.url, self.key, self.fields
        )


class Storage(object):
"""
Expand Down

0 comments on commit dac3585

Please sign in to comment.