Skip to content

Commit

Permalink
Update annotation notebooks and IPython widgets. Fix GH-10, GH-7.
Browse files Browse the repository at this point in the history
  • Loading branch information
kmike committed Dec 18, 2015
1 parent 48835f4 commit c0d50f9
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 120 deletions.
4 changes: 2 additions & 2 deletions formasaurus/data/index.json
Original file line number Diff line number Diff line change
Expand Up @@ -18630,14 +18630,14 @@
"html/yadman.net-1.html": {
"url": "http://yadman.net/DefaultEn.aspx",
"forms": [
"X"
"l"
],
"visible_html_fields": [
{
"cboVorod": "OS",
"chkRemember": "XX",
"cmdAzhansLogin": "XX",
"imgChange": "XX",
"imgChange": "Bs",
"mgPackage1": "XX",
"mgPackage2": "XX",
"mgPackage3": "XX",
Expand Down
31 changes: 15 additions & 16 deletions formasaurus/widgets.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def on_submit(_):
display(box)


def MultiFormAnnotator(annotations, form_types, field_types,
def MultiFormAnnotator(annotations,
annotate_fields=True, annotate_types=True,
save_func=None):
"""
Expand All @@ -51,8 +51,6 @@ def MultiFormAnnotator(annotations, form_types, field_types,
def render(i):
widget = FormAnnotator(
ann=annotations[i],
form_types=form_types,
field_types=field_types,
annotate_fields=annotate_fields,
annotate_types=annotate_types,
)
Expand All @@ -79,18 +77,17 @@ def on_change(name, value):
on_change('value', slider.value)


def FormAnnotator(ann, form_types, field_types, annotate_fields=True,
annotate_types=True, max_fields=80):
def FormAnnotator(ann, annotate_fields=True, annotate_types=True, max_fields=80):
"""
Widget for annotating a single HTML form.
"""
assert annotate_fields or annotate_types
form_types_inv = inverse_mapping(form_types)
form_types_inv = ann.form_schema.types_inv

children = []

if annotate_types:
children += [FormTypeSelect(ann, form_types)]
children += [FormTypeSelect(ann)]

tpl = """
<h4>
Expand All @@ -99,7 +96,7 @@ def FormAnnotator(ann, form_types, field_types, annotate_fields=True,
</h4>
"""
header = widgets.HTML(tpl.format(
url=ann.info['url'],
url=ann.url,
index=ann.index,
key=ann.key,
tp=form_types_inv.get(ann.type, '?')
Expand All @@ -115,7 +112,7 @@ def FormAnnotator(ann, form_types, field_types, annotate_fields=True,
]
else:
for name in names:
field_type_select = FieldTypeSelect(ann, name, field_types)
field_type_select = FieldTypeSelect(ann, name)
html_view = HtmlView(ann.form, name)
page = widgets.Box(children=[field_type_select, html_view])
pages.append(page)
Expand All @@ -131,13 +128,14 @@ def FormAnnotator(ann, form_types, field_types, annotate_fields=True,
return widgets.VBox(children, padding=8)


def FormTypeSelect(ann, form_types):
def FormTypeSelect(ann):
""" Form type edit widget """
form_types_inv = inverse_mapping(form_types)

form_types = ann.form_schema.types
tp = ann.info['forms'][ann.index]
type_select = widgets.ToggleButtons(
options=list(form_types.keys()),
value=form_types_inv[tp],
value=ann.form_schema.types_inv[tp],
padding=4,
description='form type:',
)
Expand All @@ -149,17 +147,18 @@ def on_change(name, value):
return type_select


def FieldTypeSelect(ann, field_name, field_types):
def FieldTypeSelect(ann, field_name):
""" Form field type edit widget """
tp = ann.info['visible_html_fields'][ann.index][field_name]
field_types_inv = inverse_mapping(field_types)
field_types = ann.field_schema.types
field_types_inv = ann.field_schema.types_inv
tp = ann.fields[field_name]
type_select = widgets.ToggleButtons(
options=list(field_types.keys()),
value=field_types_inv[tp],
)

def on_change(name, value):
ann.info['visible_html_fields'][ann.index][field_name] = field_types[value]
ann.fields[field_name] = field_types[value]

type_select.on_trait_change(on_change, 'value')
return type_select
Expand Down
2 changes: 1 addition & 1 deletion notebooks/Add New Pages.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"collapsed": false
},
Expand Down
139 changes: 38 additions & 101 deletions notebooks/Annotate.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -31,47 +31,22 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import re\n",
"\n",
"from ipywidgets import widgets\n",
"from ipywidgets import interact, interactive, fixed\n",
"from IPython.display import HTML, display\n",
"\n",
"from formasaurus.html import load_html, get_cleaned_form_html, get_field_names, get_fields_to_annotate\n",
"from formasaurus.annotation import print_form_html\n",
"from formasaurus.storage import Storage\n",
"from formasaurus.widgets import (\n",
" HtmlView, FormTypeSelect, FieldTypeSelect, FormAnnotator, MultiFormAnnotator, \n",
")\n",
"from formasaurus.utils import inverse_mapping\n",
"from formasaurus import annotation"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from formasaurus.widgets import MultiFormAnnotator\n",
"\n",
"storage = Storage(\"../formasaurus/data/\")\n",
"index = storage.get_index()\n",
"\n",
"def save_func():\n",
" storage.write_index(index) \n",
" \n",
"field_schema = storage.get_field_schema()\n",
"form_schema = storage.get_form_schema()\n",
"all_annotations = list(storage.iter_annotations(index, drop_na=False))\n",
"\n",
"def ann_by_type(types):\n",
"def ann_by_form_type(types):\n",
" \"\"\" Return all annotations with given form types (short names) \"\"\"\n",
" return [a for a in all_annotations if a.type in types]\n",
"\n",
Expand All @@ -80,124 +55,86 @@
" \"\"\" Return all annotations which has at least one field of a given type \"\"\"\n",
" return [\n",
" a for a in all_annotations \n",
" if field_type in a.info['visible_html_fields'][a.index].values()\n",
" if field_type in a.fields.values()\n",
" ] \n",
"\n",
"def get_fields_annotation(ann):\n",
" return ann.info['visible_html_fields'][ann.index]\n",
"\n",
"def fields_annotation_complete(ann, na_value):\n",
" field_ann = get_fields_annotation(ann)\n",
" if not field_ann:\n",
" return True\n",
" return all(v != na_value for v in field_ann.values())\n",
"\n",
"def fields_annotation_partial(ann, na_value):\n",
" field_ann = get_fields_annotation(ann)\n",
" if not field_ann:\n",
" return False\n",
" values = field_ann.values()\n",
" return any(v == na_value for v in values) and not all(v == na_value for v in values) "
"all_annotations = list(storage.iter_annotations(index, drop_na=False, simplify_form_types=False))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": []
},
{
"data": {
"text/plain": [
"Counter({'N': 9,\n",
" 'P': 6,\n",
" 'V': 16,\n",
" 'b': 22,\n",
" 'c': 91,\n",
" 'l': 185,\n",
" 'm': 78,\n",
" 'o': 77,\n",
" 'p': 78,\n",
" 'r': 123,\n",
" 's': 287})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"#storage.get_form_type_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Usage\n",
"\n",
"To annotate fields, get a list of FormAnnotation objects and create a MultiFormAnnotator. \n",
"Data is saved when you move to the next form. Below are a couple of examples: a widget for annotating unfinished forms and a widget for reviewing forms with captchas."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"storage.get_form_type_counts()"
"### Annotate forms which are not fully annotated yet"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"partial_annotations = [a for a in all_annotations if fields_annotation_partial(a, field_schema.na_value)]\n",
"partial_annotations = [a for a in all_annotations if a.fields_partially_annotated]\n",
"not_annotated_annotations = [\n",
" a for a in all_annotations if not fields_annotation_complete(a, field_schema.na_value)\n",
" a for a in all_annotations if a.fields and not a.fields_annotated\n",
"]\n",
"_annotations = partial_annotations + not_annotated_annotations\n",
"MultiFormAnnotator(_annotations, form_schema.types, field_schema.types, annotate_types=True, save_func=save_func)"
"MultiFormAnnotator(_annotations, annotate_types=True, save_func=save_func)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"FormAnnotation(form=<Element form at 0x110a58048>, type='l', index=0, info={'url': 'http://www.acozinhar.pt/portal/parceiros/', 'visible_html_fields': [{'pwd': 'p1', 'log': 'us', 'submit': 'Bs', 'rememberme': 'rC'}], 'forms': ['l']}, key='html/www.acozinhar.pt-0.html')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"cell_type": "markdown",
"metadata": {},
"source": [
"all_annotations[0]"
"### View and change annotations for forms with a specific field type"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"_annotations = ann_by_field_type('ab')\n",
"MultiFormAnnotator(_annotations, form_schema.types, field_schema.types, \n",
" annotate_types=True, annotate_fields=True, save_func=save_func)"
"_annotations = ann_by_field_type('ca') # forms with captchas\n",
"MultiFormAnnotator(_annotations, annotate_types=True, annotate_fields=True, save_func=save_func)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# _annotations = [a for a in all_annotations if a.type == 'b']\n",
"# MultiFormAnnotator(form_na_annotations, form_types, field_types, annotate_types=True, save_func=save_func)"
"# MultiFormAnnotator(form_na_annotations, annotate_types=True, save_func=save_func)"
]
},
{
Expand Down

0 comments on commit c0d50f9

Please sign in to comment.