Skip to content

Commit

Permalink
Feat: form parsing placeholders (#3034)
Browse files Browse the repository at this point in the history
Allows introduction of form extraction in the future - sets up the
FormKeysValues element & format, puts in an empty function call in the
partition_pdf_or_image pipeline.
  • Loading branch information
MillCheck authored May 16, 2024
1 parent 1fb0fe5 commit e6ada05
Show file tree
Hide file tree
Showing 9 changed files with 285 additions and 29 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.13.8-dev12
## 0.13.8-dev13

### Enhancements

Expand All @@ -7,6 +7,7 @@
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.

### Features
* **Add form extraction basics (document elements and placeholder code in partition)**. This lays the groundwork for future form extraction. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.

### Fixes

Expand Down
149 changes: 149 additions & 0 deletions example-docs/test_evaluate_files/unstructured_output/form.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
[
{
"type": "FormKeysValues",
"element_id": "MOCK_FORM_ID",
"text": "",
"metadata": {
"coordinates": {
"points": [
[
35.15625,
95.556640625
],
[
710.357666015625,
95.556640625
],
[
710.357666015625,
887.890625
],
[
35.15625,
887.890625
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1,
"key_value_pairs": [
{
"key": {
"text": "MOCK KEY",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_KEY_ID_1",
"text": "MOCK KEY",
"metadata": {
"coordinates": {
"points": [
[
503.271484375,
96.3897705078125
],
[
503.271484375,
107.5164794921875
],
[
606.103515625,
107.5164794921875
],
[
606.103515625,
96.3897705078125
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"value": {
"text": "MOCK VALUE",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_VALUE_ID",
"text": "MOCK VALUE",
"metadata": {
"coordinates": {
"points": [
[
557.568359375,
124.8626708984375
],
[
557.568359375,
136.6607666015625
],
[
595.556640625,
136.6607666015625
],
[
595.556640625,
124.8626708984375
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"confidence": 0.0
},
{
"key": {
"text": "MOCK KEY 2",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_KEY_ID_2",
"text": "MOCK KEY 2",
"metadata": {
"coordinates": {
"points": [
[
428.52783203125,
124.0478515625
],
[
428.52783203125,
136.6943359375
],
[
473.81591796875,
136.6943359375
],
[
473.81591796875,
124.0478515625
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"value": null,
"confidence": 0.0
}
],
"file_directory": "dataset/testing_data/images",
"filename": "MOCK.png"
}
}
]
14 changes: 13 additions & 1 deletion test_unstructured/documents/test_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
from __future__ import annotations

import copy
import io
import json
import pathlib
from functools import partial

import pytest

from test_unstructured.unit_utils import assign_hash_ids
from test_unstructured.unit_utils import assign_hash_ids, example_doc_path
from unstructured.cleaners.core import clean_bullets, clean_prefix
from unstructured.documents.coordinates import (
CoordinateSystem,
Expand All @@ -31,6 +32,7 @@
Title,
assign_and_map_hash_ids,
)
from unstructured.partition.json import partition_json


@pytest.mark.parametrize("element", [Element(), Text(text=""), CheckBox()])
Expand Down Expand Up @@ -744,3 +746,13 @@ def test_id_to_hash_calculates(text, sequence_number, filename, page_number, exp
)
assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match"
assert element.id == expected_hash, "ID should be set"


def test_formskeysvalues_reads_saves():
    """FormKeysValues elements survive a serialize -> partition_json round-trip unchanged."""
    fixture_path = example_doc_path("test_evaluate_files/unstructured_output/form.json")
    first_pass = partition_json(filename=fixture_path)

    # -- re-serialize the parsed elements into an in-memory JSON "file" --
    serialized = json.dumps([el.to_dict() for el in first_pass])
    in_memory_file = io.StringIO(serialized)

    second_pass = partition_json(file=in_memory_file)

    assert first_pass == second_pass
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.8-dev12" # pragma: no cover
__version__ = "0.13.8-dev13" # pragma: no cover
64 changes: 64 additions & 0 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,18 @@ class Link(TypedDict):
start_index: int


class FormKeyOrValue(TypedDict):
    """One side (either the key or the value) of a form key-value pair."""

    # -- text of this key or value --
    text: str
    # -- NOTE(review): presumably the id of an already-detected layout element this entry
    # -- corresponds to, None when there is none; confirm against the form-extraction code --
    layout_element_id: Optional[str]
    # -- nested Text element for this entry, or None; it is converted to a dict when the
    # -- metadata is serialized and back into an element when it is loaded --
    custom_element: Optional[Text]


class FormKeyValuePair(TypedDict):
    """A form key together with its (optional) value and a confidence figure."""

    key: FormKeyOrValue
    # -- None when the key has no associated value --
    value: Optional[FormKeyOrValue]
    # -- NOTE(review): presumably the extraction confidence for this pair; range not
    # -- established here, confirm against the form-extraction code --
    confidence: float


class ElementMetadata:
"""Fully-dynamic replacement for dataclass-based ElementMetadata."""

Expand Down Expand Up @@ -176,6 +188,7 @@ class ElementMetadata:
header_footer_type: Optional[str]
# -- used in chunks only, when chunk must be split mid-text to fit window --
is_continuation: Optional[bool]
key_value_pairs: Optional[list[FormKeyValuePair]]
languages: Optional[list[str]]
last_modified: Optional[str]
link_texts: Optional[list[str]]
Expand Down Expand Up @@ -327,6 +340,8 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata:
self.data_source = DataSourceMetadata.from_dict(field_value)
elif field_name == "orig_elements":
self.orig_elements = elements_from_base64_gzipped_json(field_value)
elif field_name == "key_value_pairs":
self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value)
else:
setattr(self, field_name, field_value)

Expand Down Expand Up @@ -392,6 +407,8 @@ def to_dict(self) -> dict[str, Any]:
meta_dict["data_source"] = self.data_source.to_dict()
if self.orig_elements is not None:
meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements)
if self.key_value_pairs is not None:
meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs)

return meta_dict

Expand Down Expand Up @@ -494,6 +511,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
"text_as_html": cls.FIRST, # -- only occurs in Table --
"table_as_cells": cls.FIRST, # -- only occurs in Table --
"url": cls.FIRST,
"key_value_pairs": cls.DROP, # -- only occurs in FormKeysValues --
}


Expand Down Expand Up @@ -660,6 +678,7 @@ class ElementType:
PAGE_FOOTER = "Page-footer"
PAGE_NUMBER = "PageNumber"
CODE_SNIPPET = "CodeSnippet"
FORM_KEYS_VALUES = "FormKeysValues"

@classmethod
def to_dict(cls):
Expand Down Expand Up @@ -992,6 +1011,12 @@ class PageNumber(Text):
category = "PageNumber"


class FormKeysValues(Text):
    """An element for capturing Key-Value dicts (forms).

    The key-value pairs themselves are carried in the `key_value_pairs` metadata field, which
    only occurs on this element type.
    """

    category = "FormKeysValues"


TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
ElementType.TITLE: Title,
ElementType.SECTION_HEADER: Title,
Expand Down Expand Up @@ -1029,4 +1054,43 @@ class PageNumber(Text):
ElementType.PAGE_BREAK: PageBreak,
ElementType.CODE_SNIPPET: CodeSnippet,
ElementType.PAGE_NUMBER: PageNumber,
ElementType.FORM_KEYS_VALUES: FormKeysValues,
}


def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
    """Turn the dict-form `custom_element` entries of `kv_pairs` back into elements.

    The `key_value_pairs` metadata field usually holds nested Text elements. After loading from
    JSON (e.g. via `partition_json`) those are plain dicts and must be converted explicitly.

    `kv_pairs` is modified in place and returned. Per the original implementation note this is
    safe because the caller has already deep-copied the incoming metadata.
    """
    from unstructured.staging.base import elements_from_dicts

    for pair in kv_pairs:
        key_side = pair["key"]
        if key_side["custom_element"] is not None:
            # -- single-item list in, single-item tuple-unpack out --
            (key_side["custom_element"],) = elements_from_dicts([key_side["custom_element"]])

        value_side = pair["value"]
        # -- a key may have no value at all, or a value with no nested element --
        if value_side is None or value_side["custom_element"] is None:
            continue
        (value_side["custom_element"],) = elements_from_dicts([value_side["custom_element"]])

    return kv_pairs


def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
    """Return a copy of `kv_pairs` with each nested `custom_element` converted to a dict.

    The `key_value_pairs` metadata field usually holds nested Text elements. Those must be
    converted with `.to_dict()` before the metadata can be serialized, e.g. when
    `FormKeysValues.to_dict()` is used.

    The input is deep-copied first, so the caller's pairs and elements are left untouched.
    """
    serializable = copy.deepcopy(kv_pairs)

    for pair in serializable:
        key_side = pair["key"]
        key_element = key_side["custom_element"]
        if key_element is not None:
            key_side["custom_element"] = key_element.to_dict()

        value_side = pair["value"]
        # -- a key may have no value at all --
        if value_side is None:
            continue
        value_element = value_side["custom_element"]
        if value_element is not None:
            value_side["custom_element"] = value_element.to_dict()

    return serializable
13 changes: 10 additions & 3 deletions unstructured/partition/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
from unstructured.documents.elements import Element, process_metadata
from unstructured.file_utils.filetype import add_metadata
from unstructured.partition.common import exactly_one
from unstructured.partition.lang import (
check_language_args,
)
from unstructured.partition.lang import check_language_args
from unstructured.partition.pdf import partition_pdf_or_image
from unstructured.partition.utils.constants import PartitionStrategy

Expand All @@ -33,6 +31,8 @@ def partition_image(
extract_image_block_to_payload: bool = False,
date_from_file_object: bool = False,
starting_page_number: int = 1,
extract_forms: bool = False,
form_extraction_skip_tables: bool = True,
**kwargs: Any,
) -> list[Element]:
"""Parses an image into a list of interpreted elements.
Expand Down Expand Up @@ -90,6 +90,11 @@ def partition_image(
date_from_file_object
Applies only when providing file via `file` parameter. If this option is True, attempt to
infer last_modified metadata from bytes, otherwise set it to None.
extract_forms
Whether the form extraction logic should be run
(results in adding FormKeysValues elements to output).
form_extraction_skip_tables
Whether the form extraction logic should ignore regions designated as Tables.
"""
exactly_one(filename=filename, file=file)

Expand All @@ -111,5 +116,7 @@ def partition_image(
extract_image_block_to_payload=extract_image_block_to_payload,
date_from_file_object=date_from_file_object,
starting_page_number=starting_page_number,
extract_forms=extract_forms,
form_extraction_skip_tables=form_extraction_skip_tables,
**kwargs,
)
Loading

0 comments on commit e6ada05

Please sign in to comment.