Feat: form parsing placeholders (#3034)

Lays the groundwork for introducing form extraction in the future: sets up the FormKeysValues element and format, and adds an empty placeholder function call to the partition_pdf_or_image pipeline.
Unstructured-IO · May 16, 2024 · e6ada05 · e6ada05
1 parent 1fb0fe5
commit e6ada05
Show file tree

Hide file tree

Showing 9 changed files with 285 additions and 29 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.13.8-dev12
+## 0.13.8-dev13
 
 ### Enhancements
 
@@ -7,6 +7,7 @@
 * **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.
 
 ### Features
+* **Add form extraction basics (document elements and placeholder code in partition).** This lays the groundwork for future form extraction. Form extraction models are not currently available in the library; attempting to use this functionality raises a `NotImplementedError`.
 
 ### Fixes
 

diff --git a/example-docs/test_evaluate_files/unstructured_output/form.json b/example-docs/test_evaluate_files/unstructured_output/form.json
@@ -0,0 +1,149 @@
+[
+    {
+        "type": "FormKeysValues",
+        "element_id": "MOCK_FORM_ID",
+        "text": "",
+        "metadata": {
+            "coordinates": {
+                "points": [
+                    [
+                        35.15625,
+                        95.556640625
+                    ],
+                    [
+                        710.357666015625,
+                        95.556640625
+                    ],
+                    [
+                        710.357666015625,
+                        887.890625
+                    ],
+                    [
+                        35.15625,
+                        887.890625
+                    ]
+                ],
+                "system": "PixelSpace",
+                "layout_width": 754,
+                "layout_height": 1000
+            },
+            "page_number": 1,
+            "key_value_pairs": [
+                {
+                    "key": {
+                        "text": "MOCK KEY",
+                        "custom_element": {
+                            "type": "UncategorizedText",
+                            "element_id": "MOCK_KEY_ID_1",
+                            "text": "MOCK KEY",
+                            "metadata": {
+                                "coordinates": {
+                                    "points": [
+                                        [
+                                            503.271484375,
+                                            96.3897705078125
+                                        ],
+                                        [
+                                            503.271484375,
+                                            107.5164794921875
+                                        ],
+                                        [
+                                            606.103515625,
+                                            107.5164794921875
+                                        ],
+                                        [
+                                            606.103515625,
+                                            96.3897705078125
+                                        ]
+                                    ],
+                                    "system": "PixelSpace",
+                                    "layout_width": 754,
+                                    "layout_height": 1000
+                                },
+                                "page_number": 1
+                            }
+                        },
+                        "layout_element_id": null
+                    },
+                    "value": {
+                        "text": "MOCK VALUE",
+                        "custom_element": {
+                            "type": "UncategorizedText",
+                            "element_id": "MOCK_VALUE_ID",
+                            "text": "MOCK VALUE",
+                            "metadata": {
+                                "coordinates": {
+                                    "points": [
+                                        [
+                                            557.568359375,
+                                            124.8626708984375
+                                        ],
+                                        [
+                                            557.568359375,
+                                            136.6607666015625
+                                        ],
+                                        [
+                                            595.556640625,
+                                            136.6607666015625
+                                        ],
+                                        [
+                                            595.556640625,
+                                            124.8626708984375
+                                        ]
+                                    ],
+                                    "system": "PixelSpace",
+                                    "layout_width": 754,
+                                    "layout_height": 1000
+                                },
+                                "page_number": 1
+                            }
+                        },
+                        "layout_element_id": null
+                    },
+                    "confidence": 0.0
+                },
+                {
+                    "key": {
+                        "text": "MOCK KEY 2",
+                        "custom_element": {
+                            "type": "UncategorizedText",
+                            "element_id": "MOCK_KEY_ID_2",
+                            "text": "MOCK KEY 2",
+                            "metadata": {
+                                "coordinates": {
+                                    "points": [
+                                        [
+                                            428.52783203125,
+                                            124.0478515625
+                                        ],
+                                        [
+                                            428.52783203125,
+                                            136.6943359375
+                                        ],
+                                        [
+                                            473.81591796875,
+                                            136.6943359375
+                                        ],
+                                        [
+                                            473.81591796875,
+                                            124.0478515625
+                                        ]
+                                    ],
+                                    "system": "PixelSpace",
+                                    "layout_width": 754,
+                                    "layout_height": 1000
+                                },
+                                "page_number": 1
+                            }
+                        },
+                        "layout_element_id": null
+                    },
+                    "value": null,
+                    "confidence": 0.0
+                }
+            ],
+            "file_directory": "dataset/testing_data/images",
+            "filename": "MOCK.png"
+        }
+    }
+]
diff --git a/test_unstructured/documents/test_elements.py b/test_unstructured/documents/test_elements.py
@@ -5,13 +5,14 @@
 from __future__ import annotations
 
 import copy
+import io
 import json
 import pathlib
 from functools import partial
 
 import pytest
 
-from test_unstructured.unit_utils import assign_hash_ids
+from test_unstructured.unit_utils import assign_hash_ids, example_doc_path
 from unstructured.cleaners.core import clean_bullets, clean_prefix
 from unstructured.documents.coordinates import (
     CoordinateSystem,
@@ -31,6 +32,7 @@
     Title,
     assign_and_map_hash_ids,
 )
+from unstructured.partition.json import partition_json
 
 
 @pytest.mark.parametrize("element", [Element(), Text(text=""), CheckBox()])
@@ -744,3 +746,13 @@ def test_id_to_hash_calculates(text, sequence_number, filename, page_number, exp
     )
     assert element.id_to_hash(sequence_number) == expected_hash, "Returned ID does not match"
     assert element.id == expected_hash, "ID should be set"
+
+
+def test_formskeysvalues_reads_saves():
+    filename = example_doc_path("test_evaluate_files/unstructured_output/form.json")
+    as_read = partition_json(filename=filename)
+    tmp_file = io.StringIO()
+    json.dump([element.to_dict() for element in as_read], tmp_file)
+    tmp_file.seek(0)
+    as_read_2 = partition_json(file=tmp_file)
+    assert as_read == as_read_2
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.8-dev12"  # pragma: no cover
+__version__ = "0.13.8-dev13"  # pragma: no cover
diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
@@ -143,6 +143,18 @@ class Link(TypedDict):
     start_index: int
 
 
+class FormKeyOrValue(TypedDict):
+    text: str
+    layout_element_id: Optional[str]
+    custom_element: Optional[Text]
+
+
+class FormKeyValuePair(TypedDict):
+    key: FormKeyOrValue
+    value: Optional[FormKeyOrValue]
+    confidence: float
+
+
 class ElementMetadata:
     """Fully-dynamic replacement for dataclass-based ElementMetadata."""
 
@@ -176,6 +188,7 @@ class ElementMetadata:
     header_footer_type: Optional[str]
     # -- used in chunks only, when chunk must be split mid-text to fit window --
     is_continuation: Optional[bool]
+    key_value_pairs: Optional[list[FormKeyValuePair]]
     languages: Optional[list[str]]
     last_modified: Optional[str]
     link_texts: Optional[list[str]]
@@ -327,6 +340,8 @@ def from_dict(cls, meta_dict: dict[str, Any]) -> ElementMetadata:
                 self.data_source = DataSourceMetadata.from_dict(field_value)
             elif field_name == "orig_elements":
                 self.orig_elements = elements_from_base64_gzipped_json(field_value)
+            elif field_name == "key_value_pairs":
+                self.key_value_pairs = _kvform_rehydrate_internal_elements(field_value)
             else:
                 setattr(self, field_name, field_value)
 
@@ -392,6 +407,8 @@ def to_dict(self) -> dict[str, Any]:
             meta_dict["data_source"] = self.data_source.to_dict()
         if self.orig_elements is not None:
             meta_dict["orig_elements"] = elements_to_base64_gzipped_json(self.orig_elements)
+        if self.key_value_pairs is not None:
+            meta_dict["key_value_pairs"] = _kvform_pairs_to_dict(self.key_value_pairs)
 
         return meta_dict
 
@@ -494,6 +511,7 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]:
             "text_as_html": cls.FIRST,  # -- only occurs in Table --
             "table_as_cells": cls.FIRST,  # -- only occurs in Table --
             "url": cls.FIRST,
+            "key_value_pairs": cls.DROP,  # -- only occurs in FormKeysValues --
         }
 
 
@@ -660,6 +678,7 @@ class ElementType:
     PAGE_FOOTER = "Page-footer"
     PAGE_NUMBER = "PageNumber"
     CODE_SNIPPET = "CodeSnippet"
+    FORM_KEYS_VALUES = "FormKeysValues"
 
     @classmethod
     def to_dict(cls):
@@ -992,6 +1011,12 @@ class PageNumber(Text):
     category = "PageNumber"
 
 
+class FormKeysValues(Text):
+    """An element for capturing Key-Value dicts (forms)."""
+
+    category = "FormKeysValues"
+
+
 TYPE_TO_TEXT_ELEMENT_MAP: dict[str, type[Text]] = {
     ElementType.TITLE: Title,
     ElementType.SECTION_HEADER: Title,
@@ -1029,4 +1054,43 @@ class PageNumber(Text):
     ElementType.PAGE_BREAK: PageBreak,
     ElementType.CODE_SNIPPET: CodeSnippet,
     ElementType.PAGE_NUMBER: PageNumber,
+    ElementType.FORM_KEYS_VALUES: FormKeysValues,
 }
+
+
+def _kvform_rehydrate_internal_elements(kv_pairs: list[dict]) -> list[FormKeyValuePair]:
+    """
+    The key_value_pairs metadata field contains (in the vast majority of cases)
+    nested Text elements. Those need to be turned from dicts into Elements explicitly,
+    e.g. when partition_json is used.
+    """
+    from unstructured.staging.base import elements_from_dicts
+
+    # safe to overwrite - deepcopy already happened
+    for kv_pair in kv_pairs:
+        if kv_pair["key"]["custom_element"] is not None:
+            (kv_pair["key"]["custom_element"],) = elements_from_dicts(
+                [kv_pair["key"]["custom_element"]]
+            )
+        if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None:
+            (kv_pair["value"]["custom_element"],) = elements_from_dicts(
+                [kv_pair["value"]["custom_element"]]
+            )
+    return kv_pairs
+
+
+def _kvform_pairs_to_dict(kv_pairs: list[FormKeyValuePair]) -> list[dict]:
+    """
+    The key_value_pairs metadata field contains (in the vast majority of cases)
+    nested Text elements. Those need to be turned from Elements to dicts recursively,
+    e.g. when FormKeysValues.to_dict() is used.
+
+    """
+    kv_pairs: list[dict] = copy.deepcopy(kv_pairs)
+    for kv_pair in kv_pairs:
+        if kv_pair["key"]["custom_element"] is not None:
+            kv_pair["key"]["custom_element"] = kv_pair["key"]["custom_element"].to_dict()
+        if kv_pair["value"] is not None and kv_pair["value"]["custom_element"] is not None:
+            kv_pair["value"]["custom_element"] = kv_pair["value"]["custom_element"].to_dict()
+
+    return kv_pairs
diff --git a/unstructured/partition/image.py b/unstructured/partition/image.py
@@ -6,9 +6,7 @@
 from unstructured.documents.elements import Element, process_metadata
 from unstructured.file_utils.filetype import add_metadata
 from unstructured.partition.common import exactly_one
-from unstructured.partition.lang import (
-    check_language_args,
-)
+from unstructured.partition.lang import check_language_args
 from unstructured.partition.pdf import partition_pdf_or_image
 from unstructured.partition.utils.constants import PartitionStrategy
 
@@ -33,6 +31,8 @@ def partition_image(
     extract_image_block_to_payload: bool = False,
     date_from_file_object: bool = False,
     starting_page_number: int = 1,
+    extract_forms: bool = False,
+    form_extraction_skip_tables: bool = True,
     **kwargs: Any,
 ) -> list[Element]:
     """Parses an image into a list of interpreted elements.
@@ -90,6 +90,11 @@ def partition_image(
     date_from_file_object
         Applies only when providing file via `file` parameter. If this option is True, attempt
         to infer last_modified metadata from bytes; otherwise set it to None.
+    extract_forms
+        Whether the form extraction logic should be run
+        (results in adding FormKeysValues elements to output).
+    form_extraction_skip_tables
+        Whether the form extraction logic should ignore regions designated as Tables.
     """
     exactly_one(filename=filename, file=file)
 
@@ -111,5 +116,7 @@ def partition_image(
         extract_image_block_to_payload=extract_image_block_to_payload,
         date_from_file_object=date_from_file_object,
         starting_page_number=starting_page_number,
+        extract_forms=extract_forms,
+        form_extraction_skip_tables=form_extraction_skip_tables,
         **kwargs,
     )