From 2bc8e01e626a4b11eb45ec0880d48fd39ec4486a Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Wed, 8 Mar 2023 22:12:03 -0600 Subject: [PATCH 1/4] Catch case where no elements are captured in filter --- unstructured_inference/inference/layout.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index ba13eff9..a3430cb8 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -298,6 +298,13 @@ def aggregate_by_block( """Extracts the text aggregated from the elements of the given layout that lie within the given block.""" filtered_blocks = layout.filter_by(text_block, center=True) + # NOTE(alan): For now, if none of the elements discovered by layoutparser are in the block + # we can try interpreting the whole block. This still doesn't handle edge cases, like when there + # are some text elements within the block, but there are image elements overlapping the block + # with text lying within the block. In this case the text in the image would likely be ignored. + if not filtered_blocks: + text = interpret_text_block(text_block, image, ocr_strategy) + return text for little_block in filtered_blocks: little_block.text = interpret_text_block(little_block, image, ocr_strategy) text = " ".join([x for x in filtered_blocks.get_texts() if x]) From 7e63b7d4ab303876671f10857d0f3b1255552468 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Wed, 8 Mar 2023 22:24:46 -0600 Subject: [PATCH 2/4] Update changelog --- CHANGELOG.md | 4 ++++ unstructured_inference/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f43c7976..9eb44fc4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.11 + +* Fixed some cases where image elements were not being OCR'd + ## 0.2.10 * Removed control characters from tesseract output diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 47aedffe..514448fa 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.2.10" # pragma: no cover +__version__ = "0.2.110" # pragma: no cover From 21752e6878308f146fb9a01d9beee57405cd3f44 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Wed, 8 Mar 2023 22:54:01 -0600 Subject: [PATCH 3/4] fix version sync bug --- scripts/version-sync.sh | 2 +- unstructured_inference/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/version-sync.sh b/scripts/version-sync.sh index 3b3e17bf..e0e873c5 100755 --- a/scripts/version-sync.sh +++ b/scripts/version-sync.sh @@ -16,7 +16,7 @@ done # Version appearing earliest in CHANGELOGFILE will be used as ground truth. CHANGELOGFILE="CHANGELOG.md" VERSIONFILE="unstructured_inference/__version__.py" -RE_SEMVER_FULL="(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(-((0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" +RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" # Pull out semver appearing earliest in CHANGELOGFILE. LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$CHANGELOGFILE") diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 514448fa..ac745293 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.2.110" # pragma: no cover +__version__ = "0.2.11" # pragma: no cover From 0bf0a2ccfbc00208ee6a8d21363c27104276e3e3 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Wed, 8 Mar 2023 23:40:15 -0600 Subject: [PATCH 4/4] Add test that failed before, passes now --- test_unstructured_inference/inference/test_layout.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index e1cd08d7..251ee4cb 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -230,6 +230,9 @@ class MockLayout: def __init__(self, *elements): self.elements = elements + def __len__(self): + return len(self.elements) + def sort(self, key, inplace): return self.elements @@ -239,6 +242,9 @@ def __iter__(self): def get_texts(self): return [el.text for el in self.elements] + def filter_by(self, *args, **kwargs): + return MockLayout() + @pytest.mark.parametrize( "block_text, layout_texts, expected_text", @@ -334,3 +340,9 @@ def test_invalid_ocr_strategy_raises(mock_image): ) def test_remove_control_characters(text, expected): assert layout.remove_control_characters(text) == expected + + +def test_interpret_called_when_filter_empty(mock_image): + with patch("unstructured_inference.inference.layout.interpret_text_block"): + layout.aggregate_by_block(MockTextBlock(), mock_image, MockLayout()) + layout.interpret_text_block.assert_called_once()