From 2bc8e01e626a4b11eb45ec0880d48fd39ec4486a Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Wed, 8 Mar 2023 22:12:03 -0600
Subject: [PATCH 1/4] Catch case where no elements are captured in filter

---
 unstructured_inference/inference/layout.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index ba13eff9..a3430cb8 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -298,6 +298,13 @@ def aggregate_by_block(
     """Extracts the text aggregated from the elements of the given layout that lie within the given
     block."""
     filtered_blocks = layout.filter_by(text_block, center=True)
+    # NOTE(alan): For now, if none of the elements discovered by layoutparser are in the block,
+    # we fall back to interpreting the whole block. This still doesn't handle edge cases, such as
+    # when some text elements lie within the block but image elements overlapping the block also
+    # contain text lying within it. In that case the text in the image would likely be ignored.
+    if not filtered_blocks:
+        text = interpret_text_block(text_block, image, ocr_strategy)
+        return text
     for little_block in filtered_blocks:
         little_block.text = interpret_text_block(little_block, image, ocr_strategy)
     text = " ".join([x for x in filtered_blocks.get_texts() if x])

From 7e63b7d4ab303876671f10857d0f3b1255552468 Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Wed, 8 Mar 2023 22:24:46 -0600
Subject: [PATCH 2/4] Update changelog

---
 CHANGELOG.md                          | 4 ++++
 unstructured_inference/__version__.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f43c7976..9eb44fc4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.2.11
+
+* Fixed some cases where image elements were not being OCR'd
+
 ## 0.2.10
 
 * Removed control characters from tesseract output
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index 47aedffe..514448fa 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.10"  # pragma: no cover
+__version__ = "0.2.110"  # pragma: no cover

From 21752e6878308f146fb9a01d9beee57405cd3f44 Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Wed, 8 Mar 2023 22:54:01 -0600
Subject: [PATCH 3/4] Fix version sync bug (use [0-9] instead of \d in grep -E regex)

---
 scripts/version-sync.sh               | 2 +-
 unstructured_inference/__version__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/version-sync.sh b/scripts/version-sync.sh
index 3b3e17bf..e0e873c5 100755
--- a/scripts/version-sync.sh
+++ b/scripts/version-sync.sh
@@ -16,7 +16,7 @@ done
 # Version appearing earliest in CHANGELOGFILE will be used as ground truth.
 CHANGELOGFILE="CHANGELOG.md"
 VERSIONFILE="unstructured_inference/__version__.py"
-RE_SEMVER_FULL="(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(-((0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
+RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
 # Pull out semver appearing earliest in CHANGELOGFILE.
 LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$CHANGELOGFILE")
 
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index 514448fa..ac745293 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.2.110"  # pragma: no cover
+__version__ = "0.2.11"  # pragma: no cover

From 0bf0a2ccfbc00208ee6a8d21363c27104276e3e3 Mon Sep 17 00:00:00 2001
From: Alan Bertl <alan@unstructured.io>
Date: Wed, 8 Mar 2023 23:40:15 -0600
Subject: [PATCH 4/4] Add test that failed before, passes now

---
 test_unstructured_inference/inference/test_layout.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
index e1cd08d7..251ee4cb 100644
--- a/test_unstructured_inference/inference/test_layout.py
+++ b/test_unstructured_inference/inference/test_layout.py
@@ -230,6 +230,9 @@ class MockLayout:
     def __init__(self, *elements):
         self.elements = elements
 
+    def __len__(self):
+        return len(self.elements)
+
     def sort(self, key, inplace):
         return self.elements
 
@@ -239,6 +242,9 @@ def __iter__(self):
     def get_texts(self):
         return [el.text for el in self.elements]
 
+    def filter_by(self, *args, **kwargs):
+        return MockLayout()
+
 
 @pytest.mark.parametrize(
     "block_text, layout_texts, expected_text",
@@ -334,3 +340,9 @@ def test_invalid_ocr_strategy_raises(mock_image):
 )
 def test_remove_control_characters(text, expected):
     assert layout.remove_control_characters(text) == expected
+
+
+def test_interpret_called_when_filter_empty(mock_image):
+    with patch("unstructured_inference.inference.layout.interpret_text_block") as mock_interpret:
+        layout.aggregate_by_block(MockTextBlock(), mock_image, MockLayout())
+    mock_interpret.assert_called_once()  # empty filter result must fall back to the whole block