Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.2.11

* Fixed some cases where image elements were not being OCR'd

## 0.2.10

* Removed control characters from tesseract output
Expand Down
2 changes: 1 addition & 1 deletion scripts/version-sync.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ done
# Version appearing earliest in CHANGELOGFILE will be used as ground truth.
CHANGELOGFILE="CHANGELOG.md"
VERSIONFILE="unstructured_inference/__version__.py"
RE_SEMVER_FULL="(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(-((0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
# Pull out semver appearing earliest in CHANGELOGFILE.
LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$CHANGELOGFILE")

Expand Down
12 changes: 12 additions & 0 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,9 @@ class MockLayout:
def __init__(self, *elements):
    """Store the positional arguments as this mock layout's elements."""
    # Kept exactly as the tuple produced by ``*elements``; nothing is copied or converted.
    self.elements = elements

def __len__(self):
    """Return how many elements this mock layout holds."""
    element_count = len(self.elements)
    return element_count

def sort(self, key, inplace):
    """Return the stored elements unchanged; ``key`` and ``inplace`` are ignored."""
    unsorted_elements = self.elements
    return unsorted_elements

Expand All @@ -239,6 +242,9 @@ def __iter__(self):
def get_texts(self):
    """Return the ``text`` attribute of every element, in stored order."""
    texts = []
    for element in self.elements:
        texts.append(element.text)
    return texts

def filter_by(self, *args, **kwargs):
    """Return a new, empty ``MockLayout`` regardless of the arguments given."""
    empty_layout = MockLayout()
    return empty_layout


@pytest.mark.parametrize(
"block_text, layout_texts, expected_text",
Expand Down Expand Up @@ -334,3 +340,9 @@ def test_invalid_ocr_strategy_raises(mock_image):
)
def test_remove_control_characters(text, expected):
assert layout.remove_control_characters(text) == expected


def test_interpret_called_when_filter_empty(mock_image):
    """Check ``interpret_text_block`` is called exactly once for an empty ``MockLayout``."""
    patch_target = "unstructured_inference.inference.layout.interpret_text_block"
    with patch(patch_target):
        empty_layout = MockLayout()
        layout.aggregate_by_block(MockTextBlock(), mock_image, empty_layout)
        # Assert while the patch is still active, so the attribute is still the mock.
        layout.interpret_text_block.assert_called_once()
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.10" # pragma: no cover
__version__ = "0.2.11" # pragma: no cover
7 changes: 7 additions & 0 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,13 @@ def aggregate_by_block(
"""Extracts the text aggregated from the elements of the given layout that lie within the given
block."""
filtered_blocks = layout.filter_by(text_block, center=True)
# NOTE(alan): For now, if none of the elements discovered by layoutparser lie within the block,
# we fall back to interpreting the whole block. This still doesn't handle every edge case: for
# example, when some text elements lie within the block but an image element overlapping the
# block also has text inside the block, the text in the image would likely be ignored.
if not filtered_blocks:
text = interpret_text_block(text_block, image, ocr_strategy)
return text
for little_block in filtered_blocks:
little_block.text = interpret_text_block(little_block, image, ocr_strategy)
text = " ".join([x for x in filtered_blocks.get_texts() if x])
Expand Down