From e2f51d0d450fe2130da0a5e0cf9b3827f70df7e2 Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Tue, 17 Oct 2023 15:54:42 -0600 Subject: [PATCH 1/3] fix: skips ordering elements coming from Chipper --- unstructured_inference/inference/layout.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 4c2ebd70..89f1bded 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -25,6 +25,7 @@ from unstructured_inference.inference.pdf import get_images_from_pdf_element from unstructured_inference.logger import logger from unstructured_inference.models.base import get_model +from unstructured_inference.models.chipper import UnstructuredChipperModel from unstructured_inference.models.detectron2onnx import ( UnstructuredDetectronONNXModel, ) @@ -253,7 +254,11 @@ def get_elements_with_detection_model( else: merged_layout = inferred_layout - elements = self.get_elements_from_layout(cast(List[TextRegion], merged_layout)) + order_elements = isinstance(self.detection_model, UnstructuredChipperModel) + elements = self.get_elements_from_layout( + cast(List[TextRegion], merged_layout), + order_elements=order_elements, + ) if self.analysis: self.inferred_layout = inferred_layout @@ -264,10 +269,15 @@ def get_elements_with_detection_model( return elements - def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutElement]: + def get_elements_from_layout( + self, + layout: List[TextRegion], + order_elements: bool = True, + ) -> List[LayoutElement]: """Uses the given Layout to separate the page text into elements, either extracting the text from the discovered layout blocks.""" - layout = order_layout(layout) + if order_elements: + layout = order_layout(layout) elements = [ get_element_from_block( block=e, From f5abd9b8e079b743baa3b759885cf7baa05178dc Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Tue, 17 Oct 2023 15:54:58 -0600 Subject: [PATCH 2/3] Changelog update --- CHANGELOG.md | 4 ++++ unstructured_inference/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae489a56..aab18929 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.7.10-dev0 + +* fix: Skips ordering elements coming from Chipper + ## 0.7.9 * Allow table model to accept optional OCR tokens diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 64b76f70..cadee0c9 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.9" # pragma: no cover +__version__ = "0.7.10-dev0" # pragma: no cover From f0c86ffe1618130b1f9670de2d03b24b10374953 Mon Sep 17 00:00:00 2001 From: Benjamin Torres Date: Tue, 17 Oct 2023 16:03:39 -0600 Subject: [PATCH 3/3] fix: inverted logic to sort elements from Chipper --- unstructured_inference/inference/layout.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 89f1bded..9a86c2fc 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -253,8 +253,9 @@ def get_elements_with_detection_model( else: merged_layout = inferred_layout - - order_elements = isinstance(self.detection_model, UnstructuredChipperModel) + # If the model is a chipper model, we don't want to order the + # elements, as they are already ordered + order_elements = not isinstance(self.detection_model, UnstructuredChipperModel) elements = self.get_elements_from_layout( cast(List[TextRegion], merged_layout), order_elements=order_elements,