From b1c8ec0b1188b5e6233049da797e36afcbffc9cd Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Fri, 20 Oct 2023 02:26:54 -0500 Subject: [PATCH 1/6] Handle kwargs explicitly in signature --- test_unstructured_inference/inference/test_layout.py | 4 +++- unstructured_inference/inference/layout.py | 12 ++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index 49f8d6e4..b57fd9b0 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -2,7 +2,7 @@ import os.path import tempfile from functools import partial -from unittest.mock import mock_open, patch +from unittest.mock import mock_open, patch, ANY import numpy as np import pytest @@ -675,6 +675,8 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m fixed_layouts=None, extract_tables=False, pdf_image_dpi=200, + extract_images_in_pdf=ANY, + image_output_dir_path=ANY, ) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 5ab39aab..ff2c1918 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -76,6 +76,8 @@ def from_file( fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, extract_tables: bool = False, pdf_image_dpi: int = 200, + extract_images_in_pdf: bool = False, + image_output_dir_path: Optional[str] = None, **kwargs, ) -> DocumentLayout: """Creates a DocumentLayout from a pdf file.""" @@ -113,6 +115,8 @@ def from_file( layout=layout, fixed_layout=fixed_layout, extract_tables=extract_tables, + extract_images_in_pdf=extract_images_in_pdf, + image_output_dir_path=image_output_dir_path, **kwargs, ) pages.append(page) @@ -457,6 +461,8 @@ def process_data_with_model( fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, extract_tables: bool = False, 
pdf_image_dpi: int = 200, + extract_images_in_pdf: bool = False, + image_output_dir_path: Optional[str] = None, **kwargs, ) -> DocumentLayout: """Processes pdf file in the form of a file handler (supporting a read method) into a @@ -471,6 +477,8 @@ def process_data_with_model( fixed_layouts=fixed_layouts, extract_tables=extract_tables, pdf_image_dpi=pdf_image_dpi, + extract_images_in_pdf=extract_images_in_pdf, + image_output_dir_path=image_output_dir_path, **kwargs, ) @@ -484,6 +492,8 @@ def process_file_with_model( fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, extract_tables: bool = False, pdf_image_dpi: int = 200, + extract_images_in_pdf: bool = False, + image_output_dir_path: Optional[str] = None, **kwargs, ) -> DocumentLayout: """Processes pdf file with name filename into a DocumentLayout by using a model identified by @@ -514,6 +524,8 @@ def process_file_with_model( fixed_layouts=fixed_layouts, extract_tables=extract_tables, pdf_image_dpi=pdf_image_dpi, + extract_images_in_pdf=extract_images_in_pdf, + image_output_dir_path=image_output_dir_path, **kwargs, ) ) From ebb86c46c648c80c93e547044e37c3d0da78f88b Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Fri, 20 Oct 2023 02:28:12 -0500 Subject: [PATCH 2/6] update changelog --- CHANGELOG.md | 3 ++- unstructured_inference/__version__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 45d58139..24b260f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ -## 0.7.10-dev1 +## 0.7.10-dev2 +* Handle kwargs related to pdf processing in signature * fix: Reduce Chipper memory consumption on x86_64 cpus * fix: Skips ordering elements coming from Chipper * fix: After refactoring to introduce Chipper, annotate() weren't able to show text with extra info from elements, this is fixed now. 
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index a28807cc..5a48fbf0 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.10-dev1" # pragma: no cover +__version__ = "0.7.10-dev2" # pragma: no cover From 90072b77d4ff728e3a029c2b63f118f952c3316e Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Fri, 20 Oct 2023 02:45:25 -0500 Subject: [PATCH 3/6] Suppress kwargs when not used --- CHANGELOG.md | 2 +- unstructured_inference/inference/layout.py | 22 ---------------------- 2 files changed, 1 insertion(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 24b260f1..36065d0e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ ## 0.7.10-dev2 -* Handle kwargs related to pdf processing in signature +* Handle kwargs explicitly when needed, suppress otherwise * fix: Reduce Chipper memory consumption on x86_64 cpus * fix: Skips ordering elements coming from Chipper * fix: After refactoring to introduce Chipper, annotate() weren't able to show text with extra info from elements, this is fixed now. 
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index ff2c1918..e8303e66 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -71,13 +71,8 @@ def from_pages(cls, pages: List[PageLayout]) -> DocumentLayout: def from_file( cls, filename: str, - detection_model: Optional[UnstructuredObjectDetectionModel] = None, - element_extraction_model: Optional[UnstructuredElementExtractionModel] = None, fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, - extract_tables: bool = False, pdf_image_dpi: int = 200, - extract_images_in_pdf: bool = False, - image_output_dir_path: Optional[str] = None, **kwargs, ) -> DocumentLayout: """Creates a DocumentLayout from a pdf file.""" @@ -110,13 +105,8 @@ def from_file( image, number=i + 1, document_filename=filename, - detection_model=detection_model, - element_extraction_model=element_extraction_model, layout=layout, fixed_layout=fixed_layout, - extract_tables=extract_tables, - extract_images_in_pdf=extract_images_in_pdf, - image_output_dir_path=image_output_dir_path, **kwargs, ) pages.append(page) @@ -457,12 +447,6 @@ def from_image( def process_data_with_model( data: BinaryIO, model_name: Optional[str], - is_image: bool = False, - fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, - extract_tables: bool = False, - pdf_image_dpi: int = 200, - extract_images_in_pdf: bool = False, - image_output_dir_path: Optional[str] = None, **kwargs, ) -> DocumentLayout: """Processes pdf file in the form of a file handler (supporting a read method) into a @@ -473,12 +457,6 @@ def process_data_with_model( layout = process_file_with_model( tmp_file.name, model_name, - is_image=is_image, - fixed_layouts=fixed_layouts, - extract_tables=extract_tables, - pdf_image_dpi=pdf_image_dpi, - extract_images_in_pdf=extract_images_in_pdf, - image_output_dir_path=image_output_dir_path, **kwargs, ) From 
39efb7a07235975dca6f9b23c474842c2a05cf84 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Fri, 20 Oct 2023 02:54:05 -0500 Subject: [PATCH 4/6] linting --- test_unstructured_inference/inference/test_layout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py index b57fd9b0..a75b505c 100644 --- a/test_unstructured_inference/inference/test_layout.py +++ b/test_unstructured_inference/inference/test_layout.py @@ -2,7 +2,7 @@ import os.path import tempfile from functools import partial -from unittest.mock import mock_open, patch, ANY +from unittest.mock import ANY, mock_open, patch import numpy as np import pytest From f139757bba2fa638b7dcb2368d905f28b70d454d Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Fri, 20 Oct 2023 11:28:58 -0500 Subject: [PATCH 5/6] release version --- CHANGELOG.md | 2 +- unstructured_inference/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36065d0e..5f00ac29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.7.10-dev2 +## 0.7.10 * Handle kwargs explicitly when needed, suppress otherwise * fix: Reduce Chipper memory consumption on x86_64 cpus diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 5a48fbf0..8f560143 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.10-dev2" # pragma: no cover +__version__ = "0.7.10" # pragma: no cover From 237080b9c851aa14a4cc6dd387ece76b6dcb9060 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Sat, 21 Oct 2023 02:25:59 -0500 Subject: [PATCH 6/6] trigger CI