From 35fec15066b858667306329021f737e3becdc4d0 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Fri, 31 Mar 2023 16:32:47 -0500 Subject: [PATCH 1/4] Add annotate method for PageLayout --- unstructured_inference/inference/layout.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index e187f17f..a91ae7c7 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -170,6 +170,15 @@ def _get_image_array(self) -> Union[np.ndarray, None]: self.image_array = np.array(self.image) return self.image_array + def annotate(self) -> Image.Image: + """Returns image annotated with bounding boxes for the elements.""" + ann_img = self.image.copy() + draw_image = ImageDraw.ImageDraw(ann_img) + for el in self.elements: + box = (el.x1, el.y1, el.x2, el.y2) + draw_image = draw_image.rectangle(box, outline="red") + return ann_img + @classmethod def from_image( cls, From e98e92ef41bb448265899e238c0105552bb09093 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Fri, 31 Mar 2023 16:33:28 -0500 Subject: [PATCH 2/4] extend extract_table through rest of interface --- unstructured_inference/inference/layout.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index a91ae7c7..113e4bca 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -8,7 +8,7 @@ import numpy as np import pdfplumber import pdf2image -from PIL import Image +from PIL import Image, ImageDraw from unstructured_inference.inference.elements import TextRegion, ImageTextRegion, LayoutElement from unstructured_inference.logger import logger @@ -115,7 +115,7 @@ class PageLayout: def __init__( self, number: int, - image: Image, + image: Image.Image, layout: Optional[List[TextRegion]], model: Optional[UnstructuredModel] = None, ocr_strategy: str = "auto", @@ -211,6 +211,7 @@ def process_data_with_model( is_image: bool = False, ocr_strategy: str = "auto", fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, + extract_tables: bool = False, ) -> DocumentLayout: """Processes pdf file in the form of a file handler (supporting a read method) into a DocumentLayout by using a model identified by model_name.""" @@ -222,6 +223,7 @@ def process_data_with_model( is_image=is_image, ocr_strategy=ocr_strategy, fixed_layouts=fixed_layouts, + extract_tables=extract_tables, ) return layout @@ -233,15 +235,22 @@ def process_file_with_model( is_image: bool = False, ocr_strategy: str = "auto", fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None, + extract_tables: bool = False, ) -> DocumentLayout: """Processes pdf file with name filename into a DocumentLayout by using a model identified by model_name.""" model = get_model(model_name) layout = ( - DocumentLayout.from_image_file(filename, model=model, ocr_strategy=ocr_strategy) + DocumentLayout.from_image_file( + filename, model=model, ocr_strategy=ocr_strategy, extract_tables=extract_tables + ) if is_image else DocumentLayout.from_file( - filename, model=model, ocr_strategy=ocr_strategy, fixed_layouts=fixed_layouts + filename, + model=model, + ocr_strategy=ocr_strategy, + fixed_layouts=fixed_layouts, + extract_tables=extract_tables, ) ) return layout From ba64328fef570ef6005a7d407ac3751afd4b27a5 Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Fri, 31 Mar 2023 16:38:06 -0500 Subject: [PATCH 3/4] Remove annotate method (for separate PR) --- unstructured_inference/inference/layout.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 113e4bca..f58442f4 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -8,7 +8,7 @@ import numpy as np import pdfplumber import pdf2image -from PIL import Image, ImageDraw +from PIL import Image from unstructured_inference.inference.elements import TextRegion, ImageTextRegion, LayoutElement from unstructured_inference.logger import logger @@ -170,15 +170,6 @@ def _get_image_array(self) -> Union[np.ndarray, None]: self.image_array = np.array(self.image) return self.image_array - def annotate(self) -> Image.Image: - """Returns image annotated with bounding boxes for the elements.""" - ann_img = self.image.copy() - draw_image = ImageDraw.ImageDraw(ann_img) - for el in self.elements: - box = (el.x1, el.y1, el.x2, el.y2) - draw_image = draw_image.rectangle(box, outline="red") - return ann_img - @classmethod def from_image( cls, From 5f1f2b608bb01541484da891e0c8c96654aded5d Mon Sep 17 00:00:00 2001 From: Alan Bertl Date: Fri, 31 Mar 2023 16:45:29 -0500 Subject: [PATCH 4/4] Bump version --- CHANGELOG.md | 4 ++++ unstructured_inference/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aa229625..08611dd1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.2 + +* Allow extracting tables from higher level functions + ## 0.3.1 * Pin protobuf version to avoid errors diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 48f4028d..9ed9c2a0 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.3.1" # pragma: no cover +__version__ = "0.3.2" # pragma: no cover