From ab42c06ad1bacca7894a5ecd90514e69b6e85e49 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 15 Nov 2023 11:09:13 -0800 Subject: [PATCH 1/6] feat: support extracting elements with types `Picture` and `Figure` --- unstructured_inference/inference/layout.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 86132275..f8b56172 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -296,8 +296,9 @@ def extract_images(self, output_dir_path: Optional[str] = None): os.makedirs(output_dir_path, exist_ok=True) figure_number = 0 + image_element_types = ["Image", "Picture", "Figure"] for el in self.elements: - if (el.bbox is None) or (el.type not in ["Image"]): + if (el.bbox is None) or (el.type not in image_element_types): continue figure_number += 1 From 72572328a3d3637c46a1fc8be19a83aada707283 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 15 Nov 2023 11:33:50 -0800 Subject: [PATCH 2/6] feat: add constants for the element types --- unstructured_inference/constants.py | 19 +++++++++++++++ unstructured_inference/inference/layout.py | 4 ++-- .../inference/layoutelement.py | 20 ++++++++++------ unstructured_inference/models/detectron2.py | 11 +++++---- .../models/unstructuredmodel.py | 5 +++- unstructured_inference/models/yolox.py | 24 +++++++++---------- 6 files changed, 56 insertions(+), 27 deletions(-) diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py index 900802ce..a500876c 100644 --- a/unstructured_inference/constants.py +++ b/unstructured_inference/constants.py @@ -18,6 +18,25 @@ class Source(Enum): SUPER_GRADIENTS = "super-gradients" +class ElementType: + IMAGE = "Image" + FIGURE = "Figure" + PICTURE = "Picture" + TABLE = "Table" + LIST = "List" + LIST_ITEM = "List-item" + FORMULA = "Formula" + CAPTION = "Caption" + PAGE_HEADER = "Page-header" + SECTION_HEADER = "Section-header" + PAGE_FOOTER = "Page-footer" + FOOTNOTE = "Footnote" + TITLE = "Title" + TEXT = "Text" + UNCATEGORIZED_TEXT = "UncategorizedText" + PAGE_BREAK = "PageBreak" + + FULL_PAGE_REGION_THRESHOLD = 0.99 # this field is defined by pytesseract/unstructured.pytesseract diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index f8b56172..04503d7c 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -11,7 +11,7 @@ from pdfminer.high_level import extract_pages from PIL import Image, ImageSequence -from unstructured_inference.constants import Source +from unstructured_inference.constants import ElementType, Source from unstructured_inference.inference.elements import ( EmbeddedTextRegion, ImageTextRegion, @@ -296,7 +296,7 @@ def extract_images(self, output_dir_path: Optional[str] = None): os.makedirs(output_dir_path, exist_ok=True) figure_number = 0 - image_element_types = ["Image", "Picture", "Figure"] + image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE] for el in self.elements: if (el.bbox is None) or (el.type not in image_element_types): continue diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index a602faf3..7efd7063 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -12,6 +12,7 @@ from unstructured_inference.config import inference_config from unstructured_inference.constants import ( FULL_PAGE_REGION_THRESHOLD, + ElementType, Source, ) from unstructured_inference.inference.elements import ( @@ -42,7 +43,7 @@ def extract_text( objects=objects, extract_tables=extract_tables, ) - if extract_tables and self.type == "Table": + if extract_tables and self.type == ElementType.TABLE: self.text_as_html = interpret_table_block(self, image) return text @@ -139,10 +140,10 @@ def merge_inferred_layout_with_extracted_layout( subregion_threshold=subregion_threshold, ) inferred_is_text = inferred_region.type not in ( - "Figure", - "Image", - "PageBreak", - "Table", + ElementType.FIGURE, + ElementType.IMAGE, + ElementType.PAGE_BREAK, + ElementType.TABLE, ) extracted_is_subregion_of_inferred = extracted_region.bbox.is_almost_subregion_of( inferred_region.bbox, @@ -169,7 +170,10 @@ def merge_inferred_layout_with_extracted_layout( # keep inferred region, remove extracted region grow_region_to_match_region(inferred_region.bbox, extracted_region.bbox) region_matched = True - elif either_region_is_subregion_of_other and inferred_region.type != "Table": + elif ( + either_region_is_subregion_of_other + and inferred_region.type != ElementType.TABLE + ): # keep extracted region, remove inferred region inferred_regions_to_remove.append(inferred_region) if not region_matched: @@ -178,7 +182,9 @@ def merge_inferred_layout_with_extracted_layout( categorized_extracted_elements_to_add = [ LayoutElement( text=el.text, - type="Image" if isinstance(el, ImageTextRegion) else "UncategorizedText", + type=ElementType.IMAGE + if isinstance(el, ImageTextRegion) + else ElementType.UNCATEGORIZED_TEXT, source=el.source, bbox=el.bbox, ) diff --git a/unstructured_inference/models/detectron2.py b/unstructured_inference/models/detectron2.py index 4f490c33..98939f88 100644 --- a/unstructured_inference/models/detectron2.py +++ b/unstructured_inference/models/detectron2.py @@ -9,6 +9,7 @@ from layoutparser.models.model_config import LayoutModelConfig from PIL import Image +from unstructured_inference.constants import ElementType from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.logger import logger from unstructured_inference.models.unstructuredmodel import ( @@ -18,11 +19,11 @@ DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config" DEFAULT_LABEL_MAP: Final[Dict[int, str]] = { - 0: "Text", - 1: "Title", - 2: "List", - 3: "Table", - 4: "Figure", + 0: ElementType.TEXT, + 1: ElementType.TITLE, + 2: ElementType.LIST, + 3: ElementType.TABLE, + 4: ElementType.FIGURE, } DEFAULT_EXTRA_CONFIG: Final[List[Any]] = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8] diff --git a/unstructured_inference/models/unstructuredmodel.py b/unstructured_inference/models/unstructuredmodel.py index 9305f8ea..376acb2d 100644 --- a/unstructured_inference/models/unstructuredmodel.py +++ b/unstructured_inference/models/unstructuredmodel.py @@ -6,6 +6,7 @@ import numpy as np from PIL.Image import Image +from unstructured_inference.constants import ElementType from unstructured_inference.inference.elements import ( grow_region_to_match_region, intersections, @@ -123,7 +124,9 @@ def enhance_regions( return elements @staticmethod - def clean_type(elements: List[LayoutElement], type_to_clean="Table") -> List[LayoutElement]: + def clean_type( + elements: List[LayoutElement], type_to_clean=ElementType.TABLE + ) -> List[LayoutElement]: """After this function, the list of elements will not contain any element inside of the type specified""" target_elements = [e for e in elements if e.type == type_to_clean] diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index 813ea638..47455cf4 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -12,23 +12,23 @@ from onnxruntime.capi import _pybind_state as C from PIL import Image -from unstructured_inference.constants import Source +from unstructured_inference.constants import ElementType, Source from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel from unstructured_inference.utils import LazyDict, LazyEvaluateInfo YOLOX_LABEL_MAP = { - 0: "Caption", - 1: "Footnote", - 2: "Formula", - 3: "List-item", - 4: "Page-footer", - 5: "Page-header", - 6: "Picture", - 7: "Section-header", - 8: "Table", - 9: "Text", - 10: "Title", + 0: ElementType.CAPTION, + 1: ElementType.FOOTNOTE, + 2: ElementType.FORMULA, + 3: ElementType.LIST_ITEM, + 4: ElementType.PAGE_FOOTER, + 5: ElementType.PAGE_HEADER, + 6: ElementType.PICTURE, + 7: ElementType.SECTION_HEADER, + 8: ElementType.TABLE, + 9: ElementType.TEXT, + 10: ElementType.TITLE, } MODEL_TYPES = { From de150ce614530a26b7fbbaf069672d6feddcf3db Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 15 Nov 2023 11:39:41 -0800 Subject: [PATCH 3/6] chore: update changelog & version --- CHANGELOG.md | 4 +++- unstructured_inference/__version__.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88517419..2a9bf9c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ -## 0.7.13-dev0 +## 0.7.13-dev1 +* refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings +* enhancement: support extracting elements with types `Picture` and `Figure` * chore: supress UserWarning about specified model providers ## 0.7.12 diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 7be10279..1615ba8d 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.13-dev0" # pragma: no cover +__version__ = "0.7.13-dev1" # pragma: no cover From 09b4af9e529c3c69c3d881b1be6c9871e7c01020 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Wed, 15 Nov 2023 11:41:27 -0800 Subject: [PATCH 4/6] chore: update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8c105a8d..22dae48b 100644 --- a/.gitignore +++ b/.gitignore @@ -143,4 +143,5 @@ dmypy.json .vscode/ sample-docs/*_images -examples/**/output \ No newline at end of file +examples/**/output +figures From b2514732361b3390a723d7f5b352d486537d8e84 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Thu, 16 Nov 2023 13:25:37 -0800 Subject: [PATCH 5/6] chore: update version --- unstructured_inference/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 1615ba8d..6825a3de 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.13-dev1" # pragma: no cover +__version__ = "0.7.13-dev2" # pragma: no cover From c8683d4a9289429fb795059d6818d624b33609e0 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Thu, 16 Nov 2023 13:33:46 -0800 Subject: [PATCH 6/6] chore: update dev version to non-dev release --- CHANGELOG.md | 2 +- unstructured_inference/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93c52299..0310110b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.7.13-dev2 +## 0.7.13 * refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings * enhancement: support extracting elements with types `Picture` and `Figure` diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 6825a3de..d3edd746 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.13-dev2" # pragma: no cover +__version__ = "0.7.13" # pragma: no cover