diff --git a/.gitignore b/.gitignore index 8c105a8d..22dae48b 100644 --- a/.gitignore +++ b/.gitignore @@ -143,4 +143,5 @@ dmypy.json .vscode/ sample-docs/*_images -examples/**/output \ No newline at end of file +examples/**/output +figures diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cd3e275..0310110b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ -## 0.7.13-dev1 +## 0.7.13 +* refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings +* enhancement: support extracting elements with types `Picture` and `Figure` * fix: update logger in table initalization where the logger info was not showing * chore: supress UserWarning about specified model providers diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py index 1615ba8d..d3edd746 100644 --- a/unstructured_inference/__version__.py +++ b/unstructured_inference/__version__.py @@ -1 +1 @@ -__version__ = "0.7.13-dev1" # pragma: no cover +__version__ = "0.7.13" # pragma: no cover diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py index 900802ce..a500876c 100644 --- a/unstructured_inference/constants.py +++ b/unstructured_inference/constants.py @@ -18,6 +18,25 @@ class Source(Enum): SUPER_GRADIENTS = "super-gradients" +class ElementType: + IMAGE = "Image" + FIGURE = "Figure" + PICTURE = "Picture" + TABLE = "Table" + LIST = "List" + LIST_ITEM = "List-item" + FORMULA = "Formula" + CAPTION = "Caption" + PAGE_HEADER = "Page-header" + SECTION_HEADER = "Section-header" + PAGE_FOOTER = "Page-footer" + FOOTNOTE = "Footnote" + TITLE = "Title" + TEXT = "Text" + UNCATEGORIZED_TEXT = "UncategorizedText" + PAGE_BREAK = "PageBreak" + + FULL_PAGE_REGION_THRESHOLD = 0.99 # this field is defined by pytesseract/unstructured.pytesseract diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py index 86132275..04503d7c 100644 --- a/unstructured_inference/inference/layout.py +++ b/unstructured_inference/inference/layout.py @@ -11,7 +11,7 @@ from pdfminer.high_level import extract_pages from PIL import Image, ImageSequence -from unstructured_inference.constants import Source +from unstructured_inference.constants import ElementType, Source from unstructured_inference.inference.elements import ( EmbeddedTextRegion, ImageTextRegion, @@ -296,8 +296,9 @@ def extract_images(self, output_dir_path: Optional[str] = None): os.makedirs(output_dir_path, exist_ok=True) figure_number = 0 + image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE] for el in self.elements: - if (el.bbox is None) or (el.type not in ["Image"]): + if (el.bbox is None) or (el.type not in image_element_types): continue figure_number += 1 diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py index a602faf3..7efd7063 100644 --- a/unstructured_inference/inference/layoutelement.py +++ b/unstructured_inference/inference/layoutelement.py @@ -12,6 +12,7 @@ from unstructured_inference.config import inference_config from unstructured_inference.constants import ( FULL_PAGE_REGION_THRESHOLD, + ElementType, Source, ) from unstructured_inference.inference.elements import ( @@ -42,7 +43,7 @@ def extract_text( objects=objects, extract_tables=extract_tables, ) - if extract_tables and self.type == "Table": + if extract_tables and self.type == ElementType.TABLE: self.text_as_html = interpret_table_block(self, image) return text @@ -139,10 +140,10 @@ def merge_inferred_layout_with_extracted_layout( subregion_threshold=subregion_threshold, ) inferred_is_text = inferred_region.type not in ( - "Figure", - "Image", - "PageBreak", - "Table", + ElementType.FIGURE, + ElementType.IMAGE, + ElementType.PAGE_BREAK, + ElementType.TABLE, ) extracted_is_subregion_of_inferred = extracted_region.bbox.is_almost_subregion_of( inferred_region.bbox, @@ -169,7 +170,10 @@ def merge_inferred_layout_with_extracted_layout( # keep inferred region, remove extracted region grow_region_to_match_region(inferred_region.bbox, extracted_region.bbox) region_matched = True - elif either_region_is_subregion_of_other and inferred_region.type != "Table": + elif ( + either_region_is_subregion_of_other + and inferred_region.type != ElementType.TABLE + ): # keep extracted region, remove inferred region inferred_regions_to_remove.append(inferred_region) if not region_matched: @@ -178,7 +182,9 @@ def merge_inferred_layout_with_extracted_layout( categorized_extracted_elements_to_add = [ LayoutElement( text=el.text, - type="Image" if isinstance(el, ImageTextRegion) else "UncategorizedText", + type=ElementType.IMAGE + if isinstance(el, ImageTextRegion) + else ElementType.UNCATEGORIZED_TEXT, source=el.source, bbox=el.bbox, ) diff --git a/unstructured_inference/models/detectron2.py b/unstructured_inference/models/detectron2.py index 4f490c33..98939f88 100644 --- a/unstructured_inference/models/detectron2.py +++ b/unstructured_inference/models/detectron2.py @@ -9,6 +9,7 @@ from layoutparser.models.model_config import LayoutModelConfig from PIL import Image +from unstructured_inference.constants import ElementType from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.logger import logger from unstructured_inference.models.unstructuredmodel import ( @@ -18,11 +19,11 @@ DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config" DEFAULT_LABEL_MAP: Final[Dict[int, str]] = { - 0: "Text", - 1: "Title", - 2: "List", - 3: "Table", - 4: "Figure", + 0: ElementType.TEXT, + 1: ElementType.TITLE, + 2: ElementType.LIST, + 3: ElementType.TABLE, + 4: ElementType.FIGURE, } DEFAULT_EXTRA_CONFIG: Final[List[Any]] = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8] diff --git a/unstructured_inference/models/unstructuredmodel.py b/unstructured_inference/models/unstructuredmodel.py index 9305f8ea..376acb2d 100644 --- a/unstructured_inference/models/unstructuredmodel.py +++ b/unstructured_inference/models/unstructuredmodel.py @@ -6,6 +6,7 @@ import numpy as np from PIL.Image import Image +from unstructured_inference.constants import ElementType from unstructured_inference.inference.elements import ( grow_region_to_match_region, intersections, @@ -123,7 +124,9 @@ def enhance_regions( return elements @staticmethod - def clean_type(elements: List[LayoutElement], type_to_clean="Table") -> List[LayoutElement]: + def clean_type( + elements: List[LayoutElement], type_to_clean=ElementType.TABLE + ) -> List[LayoutElement]: """After this function, the list of elements will not contain any element inside of the type specified""" target_elements = [e for e in elements if e.type == type_to_clean] diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py index 813ea638..47455cf4 100644 --- a/unstructured_inference/models/yolox.py +++ b/unstructured_inference/models/yolox.py @@ -12,23 +12,23 @@ from onnxruntime.capi import _pybind_state as C from PIL import Image -from unstructured_inference.constants import Source +from unstructured_inference.constants import ElementType, Source from unstructured_inference.inference.layoutelement import LayoutElement from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel from unstructured_inference.utils import LazyDict, LazyEvaluateInfo YOLOX_LABEL_MAP = { - 0: "Caption", - 1: "Footnote", - 2: "Formula", - 3: "List-item", - 4: "Page-footer", - 5: "Page-header", - 6: "Picture", - 7: "Section-header", - 8: "Table", - 9: "Text", - 10: "Title", + 0: ElementType.CAPTION, + 1: ElementType.FOOTNOTE, + 2: ElementType.FORMULA, + 3: ElementType.LIST_ITEM, + 4: ElementType.PAGE_FOOTER, + 5: ElementType.PAGE_HEADER, + 6: ElementType.PICTURE, + 7: ElementType.SECTION_HEADER, + 8: ElementType.TABLE, + 9: ElementType.TEXT, + 10: ElementType.TITLE, } MODEL_TYPES = {