Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -143,4 +143,5 @@ dmypy.json
.vscode/

sample-docs/*_images
examples/**/output
examples/**/output
figures
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
## 0.7.13-dev1
## 0.7.13

* refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings
* enhancement: support extracting elements with types `Picture` and `Figure`
* fix: update logger in table initialization where the logger info was not showing
* chore: suppress UserWarning about specified model providers

Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.7.13-dev1" # pragma: no cover
__version__ = "0.7.13" # pragma: no cover
19 changes: 19 additions & 0 deletions unstructured_inference/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,25 @@ class Source(Enum):
SUPER_GRADIENTS = "super-gradients"


class ElementType:
# Namespace of string constants naming the layout element types used across
# the package, replacing element type string literals scattered through the code.
# This is a plain class (no Enum base, unlike `Source`), so every attribute is
# an ordinary `str` and compares equal to the raw label it replaces.
#
# Image-like types: `extract_images` in inference/layout.py treats all three
# of these as images to extract.
IMAGE = "Image"
FIGURE = "Figure"
PICTURE = "Picture"
TABLE = "Table"
LIST = "List"
# Hyphenated values below are kept exactly as the label strings they replace
# in the model label maps (e.g. "List-item" in YOLOX_LABEL_MAP).
LIST_ITEM = "List-item"
FORMULA = "Formula"
CAPTION = "Caption"
PAGE_HEADER = "Page-header"
SECTION_HEADER = "Section-header"
PAGE_FOOTER = "Page-footer"
FOOTNOTE = "Footnote"
TITLE = "Title"
TEXT = "Text"
# Type given to extracted regions that are not `ImageTextRegion` instances when
# they are added in `merge_inferred_layout_with_extracted_layout`.
UNCATEGORIZED_TEXT = "UncategorizedText"
PAGE_BREAK = "PageBreak"


FULL_PAGE_REGION_THRESHOLD = 0.99

# this field is defined by pytesseract/unstructured.pytesseract
Expand Down
5 changes: 3 additions & 2 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pdfminer.high_level import extract_pages
from PIL import Image, ImageSequence

from unstructured_inference.constants import Source
from unstructured_inference.constants import ElementType, Source
from unstructured_inference.inference.elements import (
EmbeddedTextRegion,
ImageTextRegion,
Expand Down Expand Up @@ -296,8 +296,9 @@ def extract_images(self, output_dir_path: Optional[str] = None):
os.makedirs(output_dir_path, exist_ok=True)

figure_number = 0
image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE]
for el in self.elements:
if (el.bbox is None) or (el.type not in ["Image"]):
if (el.bbox is None) or (el.type not in image_element_types):
continue

figure_number += 1
Expand Down
20 changes: 13 additions & 7 deletions unstructured_inference/inference/layoutelement.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from unstructured_inference.config import inference_config
from unstructured_inference.constants import (
FULL_PAGE_REGION_THRESHOLD,
ElementType,
Source,
)
from unstructured_inference.inference.elements import (
Expand Down Expand Up @@ -42,7 +43,7 @@ def extract_text(
objects=objects,
extract_tables=extract_tables,
)
if extract_tables and self.type == "Table":
if extract_tables and self.type == ElementType.TABLE:
self.text_as_html = interpret_table_block(self, image)
return text

Expand Down Expand Up @@ -139,10 +140,10 @@ def merge_inferred_layout_with_extracted_layout(
subregion_threshold=subregion_threshold,
)
inferred_is_text = inferred_region.type not in (
"Figure",
"Image",
"PageBreak",
"Table",
ElementType.FIGURE,
ElementType.IMAGE,
ElementType.PAGE_BREAK,
ElementType.TABLE,
)
extracted_is_subregion_of_inferred = extracted_region.bbox.is_almost_subregion_of(
inferred_region.bbox,
Expand All @@ -169,7 +170,10 @@ def merge_inferred_layout_with_extracted_layout(
# keep inferred region, remove extracted region
grow_region_to_match_region(inferred_region.bbox, extracted_region.bbox)
region_matched = True
elif either_region_is_subregion_of_other and inferred_region.type != "Table":
elif (
either_region_is_subregion_of_other
and inferred_region.type != ElementType.TABLE
):
# keep extracted region, remove inferred region
inferred_regions_to_remove.append(inferred_region)
if not region_matched:
Expand All @@ -178,7 +182,9 @@ def merge_inferred_layout_with_extracted_layout(
categorized_extracted_elements_to_add = [
LayoutElement(
text=el.text,
type="Image" if isinstance(el, ImageTextRegion) else "UncategorizedText",
type=ElementType.IMAGE
if isinstance(el, ImageTextRegion)
else ElementType.UNCATEGORIZED_TEXT,
source=el.source,
bbox=el.bbox,
)
Expand Down
11 changes: 6 additions & 5 deletions unstructured_inference/models/detectron2.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from layoutparser.models.model_config import LayoutModelConfig
from PIL import Image

from unstructured_inference.constants import ElementType
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.logger import logger
from unstructured_inference.models.unstructuredmodel import (
Expand All @@ -18,11 +19,11 @@

DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
DEFAULT_LABEL_MAP: Final[Dict[int, str]] = {
0: "Text",
1: "Title",
2: "List",
3: "Table",
4: "Figure",
0: ElementType.TEXT,
1: ElementType.TITLE,
2: ElementType.LIST,
3: ElementType.TABLE,
4: ElementType.FIGURE,
}
DEFAULT_EXTRA_CONFIG: Final[List[Any]] = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]

Expand Down
5 changes: 4 additions & 1 deletion unstructured_inference/models/unstructuredmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy as np
from PIL.Image import Image

from unstructured_inference.constants import ElementType
from unstructured_inference.inference.elements import (
grow_region_to_match_region,
intersections,
Expand Down Expand Up @@ -123,7 +124,9 @@ def enhance_regions(
return elements

@staticmethod
def clean_type(elements: List[LayoutElement], type_to_clean="Table") -> List[LayoutElement]:
def clean_type(
elements: List[LayoutElement], type_to_clean=ElementType.TABLE
) -> List[LayoutElement]:
"""After this function, the list of elements will not contain any element inside
of the type specified"""
target_elements = [e for e in elements if e.type == type_to_clean]
Expand Down
24 changes: 12 additions & 12 deletions unstructured_inference/models/yolox.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,23 @@
from onnxruntime.capi import _pybind_state as C
from PIL import Image

from unstructured_inference.constants import Source
from unstructured_inference.constants import ElementType, Source
from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
from unstructured_inference.utils import LazyDict, LazyEvaluateInfo

YOLOX_LABEL_MAP = {
0: "Caption",
1: "Footnote",
2: "Formula",
3: "List-item",
4: "Page-footer",
5: "Page-header",
6: "Picture",
7: "Section-header",
8: "Table",
9: "Text",
10: "Title",
0: ElementType.CAPTION,
1: ElementType.FOOTNOTE,
2: ElementType.FORMULA,
3: ElementType.LIST_ITEM,
4: ElementType.PAGE_FOOTER,
5: ElementType.PAGE_HEADER,
6: ElementType.PICTURE,
7: ElementType.SECTION_HEADER,
8: ElementType.TABLE,
9: ElementType.TEXT,
10: ElementType.TITLE,
}

MODEL_TYPES = {
Expand Down