From ab42c06ad1bacca7894a5ecd90514e69b6e85e49 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Wed, 15 Nov 2023 11:09:13 -0800
Subject: [PATCH 1/6] feat: support extracting elements with types `Picture`
 and `Figure`

---
 unstructured_inference/inference/layout.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index 86132275..f8b56172 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -296,8 +296,9 @@ def extract_images(self, output_dir_path: Optional[str] = None):
         os.makedirs(output_dir_path, exist_ok=True)
 
         figure_number = 0
+        image_element_types = ["Image", "Picture", "Figure"]
         for el in self.elements:
-            if (el.bbox is None) or (el.type not in ["Image"]):
+            if (el.bbox is None) or (el.type not in image_element_types):
                 continue
 
             figure_number += 1

From 72572328a3d3637c46a1fc8be19a83aada707283 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Wed, 15 Nov 2023 11:33:50 -0800
Subject: [PATCH 2/6] feat: add constants for the element types

---
 unstructured_inference/constants.py           | 19 +++++++++++++++
 unstructured_inference/inference/layout.py    |  4 ++--
 .../inference/layoutelement.py                | 20 ++++++++++------
 unstructured_inference/models/detectron2.py   | 11 +++++----
 .../models/unstructuredmodel.py               |  5 +++-
 unstructured_inference/models/yolox.py        | 24 +++++++++----------
 6 files changed, 56 insertions(+), 27 deletions(-)

diff --git a/unstructured_inference/constants.py b/unstructured_inference/constants.py
index 900802ce..a500876c 100644
--- a/unstructured_inference/constants.py
+++ b/unstructured_inference/constants.py
@@ -18,6 +18,25 @@ class Source(Enum):
     SUPER_GRADIENTS = "super-gradients"
 
 
+class ElementType:  # string constants for layout element type names (plain class, not an Enum)
+    IMAGE = "Image"  # also given to extracted ImageTextRegion elements when layouts are merged
+    FIGURE = "Figure"  # detectron2 DEFAULT_LABEL_MAP label 4
+    PICTURE = "Picture"  # YOLOX_LABEL_MAP label 6
+    TABLE = "Table"  # default type_to_clean in clean_type
+    LIST = "List"  # detectron2 DEFAULT_LABEL_MAP label 2
+    LIST_ITEM = "List-item"  # YOLOX_LABEL_MAP label 3
+    FORMULA = "Formula"
+    CAPTION = "Caption"
+    PAGE_HEADER = "Page-header"
+    SECTION_HEADER = "Section-header"
+    PAGE_FOOTER = "Page-footer"
+    FOOTNOTE = "Footnote"
+    TITLE = "Title"
+    TEXT = "Text"
+    UNCATEGORIZED_TEXT = "UncategorizedText"  # given to other extracted regions on merge
+    PAGE_BREAK = "PageBreak"  # counted as non-text when layouts are merged
+
+
 FULL_PAGE_REGION_THRESHOLD = 0.99
 
 # this field is defined by pytesseract/unstructured.pytesseract
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
index f8b56172..04503d7c 100644
--- a/unstructured_inference/inference/layout.py
+++ b/unstructured_inference/inference/layout.py
@@ -11,7 +11,7 @@
 from pdfminer.high_level import extract_pages
 from PIL import Image, ImageSequence
 
-from unstructured_inference.constants import Source
+from unstructured_inference.constants import ElementType, Source
 from unstructured_inference.inference.elements import (
     EmbeddedTextRegion,
     ImageTextRegion,
@@ -296,7 +296,7 @@ def extract_images(self, output_dir_path: Optional[str] = None):
         os.makedirs(output_dir_path, exist_ok=True)
 
         figure_number = 0
-        image_element_types = ["Image", "Picture", "Figure"]
+        image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE]
         for el in self.elements:
             if (el.bbox is None) or (el.type not in image_element_types):
                 continue
diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
index a602faf3..7efd7063 100644
--- a/unstructured_inference/inference/layoutelement.py
+++ b/unstructured_inference/inference/layoutelement.py
@@ -12,6 +12,7 @@
 from unstructured_inference.config import inference_config
 from unstructured_inference.constants import (
     FULL_PAGE_REGION_THRESHOLD,
+    ElementType,
     Source,
 )
 from unstructured_inference.inference.elements import (
@@ -42,7 +43,7 @@ def extract_text(
             objects=objects,
             extract_tables=extract_tables,
         )
-        if extract_tables and self.type == "Table":
+        if extract_tables and self.type == ElementType.TABLE:
             self.text_as_html = interpret_table_block(self, image)
         return text
 
@@ -139,10 +140,10 @@ def merge_inferred_layout_with_extracted_layout(
                     subregion_threshold=subregion_threshold,
                 )
                 inferred_is_text = inferred_region.type not in (
-                    "Figure",
-                    "Image",
-                    "PageBreak",
-                    "Table",
+                    ElementType.FIGURE,
+                    ElementType.IMAGE,
+                    ElementType.PAGE_BREAK,
+                    ElementType.TABLE,
                 )
                 extracted_is_subregion_of_inferred = extracted_region.bbox.is_almost_subregion_of(
                     inferred_region.bbox,
@@ -169,7 +170,10 @@ def merge_inferred_layout_with_extracted_layout(
                         # keep inferred region, remove extracted region
                         grow_region_to_match_region(inferred_region.bbox, extracted_region.bbox)
                         region_matched = True
-                elif either_region_is_subregion_of_other and inferred_region.type != "Table":
+                elif (
+                    either_region_is_subregion_of_other
+                    and inferred_region.type != ElementType.TABLE
+                ):
                     # keep extracted region, remove inferred region
                     inferred_regions_to_remove.append(inferred_region)
         if not region_matched:
@@ -178,7 +182,9 @@ def merge_inferred_layout_with_extracted_layout(
     categorized_extracted_elements_to_add = [
         LayoutElement(
             text=el.text,
-            type="Image" if isinstance(el, ImageTextRegion) else "UncategorizedText",
+            type=ElementType.IMAGE
+            if isinstance(el, ImageTextRegion)
+            else ElementType.UNCATEGORIZED_TEXT,
             source=el.source,
             bbox=el.bbox,
         )
diff --git a/unstructured_inference/models/detectron2.py b/unstructured_inference/models/detectron2.py
index 4f490c33..98939f88 100644
--- a/unstructured_inference/models/detectron2.py
+++ b/unstructured_inference/models/detectron2.py
@@ -9,6 +9,7 @@
 from layoutparser.models.model_config import LayoutModelConfig
 from PIL import Image
 
+from unstructured_inference.constants import ElementType
 from unstructured_inference.inference.layoutelement import LayoutElement
 from unstructured_inference.logger import logger
 from unstructured_inference.models.unstructuredmodel import (
@@ -18,11 +19,11 @@
 
 DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
 DEFAULT_LABEL_MAP: Final[Dict[int, str]] = {
-    0: "Text",
-    1: "Title",
-    2: "List",
-    3: "Table",
-    4: "Figure",
+    0: ElementType.TEXT,
+    1: ElementType.TITLE,
+    2: ElementType.LIST,
+    3: ElementType.TABLE,
+    4: ElementType.FIGURE,
 }
 DEFAULT_EXTRA_CONFIG: Final[List[Any]] = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]
 
diff --git a/unstructured_inference/models/unstructuredmodel.py b/unstructured_inference/models/unstructuredmodel.py
index 9305f8ea..376acb2d 100644
--- a/unstructured_inference/models/unstructuredmodel.py
+++ b/unstructured_inference/models/unstructuredmodel.py
@@ -6,6 +6,7 @@
 import numpy as np
 from PIL.Image import Image
 
+from unstructured_inference.constants import ElementType
 from unstructured_inference.inference.elements import (
     grow_region_to_match_region,
     intersections,
@@ -123,7 +124,9 @@ def enhance_regions(
         return elements
 
     @staticmethod
-    def clean_type(elements: List[LayoutElement], type_to_clean="Table") -> List[LayoutElement]:
+    def clean_type(
+        elements: List[LayoutElement], type_to_clean=ElementType.TABLE
+    ) -> List[LayoutElement]:
         """After this function, the list of elements will not contain any element inside
         of the type specified"""
         target_elements = [e for e in elements if e.type == type_to_clean]
diff --git a/unstructured_inference/models/yolox.py b/unstructured_inference/models/yolox.py
index 813ea638..47455cf4 100644
--- a/unstructured_inference/models/yolox.py
+++ b/unstructured_inference/models/yolox.py
@@ -12,23 +12,23 @@
 from onnxruntime.capi import _pybind_state as C
 from PIL import Image
 
-from unstructured_inference.constants import Source
+from unstructured_inference.constants import ElementType, Source
 from unstructured_inference.inference.layoutelement import LayoutElement
 from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
 from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
 
 YOLOX_LABEL_MAP = {
-    0: "Caption",
-    1: "Footnote",
-    2: "Formula",
-    3: "List-item",
-    4: "Page-footer",
-    5: "Page-header",
-    6: "Picture",
-    7: "Section-header",
-    8: "Table",
-    9: "Text",
-    10: "Title",
+    0: ElementType.CAPTION,
+    1: ElementType.FOOTNOTE,
+    2: ElementType.FORMULA,
+    3: ElementType.LIST_ITEM,
+    4: ElementType.PAGE_FOOTER,
+    5: ElementType.PAGE_HEADER,
+    6: ElementType.PICTURE,
+    7: ElementType.SECTION_HEADER,
+    8: ElementType.TABLE,
+    9: ElementType.TEXT,
+    10: ElementType.TITLE,
 }
 
 MODEL_TYPES = {

From de150ce614530a26b7fbbaf069672d6feddcf3db Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Wed, 15 Nov 2023 11:39:41 -0800
Subject: [PATCH 3/6] chore: update changelog & version

---
 CHANGELOG.md                          | 4 +++-
 unstructured_inference/__version__.py | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88517419..2a9bf9c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
-## 0.7.13-dev0
+## 0.7.13-dev1
 
+* refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings
+* enhancement: support extracting elements with types `Picture` and `Figure`
 * chore: supress UserWarning about specified model providers
 
 ## 0.7.12
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index 7be10279..1615ba8d 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.13-dev0"  # pragma: no cover
+__version__ = "0.7.13-dev1"  # pragma: no cover

From 09b4af9e529c3c69c3d881b1be6c9871e7c01020 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Wed, 15 Nov 2023 11:41:27 -0800
Subject: [PATCH 4/6] chore: update .gitignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 8c105a8d..22dae48b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -143,4 +143,5 @@ dmypy.json
 .vscode/
 
 sample-docs/*_images
-examples/**/output
\ No newline at end of file
+examples/**/output
+figures

From b2514732361b3390a723d7f5b352d486537d8e84 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Thu, 16 Nov 2023 13:25:37 -0800
Subject: [PATCH 5/6] chore: update version

---
 unstructured_inference/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index 1615ba8d..6825a3de 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.13-dev1"  # pragma: no cover
+__version__ = "0.7.13-dev2"  # pragma: no cover

From c8683d4a9289429fb795059d6818d624b33609e0 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Thu, 16 Nov 2023 13:33:46 -0800
Subject: [PATCH 6/6] chore: update dev version to non-dev release

---
 CHANGELOG.md                          | 2 +-
 unstructured_inference/__version__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 93c52299..0310110b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.7.13-dev2
+## 0.7.13
 
 * refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings
 * enhancement: support extracting elements with types `Picture` and `Figure`
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
index 6825a3de..d3edd746 100644
--- a/unstructured_inference/__version__.py
+++ b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.7.13-dev2"  # pragma: no cover
+__version__ = "0.7.13"  # pragma: no cover