From c8ed1e33a343987fe9b684b020ca22f1e48c0bf9 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Fri, 1 Dec 2023 15:03:19 -0800
Subject: [PATCH 1/7] feat: add functionality to support image extraction

---
 unstructured/documents/elements.py            |  78 +++++------
 unstructured/partition/pdf_image/pdf.py       |  14 +-
 .../partition/pdf_image/pdf_image_utils.py    | 123 ++++++++++++++++++
 3 files changed, 172 insertions(+), 43 deletions(-)
 create mode 100644 unstructured/partition/pdf_image/pdf_image_utils.py

diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py
index fe1c2ce18e..8fbbf96104 100644
--- a/unstructured/documents/elements.py
+++ b/unstructured/documents/elements.py
@@ -555,6 +555,44 @@ def _add_regex_metadata(
     return elements
 
 
+class ElementType:
+    TITLE = "Title"
+    TEXT = "Text"
+    UNCATEGORIZED_TEXT = "UncategorizedText"
+    NARRATIVE_TEXT = "NarrativeText"
+    BULLETED_TEXT = "BulletedText"
+    ABSTRACT = "Abstract"
+    THREADING = "Threading"
+    FORM = "Form"
+    FIELD_NAME = "Field-Name"
+    VALUE = "Value"
+    LINK = "Link"
+    COMPOSITE_ELEMENT = "CompositeElement"
+    IMAGE = "Image"
+    PICTURE = "Picture"
+    FIGURE_CAPTION = "FigureCaption"
+    FIGURE = "Figure"
+    CAPTION = "Caption"
+    LIST = "List"
+    LIST_ITEM = "ListItem"
+    LIST_ITEM_OTHER = "List-item"
+    CHECKED = "Checked"
+    UNCHECKED = "Unchecked"
+    ADDRESS = "Address"
+    EMAIL_ADDRESS = "EmailAddress"
+    PAGE_BREAK = "PageBreak"
+    FORMULA = "Formula"
+    TABLE = "Table"
+    HEADER = "Header"
+    HEADLINE = "Headline"
+    SUB_HEADLINE = "Subheadline"
+    PAGE_HEADER = "Page-header"  # Title?
+    SECTION_HEADER = "Section-header"
+    FOOTER = "Footer"
+    FOOTNOTE = "Footnote"
+    PAGE_FOOTER = "Page-footer"
+
+
 class Element(abc.ABC):
     """An element is a section of a page in the document."""
 
@@ -764,7 +802,7 @@ class EmailAddress(Text):
 class Image(Text):
     """A text element for capturing image metadata."""
 
-    category = "Image"
+    category = ElementType.IMAGE
 
 
 class PageBreak(Text):
@@ -797,44 +835,6 @@ class Footer(Text):
     category = "Footer"
 
 
-class ElementType:
-    TITLE = "Title"
-    TEXT = "Text"
-    UNCATEGORIZED_TEXT = "UncategorizedText"
-    NARRATIVE_TEXT = "NarrativeText"
-    BULLETED_TEXT = "BulletedText"
-    ABSTRACT = "Abstract"
-    THREADING = "Threading"
-    FORM = "Form"
-    FIELD_NAME = "Field-Name"
-    VALUE = "Value"
-    LINK = "Link"
-    COMPOSITE_ELEMENT = "CompositeElement"
-    IMAGE = "Image"
-    PICTURE = "Picture"
-    FIGURE_CAPTION = "FigureCaption"
-    FIGURE = "Figure"
-    CAPTION = "Caption"
-    LIST = "List"
-    LIST_ITEM = "ListItem"
-    LIST_ITEM_OTHER = "List-item"
-    CHECKED = "Checked"
-    UNCHECKED = "Unchecked"
-    ADDRESS = "Address"
-    EMAIL_ADDRESS = "EmailAddress"
-    PAGE_BREAK = "PageBreak"
-    FORMULA = "Formula"
-    TABLE = "Table"
-    HEADER = "Header"
-    HEADLINE = "Headline"
-    SUB_HEADLINE = "Subheadline"
-    PAGE_HEADER = "Page-header"  # Title?
-    SECTION_HEADER = "Section-header"
-    FOOTER = "Footer"
-    FOOTNOTE = "Footnote"
-    PAGE_FOOTER = "Page-footer"
-
-
 TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
     ElementType.TITLE: Title,
     ElementType.SECTION_HEADER: Title,
diff --git a/unstructured/partition/pdf_image/pdf.py b/unstructured/partition/pdf_image/pdf.py
index 1d911fc8a4..c93070189f 100644
--- a/unstructured/partition/pdf_image/pdf.py
+++ b/unstructured/partition/pdf_image/pdf.py
@@ -70,6 +70,7 @@
     check_languages,
     prepare_languages_for_tesseract,
 )
+from unstructured.partition.pdf_image.pdf_image_utils import extract_images_from_elements
 from unstructured.partition.pdf_image.pdfminer_utils import (
     open_pdfminer_pages_generator,
     rect_to_bbox,
@@ -381,8 +382,6 @@ def _partition_pdf_or_image_local(
             is_image=is_image,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            extract_images_in_pdf=extract_images_in_pdf,
-            image_output_dir_path=image_output_dir_path,
         )
 
         # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
@@ -411,8 +410,6 @@ def _partition_pdf_or_image_local(
             is_image=is_image,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            extract_images_in_pdf=extract_images_in_pdf,
-            image_output_dir_path=image_output_dir_path,
         )
         if hasattr(file, "seek"):
             file.seek(0)
@@ -458,6 +455,15 @@ def _partition_pdf_or_image_local(
         **kwargs,
     )
 
+    if extract_images_in_pdf:
+        extract_images_from_elements(
+            elements=elements,
+            filename=filename,
+            file=file,
+            pdf_image_dpi=pdf_image_dpi,
+            output_dir_path=image_output_dir_path,
+        )
+
     out_elements = []
     for el in elements:
         if isinstance(el, PageBreak) and not include_page_breaks:
diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
new file mode 100644
index 0000000000..c61698b06f
--- /dev/null
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -0,0 +1,123 @@
+import os
+import tempfile
+from pathlib import PurePath
+from typing import Union, Optional, List, cast, BinaryIO, TYPE_CHECKING
+import cv2
+import numpy as np
+import pdf2image
+from PIL import Image
+
+from unstructured.documents.elements import ElementType
+from unstructured.logger import logger
+from unstructured.partition.common import convert_to_bytes
+
+if TYPE_CHECKING:
+    from unstructured.documents.elements import Element
+
+
+def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str) -> None:
+    """
+    Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats.
+
+    Parameters:
+    - image (Union[Image.Image, np.ndarray]): The image to write, as a PIL Image or numpy ndarray.
+    - output_image_path (str): The path to which the image will be written.
+
+    Raises:
+    - ValueError: If the provided image type is neither PIL Image nor numpy ndarray.
+    - IOError: If cv2.imwrite returns False, i.e. OpenCV could not write the ndarray.
+
+    Returns:
+    - None: The function writes the image to the specified path but does not return any value.
+    """
+    if isinstance(image, Image.Image):
+        image.save(output_image_path)
+    elif isinstance(image, np.ndarray):
+        if not cv2.imwrite(output_image_path, image):
+            raise IOError(f"Failed to write image to {output_image_path}")
+    else:
+        raise ValueError("Unsupported Image Type")
+
+
+def convert_pdf_to_image(
+    filename: str,
+    file: Optional[Union[bytes, BinaryIO]] = None,
+    dpi: int = 200,
+    output_folder: Optional[Union[str, PurePath]] = None,
+    path_only: bool = False,
+) -> Union[List[Image.Image], List[str]]:
+    """Render the pdf pages with pdf2image, reading `file` when it is given, else `filename`.
+    Returns PIL images, or file paths when `path_only` is true (which requires `output_folder`)."""
+    if path_only and not output_folder:
+        raise ValueError("output_folder must be specified if path_only is true")
+    # `file` (bytes or a binary stream) takes precedence over the `filename` path.
+    if file is not None:
+        f_bytes = convert_to_bytes(file)
+        images = pdf2image.convert_from_bytes(
+            f_bytes,
+            dpi=dpi,
+            output_folder=output_folder,
+            paths_only=path_only,
+        )
+    else:
+        images = pdf2image.convert_from_path(
+            filename,
+            dpi=dpi,
+            output_folder=output_folder,
+            paths_only=path_only,
+        )
+
+    return images
+
+
+def extract_images_from_elements(
+    elements: List["Element"],
+    pdf_image_dpi: int,
+    filename: str = "",
+    file: Optional[Union[bytes, BinaryIO]] = None,
+    output_dir_path: Optional[str] = None
+):
+    """
+    Extract and save images from the page. This method iterates through the layout elements
+    of the page, identifies image regions, and extracts and saves them as separate image files.
+    Each saved file's path is recorded in the element's `metadata.image_path`.
+    """
+    if not output_dir_path:
+        output_dir_path = os.path.join(os.getcwd(), "figures")
+    os.makedirs(output_dir_path, exist_ok=True)
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        _image_paths = convert_pdf_to_image(
+            filename,
+            file,
+            pdf_image_dpi,
+            output_folder=temp_dir,
+            path_only=True,
+        )
+        image_paths = cast(List[str], _image_paths)
+
+        figure_number = 0
+        for el in elements:
+            coordinates = el.metadata.coordinates
+            if not coordinates or not coordinates.points or el.category != ElementType.IMAGE:
+                continue
+            # points[0] and points[2] are used as the opposite corners of the crop box
+            points = coordinates.points
+            x1, y1 = points[0]
+            x2, y2 = points[2]
+            page_number = el.metadata.page_number
+
+            figure_number += 1
+            try:
+                output_f_path = os.path.join(
+                    output_dir_path,
+                    f"figure-{page_number}-{figure_number}.jpg",
+                )
+                # Context manager closes the page render so the temp dir can be cleaned up.
+                with Image.open(image_paths[page_number - 1]) as image:
+                    cropped_image = image.crop((x1, y1, x2, y2))
+                    write_image(cropped_image, output_f_path)
+                # add image path to element metadata
+                el.metadata.image_path = output_f_path
+            except (ValueError, IOError, IndexError, TypeError):
+                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)

From a488f164970fe573478862b17648f44f73511173 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Sun, 3 Dec 2023 23:43:57 -0800
Subject: [PATCH 2/7] test: add test cases for `pdf_image_utils` module

---
 .../pdf_image/test_pdf_image_utils.py         | 114 ++++++++++++++++++
 .../partition/pdf_image/pdf_image_utils.py    |  10 +-
 2 files changed, 119 insertions(+), 5 deletions(-)
 create mode 100644 test_unstructured/partition/pdf_image/test_pdf_image_utils.py

diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
new file mode 100644
index 0000000000..de0cbe0265
--- /dev/null
+++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@@ -0,0 +1,114 @@
+import os
+import tempfile
+from typing import List
+
+import pytest
+from PIL import Image as PILImg
+
+from test_unstructured.unit_utils import example_doc_path
+from unstructured.documents.coordinates import PixelSpace
+from unstructured.documents.elements import ElementMetadata, Image
+from unstructured.partition.pdf_image import pdf_image_utils
+
+
+@pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
+def test_write_image(image_type, mock_pil_image, mock_numpy_image):
+    image_map = {
+        "pil": mock_pil_image,
+        "numpy_array": mock_numpy_image,
+    }
+    image = image_map[image_type]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_image_path = os.path.join(tmpdir, "test_image.jpg")
+        pdf_image_utils.write_image(image, output_image_path)
+        assert os.path.exists(output_image_path)
+        # Re-open the file to prove it decodes as an image; `with` closes the handle
+        # before the temporary directory is removed.
+        with PILImg.open(output_image_path) as read_image:
+            assert read_image.format == "JPEG"
+
+
+@pytest.mark.parametrize("file_mode", ["filename", "rb"])
+@pytest.mark.parametrize("path_only", [True, False])
+def test_convert_pdf_to_image(
+    file_mode,
+    path_only,
+    filename=example_doc_path("embedded-images.pdf")
+):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        if file_mode == "filename":
+            images = pdf_image_utils.convert_pdf_to_image(
+                filename=filename,
+                file=None,
+                output_folder=tmpdir,
+                path_only=path_only,
+            )
+        else:
+            with open(filename, "rb") as f:
+                images = pdf_image_utils.convert_pdf_to_image(
+                    filename="",
+                    file=f,
+                    output_folder=tmpdir,
+                    path_only=path_only,
+                )
+
+        if path_only:
+            assert isinstance(images[0], str)
+        else:
+            assert isinstance(images[0], PILImg.Image)
+
+
+def test_extract_images_from_elements(filename=example_doc_path("embedded-images.pdf")):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        elements = [
+            Image(
+                text="3",
+                coordinates=(
+                    (78.7401411111111, 86.61545694444455),
+                    (78.7401411111111, 519.9487805555556),
+                    (512.0734647222223, 519.9487805555556),
+                    (512.0734647222223, 86.61545694444455),
+                ),
+                coordinate_system=PixelSpace(width=1575, height=1166),
+                metadata=ElementMetadata(page_number=1),
+            ),
+            Image(
+                text="4",
+                coordinates=(
+                    (570.8661397222222, 86.6154566666667),
+                    (570.8661397222222, 519.6862825000001),
+                    (1003.9369655555556, 519.6862825000001),
+                    (1003.9369655555556, 86.6154566666667),
+                ),
+                coordinate_system=PixelSpace(width=1575, height=1166),
+                metadata=ElementMetadata(page_number=1),
+            ),
+            Image(
+                text="5",
+                coordinates=(
+                    (1062.9921808333331, 86.61545694444455),
+                    (1062.9921808333331, 519.9487805555556),
+                    (1496.3255044444445, 519.9487805555556),
+                    (1496.3255044444445, 86.61545694444455),
+                ),
+                coordinate_system=PixelSpace(width=1575, height=1166),
+                metadata=ElementMetadata(page_number=1),
+            ),
+        ]
+
+        pdf_image_utils.extract_images_from_elements(
+            elements=elements, pdf_image_dpi=200, filename=filename, output_dir_path=str(tmpdir)
+        )
+
+        for i, el in enumerate(elements):
+            expected_image_path = os.path.join(
+                str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg"
+            )
+            assert os.path.isfile(el.metadata.image_path)
+            assert el.metadata.image_path == expected_image_path
+
+
+def test_write_image_raises_error():
+    with pytest.raises(ValueError):
+        pdf_image_utils.write_image("invalid_type", "test_image.jpg")
diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
index c61698b06f..1c73a7c07f 100644
--- a/unstructured/partition/pdf_image/pdf_image_utils.py
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -1,7 +1,8 @@
 import os
 import tempfile
 from pathlib import PurePath
-from typing import Union, Optional, List, cast, BinaryIO, TYPE_CHECKING
+from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
+
 import cv2
 import numpy as np
 import pdf2image
@@ -75,7 +76,7 @@ def extract_images_from_elements(
     pdf_image_dpi: int,
     filename: str = "",
     file: Optional[Union[bytes, BinaryIO]] = None,
-    output_dir_path: Optional[str] = None
+    output_dir_path: Optional[str] = None,
 ):
     """
     Extract and save images from the page. This method iterates through the layout elements
@@ -96,8 +97,7 @@ def extract_images_from_elements(
         )
         image_paths = cast(List[str], _image_paths)
 
-        figure_number = 0
-        for el in elements:
+        for i, el in enumerate(elements):
             coordinates = el.metadata.coordinates
             if not coordinates or not coordinates.points or el.category != ElementType.IMAGE:
                 continue
@@ -107,7 +107,7 @@ def extract_images_from_elements(
             x2, y2 = points[2]
             page_number = el.metadata.page_number
 
-            figure_number += 1
+            figure_number = i + 1
             try:
                 output_f_path = os.path.join(
                     output_dir_path,

From 1478b0c83a830d2e6e25eaee713128cdc76f33d7 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Sun, 3 Dec 2023 23:46:09 -0800
Subject: [PATCH 3/7] test: fix lint errors

---
 .../partition/pdf_image/test_pdf_image_utils.py              | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
index de0cbe0265..72c92e8f7e 100644
--- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
+++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@@ -1,6 +1,5 @@
 import os
 import tempfile
-from typing import List
 
 import pytest
 from PIL import Image as PILImg
@@ -32,9 +31,7 @@ def test_write_image(image_type, mock_pil_image, mock_numpy_image):
 @pytest.mark.parametrize("file_mode", ["filename", "rb"])
 @pytest.mark.parametrize("path_only", [True, False])
 def test_convert_pdf_to_image(
-    file_mode,
-    path_only,
-    filename=example_doc_path("embedded-images.pdf")
+    file_mode, path_only, filename=example_doc_path("embedded-images.pdf")
 ):
     with tempfile.TemporaryDirectory() as tmpdir:
         if file_mode == "filename":

From cad7bfe0efde539a065a85039ac3f8ecaf40ee04 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Sun, 3 Dec 2023 23:52:41 -0800
Subject: [PATCH 4/7] chore: update changelog & version

---
 CHANGELOG.md                | 3 ++-
 unstructured/__version__.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c1fa0cdf5c..c198d4fd84 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.11.4-dev2
+## 0.11.4-dev3
 
 ### Enhancements
 
+* **Refactor image extraction code.** The image extraction code is moved from `unstructured-inference` to `unstructured`.
 * **Refactor pdfminer code.** The pdfminer code is moved from `unstructured-inference` to `unstructured`.
 
 ### Features
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 6feed12e46..18e5d9c421 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.4-dev2"  # pragma: no cover
+__version__ = "0.11.4-dev3"  # pragma: no cover

From d43e8176a53f39c4405c9c650fe7c164dde0ac59 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Mon, 4 Dec 2023 00:05:03 -0800
Subject: [PATCH 5/7] feat: revert image file name change

---
 unstructured/partition/pdf_image/pdf_image_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
index 1c73a7c07f..a3dd83774f 100644
--- a/unstructured/partition/pdf_image/pdf_image_utils.py
+++ b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -97,7 +97,8 @@ def extract_images_from_elements(
         )
         image_paths = cast(List[str], _image_paths)
 
-        for i, el in enumerate(elements):
+        figure_number = 0
+        for el in elements:
             coordinates = el.metadata.coordinates
             if not coordinates or not coordinates.points or el.category != ElementType.IMAGE:
                 continue
@@ -107,7 +108,7 @@ def extract_images_from_elements(
             x2, y2 = points[2]
             page_number = el.metadata.page_number
 
-            figure_number = i + 1
+            figure_number += 1
             try:
                 output_f_path = os.path.join(
                     output_dir_path,

From 0663e5cfc870ffaf0277c4acae76112f230ce1fe Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Mon, 4 Dec 2023 10:38:05 -0800
Subject: [PATCH 6/7] test: fix unit test errors

---
 .../partition/pdf_image/test_pdf_image_utils.py             | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
index 72c92e8f7e..91c991b926 100644
--- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
+++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@@ -1,6 +1,7 @@
 import os
 import tempfile
 
+import numpy as np
 import pytest
 from PIL import Image as PILImg
 
@@ -11,7 +12,10 @@
 
 
 @pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
-def test_write_image(image_type, mock_pil_image, mock_numpy_image):
+def test_write_image(image_type):
+    mock_pil_image = PILImg.new("RGB", (50, 50))
+    mock_numpy_image = np.zeros((50, 50, 3), np.uint8)
+
     image_map = {
         "pil": mock_pil_image,
         "numpy_array": mock_numpy_image,

From fcf538f922c70a4b747d7c9e67117484995bc983 Mon Sep 17 00:00:00 2001
From: christinestraub <christinemstraub@gmail.com>
Date: Tue, 5 Dec 2023 09:16:26 -0800
Subject: [PATCH 7/7] chore: bump unstructured-inference==0.7.18 & make
 pip-compile

---
 docs/requirements.txt                     |  2 +-
 requirements/build.txt                    |  2 +-
 requirements/dev.txt                      | 10 ++++-----
 requirements/extra-markdown.txt           |  2 +-
 requirements/extra-paddleocr.txt          |  4 ++--
 requirements/extra-pdf-image.in           |  2 +-
 requirements/extra-pdf-image.txt          |  6 ++---
 requirements/ingest/azure.txt             |  2 +-
 requirements/ingest/delta-table.txt       |  2 +-
 requirements/ingest/embed-aws-bedrock.txt |  6 ++---
 requirements/ingest/embed-huggingface.txt |  6 ++---
 requirements/ingest/embed-openai.txt      |  8 +++----
 requirements/ingest/onedrive.txt          |  2 +-
 requirements/ingest/outlook.txt           |  2 +-
 requirements/ingest/reddit.txt            |  2 +-
 requirements/ingest/sharepoint.txt        |  2 +-
 requirements/ingest/slack.txt             |  2 +-
 requirements/ingest/weaviate.txt          | 27 ++++++++++-------------
 requirements/test.txt                     |  4 ++--
 19 files changed, 45 insertions(+), 48 deletions(-)

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 129ce79f3d..ee5fdd1d2d 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -36,7 +36,7 @@ idna==3.6
     #   requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via sphinx
 jinja2==3.1.2
     # via
diff --git a/requirements/build.txt b/requirements/build.txt
index 129ce79f3d..ee5fdd1d2d 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -36,7 +36,7 @@ idna==3.6
     #   requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via sphinx
 jinja2==3.1.2
     # via
diff --git a/requirements/dev.txt b/requirements/dev.txt
index c5592adbd0..0f880f65b3 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -91,7 +91,7 @@ idna==3.6
     #   anyio
     #   jsonschema
     #   requests
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via
     #   build
     #   jupyter-client
@@ -167,7 +167,7 @@ jupyter-events==0.9.0
     # via jupyter-server
 jupyter-lsp==2.2.1
     # via jupyterlab
-jupyter-server==2.11.1
+jupyter-server==2.11.2
     # via
     #   jupyter-lsp
     #   jupyterlab
@@ -198,7 +198,7 @@ mistune==3.0.2
     # via nbconvert
 nbclient==0.9.0
     # via nbconvert
-nbconvert==7.11.0
+nbconvert==7.12.0
     # via
     #   jupyter
     #   jupyter-server
@@ -290,7 +290,7 @@ pyyaml==6.0.1
     #   -c test.txt
     #   jupyter-events
     #   pre-commit
-pyzmq==25.1.1
+pyzmq==25.1.2
     # via
     #   ipykernel
     #   jupyter-client
@@ -405,7 +405,7 @@ webencodings==0.5.1
     # via
     #   bleach
     #   tinycss2
-websocket-client==1.6.4
+websocket-client==1.7.0
     # via jupyter-server
 wheel==0.42.0
     # via
diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt
index 940336c7ca..c2c30d59a0 100644
--- a/requirements/extra-markdown.txt
+++ b/requirements/extra-markdown.txt
@@ -4,7 +4,7 @@
 #
 #    pip-compile --output-file=extra-markdown.txt extra-markdown.in
 #
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via markdown
 markdown==3.5.1
     # via -r extra-markdown.in
diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt
index 896e548897..f9b3ba0e9f 100644
--- a/requirements/extra-paddleocr.txt
+++ b/requirements/extra-paddleocr.txt
@@ -45,7 +45,7 @@ flask==3.0.0
     #   visualdl
 flask-babel==4.0.0
     # via visualdl
-fonttools==4.45.1
+fonttools==4.46.0
     # via matplotlib
 future==0.18.3
     # via bce-python-sdk
@@ -59,7 +59,7 @@ imageio==2.33.0
     #   scikit-image
 imgaug==0.4.0
     # via unstructured-paddleocr
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via flask
 importlib-resources==6.1.1
     # via matplotlib
diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
index 4ccf33f804..a3184f77df 100644
--- a/requirements/extra-pdf-image.in
+++ b/requirements/extra-pdf-image.in
@@ -8,7 +8,7 @@ pikepdf
 pypdf
 # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.17
+unstructured-inference==0.7.18
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
index dc7f2ec7a3..468aabcecf 100644
--- a/requirements/extra-pdf-image.txt
+++ b/requirements/extra-pdf-image.txt
@@ -37,7 +37,7 @@ filelock==3.13.1
     #   transformers
 flatbuffers==23.5.26
     # via onnxruntime
-fonttools==4.45.1
+fonttools==4.46.0
     # via matplotlib
 fsspec==2023.9.1
     # via
@@ -134,7 +134,7 @@ pdfminer-six==20221105
     #   pdfplumber
 pdfplumber==0.10.3
     # via layoutparser
-pikepdf==8.7.1
+pikepdf==8.8.0
     # via -r extra-pdf-image.in
 pillow==10.0.1
     # via
@@ -250,7 +250,7 @@ typing-extensions==4.8.0
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.7.17
+unstructured-inference==0.7.18
     # via -r extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via
diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt
index c54b4e0aeb..28ade9b0c7 100644
--- a/requirements/ingest/azure.txt
+++ b/requirements/ingest/azure.txt
@@ -60,7 +60,7 @@ idna==3.6
     #   yarl
 isodate==0.6.1
     # via azure-storage-blob
-msal==1.25.0
+msal==1.26.0
     # via
     #   azure-datalake-store
     #   azure-identity
diff --git a/requirements/ingest/delta-table.txt b/requirements/ingest/delta-table.txt
index 1203ed11df..c66e481e47 100644
--- a/requirements/ingest/delta-table.txt
+++ b/requirements/ingest/delta-table.txt
@@ -4,7 +4,7 @@
 #
 #    pip-compile --output-file=ingest/delta-table.txt ingest/delta-table.in
 #
-deltalake==0.13.0
+deltalake==0.14.0
     # via -r ingest/delta-table.in
 fsspec==2023.9.1
     # via
diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt
index e155405f14..c8cdfde16b 100644
--- a/requirements/ingest/embed-aws-bedrock.txt
+++ b/requirements/ingest/embed-aws-bedrock.txt
@@ -64,11 +64,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
     # via -r ingest/embed-aws-bedrock.in
-langchain-core==0.0.8
+langchain-core==0.0.9
     # via langchain
-langsmith==0.0.68
+langsmith==0.0.69
     # via
     #   langchain
     #   langchain-core
diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt
index 771a613391..781ea27f3b 100644
--- a/requirements/ingest/embed-huggingface.txt
+++ b/requirements/ingest/embed-huggingface.txt
@@ -79,11 +79,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
     # via -r ingest/embed-huggingface.in
-langchain-core==0.0.8
+langchain-core==0.0.9
     # via langchain
-langsmith==0.0.68
+langsmith==0.0.69
     # via
     #   langchain
     #   langchain-core
diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt
index 0486bc927d..74f3b199b2 100644
--- a/requirements/ingest/embed-openai.txt
+++ b/requirements/ingest/embed-openai.txt
@@ -64,11 +64,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
     # via -r ingest/embed-openai.in
-langchain-core==0.0.8
+langchain-core==0.0.9
     # via langchain
-langsmith==0.0.68
+langsmith==0.0.69
     # via
     #   langchain
     #   langchain-core
@@ -125,7 +125,7 @@ tenacity==8.2.3
     # via
     #   langchain
     #   langchain-core
-tiktoken==0.5.1
+tiktoken==0.5.2
     # via -r ingest/embed-openai.in
 tqdm==4.66.1
     # via
diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt
index 155fdcb36a..babcca6a7a 100644
--- a/requirements/ingest/onedrive.txt
+++ b/requirements/ingest/onedrive.txt
@@ -29,7 +29,7 @@ idna==3.6
     # via
     #   -c ingest/../base.txt
     #   requests
-msal==1.25.0
+msal==1.26.0
     # via
     #   -r ingest/onedrive.in
     #   office365-rest-python-client
diff --git a/requirements/ingest/outlook.txt b/requirements/ingest/outlook.txt
index 03aa4ffdd0..9d4c5b5312 100644
--- a/requirements/ingest/outlook.txt
+++ b/requirements/ingest/outlook.txt
@@ -23,7 +23,7 @@ idna==3.6
     # via
     #   -c ingest/../base.txt
     #   requests
-msal==1.25.0
+msal==1.26.0
     # via
     #   -r ingest/outlook.in
     #   office365-rest-python-client
diff --git a/requirements/ingest/reddit.txt b/requirements/ingest/reddit.txt
index 55fb63dc8f..7c6e92c4b9 100644
--- a/requirements/ingest/reddit.txt
+++ b/requirements/ingest/reddit.txt
@@ -33,5 +33,5 @@ urllib3==1.26.18
     #   -c ingest/../base.txt
     #   -c ingest/../constraints.in
     #   requests
-websocket-client==1.6.4
+websocket-client==1.7.0
     # via praw
diff --git a/requirements/ingest/sharepoint.txt b/requirements/ingest/sharepoint.txt
index d49b89c227..1196dfd580 100644
--- a/requirements/ingest/sharepoint.txt
+++ b/requirements/ingest/sharepoint.txt
@@ -23,7 +23,7 @@ idna==3.6
     # via
     #   -c ingest/../base.txt
     #   requests
-msal==1.25.0
+msal==1.26.0
     # via
     #   -r ingest/sharepoint.in
     #   office365-rest-python-client
diff --git a/requirements/ingest/slack.txt b/requirements/ingest/slack.txt
index 0520f221e6..02a878985e 100644
--- a/requirements/ingest/slack.txt
+++ b/requirements/ingest/slack.txt
@@ -4,5 +4,5 @@
 #
 #    pip-compile --output-file=ingest/slack.txt ingest/slack.in
 #
-slack-sdk==3.26.0
+slack-sdk==3.26.1
     # via -r ingest/slack.in
diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt
index b95aba5b4f..315aabd93e 100644
--- a/requirements/ingest/weaviate.txt
+++ b/requirements/ingest/weaviate.txt
@@ -2,44 +2,41 @@
 # This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-#    pip-compile --constraint=requirements/constraints.in requirements/ingest/weaviate.in
+#    pip-compile --output-file=ingest/weaviate.txt ingest/weaviate.in
 #
 authlib==1.2.1
     # via weaviate-client
 certifi==2023.11.17
     # via
-    #   -c requirements/constraints.in
-    #   -c requirements/ingest/../base.txt
-    #   -c requirements/ingest/../constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   requests
 cffi==1.16.0
     # via cryptography
 charset-normalizer==3.3.2
     # via
-    #   -c requirements/ingest/../base.txt
+    #   -c ingest/../base.txt
     #   requests
-cryptography==41.0.5
+cryptography==41.0.7
     # via authlib
-idna==3.4
+idna==3.6
     # via
-    #   -c requirements/ingest/../base.txt
+    #   -c ingest/../base.txt
     #   requests
 pycparser==2.21
     # via cffi
 requests==2.31.0
     # via
-    #   -c requirements/ingest/../base.txt
+    #   -c ingest/../base.txt
     #   weaviate-client
 urllib3==1.26.18
     # via
-    #   -c requirements/constraints.in
-    #   -c requirements/ingest/../base.txt
-    #   -c requirements/ingest/../constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   requests
 validators==0.22.0
     # via weaviate-client
 weaviate-client==3.25.3
     # via
-    #   -c requirements/constraints.in
-    #   -c requirements/ingest/../constraints.in
-    #   -r requirements/ingest/weaviate.in
+    #   -c ingest/../constraints.in
+    #   -r ingest/weaviate.in
diff --git a/requirements/test.txt b/requirements/test.txt
index c7fdb8383e..83829466b1 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -32,7 +32,7 @@ exceptiongroup==1.2.0
     # via pytest
 flake8==6.1.0
     # via -r test.in
-freezegun==1.2.2
+freezegun==1.3.1
     # via -r test.in
 grpcio==1.59.3
     # via -r test.in
@@ -105,7 +105,7 @@ requests==2.31.0
     # via
     #   -c base.txt
     #   label-studio-sdk
-ruff==0.1.6
+ruff==0.1.7
     # via -r test.in
 six==1.16.0
     # via