From c8ed1e33a343987fe9b684b020ca22f1e48c0bf9 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Fri, 1 Dec 2023 15:03:19 -0800 Subject: [PATCH 1/7] feat: add functionality to support image extraction --- unstructured/documents/elements.py | 78 +++++------ unstructured/partition/pdf_image/pdf.py | 14 +- .../partition/pdf_image/pdf_image_utils.py | 123 ++++++++++++++++++ 3 files changed, 172 insertions(+), 43 deletions(-) create mode 100644 unstructured/partition/pdf_image/pdf_image_utils.py diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index fe1c2ce18e..8fbbf96104 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -555,6 +555,44 @@ def _add_regex_metadata( return elements +class ElementType: + TITLE = "Title" + TEXT = "Text" + UNCATEGORIZED_TEXT = "UncategorizedText" + NARRATIVE_TEXT = "NarrativeText" + BULLETED_TEXT = "BulletedText" + ABSTRACT = "Abstract" + THREADING = "Threading" + FORM = "Form" + FIELD_NAME = "Field-Name" + VALUE = "Value" + LINK = "Link" + COMPOSITE_ELEMENT = "CompositeElement" + IMAGE = "Image" + PICTURE = "Picture" + FIGURE_CAPTION = "FigureCaption" + FIGURE = "Figure" + CAPTION = "Caption" + LIST = "List" + LIST_ITEM = "ListItem" + LIST_ITEM_OTHER = "List-item" + CHECKED = "Checked" + UNCHECKED = "Unchecked" + ADDRESS = "Address" + EMAIL_ADDRESS = "EmailAddress" + PAGE_BREAK = "PageBreak" + FORMULA = "Formula" + TABLE = "Table" + HEADER = "Header" + HEADLINE = "Headline" + SUB_HEADLINE = "Subheadline" + PAGE_HEADER = "Page-header" # Title? + SECTION_HEADER = "Section-header" + FOOTER = "Footer" + FOOTNOTE = "Footnote" + PAGE_FOOTER = "Page-footer" + + class Element(abc.ABC): """An element is a section of a page in the document.""" @@ -764,7 +802,7 @@ class EmailAddress(Text): class Image(Text): """A text element for capturing image metadata.""" - category = "Image" + category = ElementType.IMAGE class PageBreak(Text): @@ -797,44 +835,6 @@ class Footer(Text): category = "Footer" -class ElementType: - TITLE = "Title" - TEXT = "Text" - UNCATEGORIZED_TEXT = "UncategorizedText" - NARRATIVE_TEXT = "NarrativeText" - BULLETED_TEXT = "BulletedText" - ABSTRACT = "Abstract" - THREADING = "Threading" - FORM = "Form" - FIELD_NAME = "Field-Name" - VALUE = "Value" - LINK = "Link" - COMPOSITE_ELEMENT = "CompositeElement" - IMAGE = "Image" - PICTURE = "Picture" - FIGURE_CAPTION = "FigureCaption" - FIGURE = "Figure" - CAPTION = "Caption" - LIST = "List" - LIST_ITEM = "ListItem" - LIST_ITEM_OTHER = "List-item" - CHECKED = "Checked" - UNCHECKED = "Unchecked" - ADDRESS = "Address" - EMAIL_ADDRESS = "EmailAddress" - PAGE_BREAK = "PageBreak" - FORMULA = "Formula" - TABLE = "Table" - HEADER = "Header" - HEADLINE = "Headline" - SUB_HEADLINE = "Subheadline" - PAGE_HEADER = "Page-header" # Title? - SECTION_HEADER = "Section-header" - FOOTER = "Footer" - FOOTNOTE = "Footnote" - PAGE_FOOTER = "Page-footer" - - TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = { ElementType.TITLE: Title, ElementType.SECTION_HEADER: Title, diff --git a/unstructured/partition/pdf_image/pdf.py b/unstructured/partition/pdf_image/pdf.py index 1d911fc8a4..c93070189f 100644 --- a/unstructured/partition/pdf_image/pdf.py +++ b/unstructured/partition/pdf_image/pdf.py @@ -70,6 +70,7 @@ check_languages, prepare_languages_for_tesseract, ) +from unstructured.partition.pdf_image.pdf_image_utils import extract_images_from_elements from unstructured.partition.pdf_image.pdfminer_utils import ( open_pdfminer_pages_generator, rect_to_bbox, @@ -381,8 +382,6 @@ def _partition_pdf_or_image_local( is_image=is_image, model_name=model_name, pdf_image_dpi=pdf_image_dpi, - extract_images_in_pdf=extract_images_in_pdf, - image_output_dir_path=image_output_dir_path, ) # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout @@ -411,8 +410,6 @@ def _partition_pdf_or_image_local( is_image=is_image, model_name=model_name, pdf_image_dpi=pdf_image_dpi, - extract_images_in_pdf=extract_images_in_pdf, - image_output_dir_path=image_output_dir_path, ) if hasattr(file, "seek"): file.seek(0) @@ -458,6 +455,15 @@ def _partition_pdf_or_image_local( **kwargs, ) + if extract_images_in_pdf: + extract_images_from_elements( + elements=elements, + filename=filename, + file=file, + pdf_image_dpi=pdf_image_dpi, + output_dir_path=image_output_dir_path, + ) + out_elements = [] for el in elements: if isinstance(el, PageBreak) and not include_page_breaks: diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py new file mode 100644 index 0000000000..c61698b06f --- /dev/null +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -0,0 +1,123 @@ +import os +import tempfile +from pathlib import PurePath +from typing import Union, Optional, List, cast, BinaryIO, TYPE_CHECKING +import cv2 +import numpy as np +import pdf2image +from PIL import Image + +from unstructured.documents.elements import ElementType +from unstructured.logger import logger +from unstructured.partition.common import convert_to_bytes + +if TYPE_CHECKING: + from unstructured.documents.elements import Element + + +def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str): + """ + Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats. + + Parameters: + - image (Union[Image.Image, np.ndarray]): The image to be written, which can be in PIL Image + format or a numpy ndarray format. + - output_image_path (str): The path to which the image will be written. + + Raises: + - ValueError: If the provided image type is neither PIL Image nor numpy ndarray. + + Returns: + - None: The function writes the image to the specified path but does not return any value. + """ + + if isinstance(image, Image.Image): + image.save(output_image_path) + elif isinstance(image, np.ndarray): + cv2.imwrite(output_image_path, image) + else: + raise ValueError("Unsupported Image Type") + + +def convert_pdf_to_image( + filename: str, + file: Optional[Union[bytes, BinaryIO]] = None, + dpi: int = 200, + output_folder: Optional[Union[str, PurePath]] = None, + path_only: bool = False, +) -> Union[List[Image.Image], List[str]]: + """Get the image renderings of the pdf pages using pdf2image""" + + if path_only and not output_folder: + raise ValueError("output_folder must be specified if path_only is true") + + if file is not None: + f_bytes = convert_to_bytes(file) + images = pdf2image.convert_from_bytes( + f_bytes, + dpi=dpi, + output_folder=output_folder, + paths_only=path_only, + ) + else: + images = pdf2image.convert_from_path( + filename, + dpi=dpi, + output_folder=output_folder, + paths_only=path_only, + ) + + return images + + +def extract_images_from_elements( + elements: List["Element"], + pdf_image_dpi: int, + filename: str = "", + file: Optional[Union[bytes, BinaryIO]] = None, + output_dir_path: Optional[str] = None +): + """ + Extract and save images from the page. This method iterates through the layout elements + of the page, identifies image regions, and extracts and saves them as separate image files. + """ + + if not output_dir_path: + output_dir_path = os.path.join(os.getcwd(), "figures") + os.makedirs(output_dir_path, exist_ok=True) + + with tempfile.TemporaryDirectory() as temp_dir: + _image_paths = convert_pdf_to_image( + filename, + file, + pdf_image_dpi, + output_folder=temp_dir, + path_only=True, + ) + image_paths = cast(List[str], _image_paths) + + figure_number = 0 + for el in elements: + coordinates = el.metadata.coordinates + if not coordinates or not coordinates.points or el.category != ElementType.IMAGE: + continue + + points = coordinates.points + x1, y1 = points[0] + x2, y2 = points[2] + page_number = el.metadata.page_number + + figure_number += 1 + try: + output_f_path = os.path.join( + output_dir_path, + f"figure-{page_number}-{figure_number}.jpg", + ) + image_path = image_paths[page_number - 1] + image = Image.open(image_path) + cropped_image = image.crop((x1, y1, x2, y2)) + write_image(cropped_image, output_f_path) + # add image path to element metadata + el.metadata.image_path = output_f_path + except (ValueError, IOError): + logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True) From a488f164970fe573478862b17648f44f73511173 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Sun, 3 Dec 2023 23:43:57 -0800 Subject: [PATCH 2/7] test: add test cases for `pdf_image_utils` module --- .../pdf_image/test_pdf_image_utils.py | 114 ++++++++++++++++++ .../partition/pdf_image/pdf_image_utils.py | 10 +- 2 files changed, 119 insertions(+), 5 deletions(-) create mode 100644 test_unstructured/partition/pdf_image/test_pdf_image_utils.py diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py new file mode 100644 index 0000000000..de0cbe0265 --- /dev/null +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -0,0 +1,114 @@ +import os +import tempfile +from typing import List + +import pytest +from PIL import Image as PILImg + +from test_unstructured.unit_utils import example_doc_path +from unstructured.documents.coordinates import PixelSpace +from unstructured.documents.elements import ElementMetadata, Image +from unstructured.partition.pdf_image import pdf_image_utils + + +@pytest.mark.parametrize("image_type", ["pil", "numpy_array"]) +def test_write_image(image_type, mock_pil_image, mock_numpy_image): + image_map = { + "pil": mock_pil_image, + "numpy_array": mock_numpy_image, + } + image = image_map[image_type] + + with tempfile.TemporaryDirectory() as tmpdir: + output_image_path = os.path.join(tmpdir, "test_image.jpg") + pdf_image_utils.write_image(image, output_image_path) + assert os.path.exists(output_image_path) + + # Additional check to see if the written image can be read + read_image = PILImg.open(output_image_path) + assert read_image is not None + + +@pytest.mark.parametrize("file_mode", ["filename", "rb"]) +@pytest.mark.parametrize("path_only", [True, False]) +def test_convert_pdf_to_image( + file_mode, + path_only, + filename=example_doc_path("embedded-images.pdf") +): + with tempfile.TemporaryDirectory() as tmpdir: + if file_mode == "filename": + images = pdf_image_utils.convert_pdf_to_image( + filename=filename, + file=None, + output_folder=tmpdir, + path_only=path_only, + ) + else: + with open(filename, "rb") as f: + images = pdf_image_utils.convert_pdf_to_image( + filename="", + file=f, + output_folder=tmpdir, + path_only=path_only, + ) + + if path_only: + assert isinstance(images[0], str) + else: + assert isinstance(images[0], PILImg.Image) + + +def test_extract_images_from_elements(filename=example_doc_path("embedded-images.pdf")): + with tempfile.TemporaryDirectory() as tmpdir: + elements = [ + Image( + text="3", + coordinates=( + (78.7401411111111, 86.61545694444455), + (78.7401411111111, 519.9487805555556), + (512.0734647222223, 519.9487805555556), + (512.0734647222223, 86.61545694444455), + ), + coordinate_system=PixelSpace(width=1575, height=1166), + metadata=ElementMetadata(page_number=1), + ), + Image( + text="4", + coordinates=( + (570.8661397222222, 86.6154566666667), + (570.8661397222222, 519.6862825000001), + (1003.9369655555556, 519.6862825000001), + (1003.9369655555556, 86.6154566666667), + ), + coordinate_system=PixelSpace(width=1575, height=1166), + metadata=ElementMetadata(page_number=1), + ), + Image( + text="5", + coordinates=( + (1062.9921808333331, 86.61545694444455), + (1062.9921808333331, 519.9487805555556), + (1496.3255044444445, 519.9487805555556), + (1496.3255044444445, 86.61545694444455), + ), + coordinate_system=PixelSpace(width=1575, height=1166), + metadata=ElementMetadata(page_number=1), + ), + ] + + pdf_image_utils.extract_images_from_elements( + elements=elements, pdf_image_dpi=200, filename=filename, output_dir_path=str(tmpdir) + ) + + for i, el in enumerate(elements): + expected_image_path = os.path.join( + str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg" + ) + assert os.path.isfile(el.metadata.image_path) + assert el.metadata.image_path == expected_image_path + + +def test_write_image_raises_error(): + with pytest.raises(ValueError): + pdf_image_utils.write_image("invalid_type", "test_image.jpg") diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index c61698b06f..1c73a7c07f 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -1,7 +1,8 @@ import os import tempfile from pathlib import PurePath -from typing import Union, Optional, List, cast, BinaryIO, TYPE_CHECKING +from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast + import cv2 import numpy as np import pdf2image @@ -75,7 +76,7 @@ def extract_images_from_elements( pdf_image_dpi: int, filename: str = "", file: Optional[Union[bytes, BinaryIO]] = None, - output_dir_path: Optional[str] = None + output_dir_path: Optional[str] = None, ): """ Extract and save images from the page. This method iterates through the layout elements @@ -96,8 +97,7 @@ def extract_images_from_elements( ) image_paths = cast(List[str], _image_paths) - figure_number = 0 - for el in elements: + for i, el in enumerate(elements): coordinates = el.metadata.coordinates if not coordinates or not coordinates.points or el.category != ElementType.IMAGE: continue @@ -107,7 +107,7 @@ def extract_images_from_elements( x2, y2 = points[2] page_number = el.metadata.page_number - figure_number += 1 + figure_number = i + 1 try: output_f_path = os.path.join( output_dir_path, From 1478b0c83a830d2e6e25eaee713128cdc76f33d7 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Sun, 3 Dec 2023 23:46:09 -0800 Subject: [PATCH 3/7] test: fix lint errors --- .../partition/pdf_image/test_pdf_image_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index de0cbe0265..72c92e8f7e 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -1,6 +1,5 @@ import os import tempfile -from typing import List import pytest from PIL import Image as PILImg @@ -32,9 +31,7 @@ def test_write_image(image_type, mock_pil_image, mock_numpy_image): @pytest.mark.parametrize("file_mode", ["filename", "rb"]) @pytest.mark.parametrize("path_only", [True, False]) def test_convert_pdf_to_image( - file_mode, - path_only, - filename=example_doc_path("embedded-images.pdf") + file_mode, path_only, filename=example_doc_path("embedded-images.pdf") ): with tempfile.TemporaryDirectory() as tmpdir: if file_mode == "filename": From cad7bfe0efde539a065a85039ac3f8ecaf40ee04 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Sun, 3 Dec 2023 23:52:41 -0800 Subject: [PATCH 4/7] chore: update changelog & version --- CHANGELOG.md | 3 ++- unstructured/__version__.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1fa0cdf5c..c198d4fd84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.11.4-dev2 +## 0.11.4-dev3 ### Enhancements +* **Refactor image extraction code.** The image extraction code is moved from `unstructured-inference` to `unstructured`. * **Refactor pdfminer code.** The pdfminer code is moved from `unstructured-inference` to `unstructured`. ### Features diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 6feed12e46..18e5d9c421 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.11.4-dev2" # pragma: no cover +__version__ = "0.11.4-dev3" # pragma: no cover From d43e8176a53f39c4405c9c650fe7c164dde0ac59 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Mon, 4 Dec 2023 00:05:03 -0800 Subject: [PATCH 5/7] feat: revert image file name change --- unstructured/partition/pdf_image/pdf_image_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index 1c73a7c07f..a3dd83774f 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -97,7 +97,8 @@ def extract_images_from_elements( ) image_paths = cast(List[str], _image_paths) - for i, el in enumerate(elements): + figure_number = 0 + for el in elements: coordinates = el.metadata.coordinates if not coordinates or not coordinates.points or el.category != ElementType.IMAGE: continue @@ -107,7 +108,7 @@ def extract_images_from_elements( x2, y2 = points[2] page_number = el.metadata.page_number - figure_number = i + 1 + figure_number += 1 try: output_f_path = os.path.join( output_dir_path, From 0663e5cfc870ffaf0277c4acae76112f230ce1fe Mon Sep 17 00:00:00 2001 From: christinestraub Date: Mon, 4 Dec 2023 10:38:05 -0800 Subject: [PATCH 6/7] test: fix unit test errors --- .../partition/pdf_image/test_pdf_image_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index 72c92e8f7e..91c991b926 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -1,6 +1,7 @@ import os import tempfile +import numpy as np import pytest from PIL import Image as PILImg @@ -11,7 +12,10 @@ @pytest.mark.parametrize("image_type", ["pil", "numpy_array"]) -def test_write_image(image_type, mock_pil_image, mock_numpy_image): +def test_write_image(image_type): + mock_pil_image = PILImg.new("RGB", (50, 50)) + mock_numpy_image = np.zeros((50, 50, 3), np.uint8) + image_map = { "pil": mock_pil_image, "numpy_array": mock_numpy_image, From fcf538f922c70a4b747d7c9e67117484995bc983 Mon Sep 17 00:00:00 2001 From: christinestraub Date: Tue, 5 Dec 2023 09:16:26 -0800 Subject: [PATCH 7/7] chore: bump unstructured-inference==0.7.18 & make pip-compile --- docs/requirements.txt | 2 +- requirements/build.txt | 2 +- requirements/dev.txt | 10 ++++----- requirements/extra-markdown.txt | 2 +- requirements/extra-paddleocr.txt | 4 ++-- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 6 ++--- requirements/ingest/azure.txt | 2 +- requirements/ingest/delta-table.txt | 2 +- requirements/ingest/embed-aws-bedrock.txt | 6 ++--- requirements/ingest/embed-huggingface.txt | 6 ++--- requirements/ingest/embed-openai.txt | 8 +++---- requirements/ingest/onedrive.txt | 2 +- requirements/ingest/outlook.txt | 2 +- requirements/ingest/reddit.txt | 2 +- requirements/ingest/sharepoint.txt | 2 +- requirements/ingest/slack.txt | 2 +- requirements/ingest/weaviate.txt | 27 ++++++++++------------- requirements/test.txt | 4 ++-- 19 files changed, 45 insertions(+), 48 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 129ce79f3d..ee5fdd1d2d 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -36,7 +36,7 @@ idna==3.6 # requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.9.0 +importlib-metadata==7.0.0 # via sphinx jinja2==3.1.2 # via diff --git a/requirements/build.txt b/requirements/build.txt index 129ce79f3d..ee5fdd1d2d 100644 --- a/requirements/build.txt +++ b/requirements/build.txt @@ -36,7 +36,7 @@ idna==3.6 # requests imagesize==1.4.1 # via sphinx -importlib-metadata==6.9.0 +importlib-metadata==7.0.0 # via sphinx jinja2==3.1.2 # via diff --git a/requirements/dev.txt b/requirements/dev.txt index c5592adbd0..0f880f65b3 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -91,7 +91,7 @@ idna==3.6 # anyio # jsonschema # requests -importlib-metadata==6.9.0 +importlib-metadata==7.0.0 # via # build # jupyter-client @@ -167,7 +167,7 @@ jupyter-events==0.9.0 # via jupyter-server jupyter-lsp==2.2.1 # via jupyterlab -jupyter-server==2.11.1 +jupyter-server==2.11.2 # via # jupyter-lsp # jupyterlab @@ -198,7 +198,7 @@ mistune==3.0.2 # via nbconvert nbclient==0.9.0 # via nbconvert -nbconvert==7.11.0 +nbconvert==7.12.0 # via # jupyter # jupyter-server @@ -290,7 +290,7 @@ pyyaml==6.0.1 # -c test.txt # jupyter-events # pre-commit -pyzmq==25.1.1 +pyzmq==25.1.2 # via # ipykernel # jupyter-client @@ -405,7 +405,7 @@ webencodings==0.5.1 # via # bleach # tinycss2 -websocket-client==1.6.4 +websocket-client==1.7.0 # via jupyter-server wheel==0.42.0 # via diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 940336c7ca..c2c30d59a0 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=extra-markdown.txt extra-markdown.in # -importlib-metadata==6.9.0 +importlib-metadata==7.0.0 # via markdown markdown==3.5.1 # via -r extra-markdown.in diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 896e548897..f9b3ba0e9f 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -45,7 +45,7 @@ flask==3.0.0 # visualdl flask-babel==4.0.0 # via visualdl -fonttools==4.45.1 +fonttools==4.46.0 # via matplotlib future==0.18.3 # via bce-python-sdk @@ -59,7 +59,7 @@ imageio==2.33.0 # scikit-image imgaug==0.4.0 # via unstructured-paddleocr -importlib-metadata==6.9.0 +importlib-metadata==7.0.0 # via flask importlib-resources==6.1.1 # via matplotlib diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 4ccf33f804..a3184f77df 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -8,7 +8,7 @@ pikepdf pypdf # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.7.17 +unstructured-inference==0.7.18 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats # from one tesseract call unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index dc7f2ec7a3..468aabcecf 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -37,7 +37,7 @@ filelock==3.13.1 # transformers flatbuffers==23.5.26 # via onnxruntime -fonttools==4.45.1 +fonttools==4.46.0 # via matplotlib fsspec==2023.9.1 # via @@ -134,7 +134,7 @@ pdfminer-six==20221105 # pdfplumber pdfplumber==0.10.3 # via layoutparser -pikepdf==8.7.1 +pikepdf==8.8.0 # via -r extra-pdf-image.in pillow==10.0.1 # via @@ -250,7 +250,7 @@ typing-extensions==4.8.0 # torch tzdata==2023.3 # via pandas -unstructured-inference==0.7.17 +unstructured-inference==0.7.18 # via -r extra-pdf-image.in unstructured-pytesseract==0.3.12 # via diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt index c54b4e0aeb..28ade9b0c7 100644 --- a/requirements/ingest/azure.txt +++ b/requirements/ingest/azure.txt @@ -60,7 +60,7 @@ idna==3.6 # yarl isodate==0.6.1 # via azure-storage-blob -msal==1.25.0 +msal==1.26.0 # via # azure-datalake-store # azure-identity diff --git a/requirements/ingest/delta-table.txt b/requirements/ingest/delta-table.txt index 1203ed11df..c66e481e47 100644 --- a/requirements/ingest/delta-table.txt +++ b/requirements/ingest/delta-table.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=ingest/delta-table.txt ingest/delta-table.in # -deltalake==0.13.0 +deltalake==0.14.0 # via -r ingest/delta-table.in fsspec==2023.9.1 # via diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index e155405f14..c8cdfde16b 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -64,11 +64,11 @@ jsonpatch==1.33 # langchain-core jsonpointer==2.4 # via jsonpatch -langchain==0.0.344 +langchain==0.0.345 # via -r ingest/embed-aws-bedrock.in -langchain-core==0.0.8 +langchain-core==0.0.9 # via langchain -langsmith==0.0.68 +langsmith==0.0.69 # via # langchain # langchain-core diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index 771a613391..781ea27f3b 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -79,11 +79,11 @@ jsonpatch==1.33 # langchain-core jsonpointer==2.4 # via jsonpatch -langchain==0.0.344 +langchain==0.0.345 # via -r ingest/embed-huggingface.in -langchain-core==0.0.8 +langchain-core==0.0.9 # via langchain -langsmith==0.0.68 +langsmith==0.0.69 # via # langchain # langchain-core diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 0486bc927d..74f3b199b2 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -64,11 +64,11 @@ jsonpatch==1.33 # langchain-core jsonpointer==2.4 # via jsonpatch -langchain==0.0.344 +langchain==0.0.345 # via -r ingest/embed-openai.in -langchain-core==0.0.8 +langchain-core==0.0.9 # via langchain -langsmith==0.0.68 +langsmith==0.0.69 # via # langchain # langchain-core @@ -125,7 +125,7 @@ tenacity==8.2.3 # via # langchain # langchain-core -tiktoken==0.5.1 +tiktoken==0.5.2 # via -r ingest/embed-openai.in tqdm==4.66.1 # via diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt index 155fdcb36a..babcca6a7a 100644 --- a/requirements/ingest/onedrive.txt +++ b/requirements/ingest/onedrive.txt @@ -29,7 +29,7 @@ idna==3.6 # via # -c ingest/../base.txt # requests -msal==1.25.0 +msal==1.26.0 # via # -r ingest/onedrive.in # office365-rest-python-client diff --git a/requirements/ingest/outlook.txt b/requirements/ingest/outlook.txt index 03aa4ffdd0..9d4c5b5312 100644 --- a/requirements/ingest/outlook.txt +++ b/requirements/ingest/outlook.txt @@ -23,7 +23,7 @@ idna==3.6 # via # -c ingest/../base.txt # requests -msal==1.25.0 +msal==1.26.0 # via # -r ingest/outlook.in # office365-rest-python-client diff --git a/requirements/ingest/reddit.txt b/requirements/ingest/reddit.txt index 55fb63dc8f..7c6e92c4b9 100644 --- a/requirements/ingest/reddit.txt +++ b/requirements/ingest/reddit.txt @@ -33,5 +33,5 @@ urllib3==1.26.18 # -c ingest/../base.txt # -c ingest/../constraints.in # requests -websocket-client==1.6.4 +websocket-client==1.7.0 # via praw diff --git a/requirements/ingest/sharepoint.txt b/requirements/ingest/sharepoint.txt index d49b89c227..1196dfd580 100644 --- a/requirements/ingest/sharepoint.txt +++ b/requirements/ingest/sharepoint.txt @@ -23,7 +23,7 @@ idna==3.6 # via # -c ingest/../base.txt # requests -msal==1.25.0 +msal==1.26.0 # via # -r ingest/sharepoint.in # office365-rest-python-client diff --git a/requirements/ingest/slack.txt b/requirements/ingest/slack.txt index 0520f221e6..02a878985e 100644 --- a/requirements/ingest/slack.txt +++ b/requirements/ingest/slack.txt @@ -4,5 +4,5 @@ # # pip-compile --output-file=ingest/slack.txt ingest/slack.in # -slack-sdk==3.26.0 +slack-sdk==3.26.1 # via -r ingest/slack.in diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt index b95aba5b4f..315aabd93e 100644 --- a/requirements/ingest/weaviate.txt +++ b/requirements/ingest/weaviate.txt @@ -2,44 +2,41 @@ # This file is autogenerated by pip-compile with Python 3.8 # by the following command: # -# pip-compile --constraint=requirements/constraints.in requirements/ingest/weaviate.in +# pip-compile --output-file=ingest/weaviate.txt ingest/weaviate.in # authlib==1.2.1 # via weaviate-client certifi==2023.11.17 # via - # -c requirements/constraints.in - # -c requirements/ingest/../base.txt - # -c requirements/ingest/../constraints.in + # -c ingest/../base.txt + # -c ingest/../constraints.in # requests cffi==1.16.0 # via cryptography charset-normalizer==3.3.2 # via - # -c requirements/ingest/../base.txt + # -c ingest/../base.txt # requests -cryptography==41.0.5 +cryptography==41.0.7 # via authlib -idna==3.4 +idna==3.6 # via - # -c requirements/ingest/../base.txt + # -c ingest/../base.txt # requests pycparser==2.21 # via cffi requests==2.31.0 # via - # -c requirements/ingest/../base.txt + # -c ingest/../base.txt # weaviate-client urllib3==1.26.18 # via - # -c requirements/constraints.in - # -c requirements/ingest/../base.txt - # -c requirements/ingest/../constraints.in + # -c ingest/../base.txt + # -c ingest/../constraints.in # requests validators==0.22.0 # via weaviate-client weaviate-client==3.25.3 # via - # -c requirements/constraints.in - # -c requirements/ingest/../constraints.in - # -r requirements/ingest/weaviate.in + # -c ingest/../constraints.in + # -r ingest/weaviate.in diff --git a/requirements/test.txt b/requirements/test.txt index c7fdb8383e..83829466b1 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -32,7 +32,7 @@ exceptiongroup==1.2.0 # via pytest flake8==6.1.0 # via -r test.in -freezegun==1.2.2 +freezegun==1.3.1 # via -r test.in grpcio==1.59.3 # via -r test.in @@ -105,7 +105,7 @@ requests==2.31.0 # via # -c base.txt # label-studio-sdk -ruff==0.1.6 +ruff==0.1.7 # via -r test.in six==1.16.0 # via