From 1d63200a3cf00f159e68efcc55c704b70eb5cc2a Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 2 Jul 2024 15:54:43 -0400 Subject: [PATCH 1/8] Extract list parameters in `parse_form_data` When the client prepares the request, it turns list parameters into multiple instances of the same key. For instance: `extract_image_block_types=["Image", "Table"]` becomes `extract_image_block_types[]="Image"` `extract_image_block_types[]="Table"` We need to account for this in our `parse_form_data` helper if we want to use list params in our hooks. Likewise, we need to go the other way when recreating the request in `create_request_body`. --- .../unit/test_split_pdf_hook.py | 62 ++++++++++++++----- .../_hooks/custom/form_utils.py | 11 +++- .../_hooks/custom/request_utils.py | 28 ++++++--- 3 files changed, 74 insertions(+), 27 deletions(-) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index a7f035fe..d6d21768 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -122,7 +122,8 @@ def test_unit_create_response(): def test_unit_create_request(): - """Test create request method properly sets file, Content-Type and Content-Length headers.""" + """Test create request method properly sets file, Content-Type and Content-Length headers. + List parameters should be flattened in the body.""" # Prepare test data request = requests.PreparedRequest() @@ -133,27 +134,27 @@ def test_unit_create_request(): form_data = { "parameter_1": "value_1", "parameter_2": "value_2", + "list_parameter": ["value_1", "value_2"], } page = (io.BytesIO(b"page_content"), 1) filename = "test_file.pdf" # Expected results - expected_payload = { - "parameter_1": "value_1", - "parameter_2": "value_2", - "split_pdf_page": "false", - "starting_page_number": "7", - } expected_page_filename = "test_file.pdf" expected_body = MultipartEncoder( - fields={ - **expected_payload, - "files": ( + fields=[ + ("parameter_1", "value_1"), + ("parameter_2", "value_2"), + ("list_parameter", "value_1"), + ("list_parameter", "value_2"), + ("split_pdf_page", "false"), + ("starting_page_number", "7"), + ("files", ( expected_page_filename, page[0], "application/pdf", - ), - } + )), + ] ) expected_url = "" @@ -164,7 +165,10 @@ def test_unit_create_request(): # Assert the request object assert request_obj.method == "POST" assert request_obj.url == expected_url - assert request_obj.data.fields == expected_body.fields + + # Validate fields ignoring order + assert set(request_obj.data.fields) == set(expected_body.fields) + assert request_content_type.startswith("multipart/form-data") @@ -191,11 +195,37 @@ def test_unit_decode_content_disposition(): def test_unit_parse_form_data(): - """Test parse form data method properly parses the form data and returns dictionary.""" + """Test parse form data method properly parses the form data and returns dictionary. + Parameters with the same key should be consolidated to a list.""" # Prepare test data + test_form_data = ( + b"--boundary\r\n" + b"Content-Disposition: form-data; name=\"files\"; filename=\"test_file.pdf\"\r\n" + b"\r\n" + b"file_content\r\n" + b"--boundary\r\n" + b"Content-Disposition: form-data; name=\"parameter_1\"\r\n" + b"\r\n" + b"value_1\r\n" + b"--boundary\r\n" + b"Content-Disposition: form-data; name=\"parameter_2\"\r\n" + b"\r\n" + b"value_2\r\n" + b"--boundary\r\n" + b"Content-Disposition: form-data; name=\"list_parameter\"\r\n" + b"\r\n" + b"value_1\r\n" + b"--boundary\r\n" + b"Content-Disposition: form-data; name=\"list_parameter\"\r\n" + b"\r\n" + b"value_2\r\n" + b"--boundary--\r\n" + ) + + decoded_data = MultipartDecoder( - b'--boundary\r\nContent-Disposition: form-data; name="files"; filename="test_file.pdf"\r\n\r\nfile_content\r\n--boundary\r\nContent-Disposition: form-data; name="parameter_1"\r\n\r\nvalue_1\r\n--boundary\r\nContent-Disposition: form-data; name="parameter_2"\r\n\r\nvalue_2\r\n--boundary--\r\n', + test_form_data, "multipart/form-data; boundary=boundary", ) @@ -204,6 +234,7 @@ def test_unit_parse_form_data(): "files": shared.Files(b"file_content", "test_file.pdf"), "parameter_1": "value_1", "parameter_2": "value_2", + "list_parameter": ["value_1", "value_2"], } # Parse form data @@ -212,6 +243,7 @@ def test_unit_parse_form_data(): # Assert the parsed form data assert form_data.get("parameter_1") == expected_form_data.get("parameter_1") assert form_data.get("parameter_2") == expected_form_data.get("parameter_2") + assert form_data.get("list_parameter") == expected_form_data.get("list_parameter") assert form_data.get("files").file_name == expected_form_data.get("files").file_name assert form_data.get("files").content == expected_form_data.get("files").content diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index e73ab084..f02ec570 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -9,7 +9,7 @@ from unstructured_client.models import shared logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME) -FormData = dict[str, Union[str, shared.Files]] +FormData = dict[str, Union[str, shared.Files, list[str]]] PARTITION_FORM_FILES_KEY = "files" PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page" @@ -148,6 +148,13 @@ def parse_form_data(decoded_data: MultipartDecoder) -> FormData: raise ValueError("Filename can't be an empty string.") form_data[PARTITION_FORM_FILES_KEY] = shared.Files(part.content, filename) else: - form_data[name] = part.content.decode() + content = part.content.decode() + if name in form_data: + if isinstance(form_data[name], list): + form_data[name].append(content) + else: + form_data[name] = [form_data[name], content] + else: + form_data[name] = content return form_data diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py index a714ddb1..7b83f7a7 100644 --- a/src/unstructured_client/_hooks/custom/request_utils.py +++ b/src/unstructured_client/_hooks/custom/request_utils.py @@ -5,7 +5,7 @@ import io import json import logging -from typing import Optional, Tuple +from typing import Optional, Tuple, Any import httpx import requests @@ -27,16 +27,24 @@ def create_request_body( form_data: FormData, page_content: io.BytesIO, filename: str, page_number: int ) -> MultipartEncoder: payload = prepare_request_payload(form_data) + + payload_fields: list[tuple[str, Any]] = [] + for key, value in payload.items(): + if isinstance(value, list): + payload_fields.extend([(key, list_value) for list_value in value]) + else: + payload_fields.append((key, value)) + + payload_fields.append((PARTITION_FORM_FILES_KEY, ( + filename, + page_content, + "application/pdf", + ))) + + payload_fields.append((PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, str(page_number))) + body = MultipartEncoder( - fields={ - **payload, - PARTITION_FORM_FILES_KEY: ( - filename, - page_content, - "application/pdf", - ), - PARTITION_FORM_STARTING_PAGE_NUMBER_KEY: str(page_number), - } + fields=payload_fields ) return body From f54ad539120ecc7062983052569c5d19e9d4471f Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 10 Jul 2024 09:55:42 -0400 Subject: [PATCH 2/8] Add get_page_range form util helper --- .../unit/test_split_pdf_hook.py | 23 +++++++++++ .../_hooks/custom/form_utils.py | 38 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index d6d21768..25512466 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -9,6 +9,7 @@ from unstructured_client._hooks.custom.form_utils import ( PARTITION_FORM_CONCURRENCY_LEVEL_KEY, PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, + PARTITION_FORM_PAGE_RANGE_KEY, ) from unstructured_client._hooks.custom.split_pdf_hook import ( DEFAULT_CONCURRENCY_LEVEL, @@ -398,3 +399,25 @@ def test_unit_get_starting_page_number(starting_page_number, expected_result): fallback_value=DEFAULT_STARTING_PAGE_NUMBER, ) assert result == expected_result + + +@pytest.mark.parametrize( + "page_range, expected_result", + [ + (["2", "5"], (2, 5)), # Valid range + (["2", "100"], (2, 20)), # End too high + (["-50", "5"], (1, 5)), # Start too low + (None, (1, 20)), # Range not specified + (["foo", "foo"], (1, 20)), # Parse error + ], +) +def test_unit_get_page_range_returns_valid_range(page_range, expected_result): + """Test get_page_range method with different inputs. + Ranges that are out of bounds for a 20 page doc will be adjusted.""" + form_data = {"split_pdf_page_range[]": page_range} + result = form_utils.get_page_range( + form_data, + key=PARTITION_FORM_PAGE_RANGE_KEY, + max_pages=20, + ) + assert result == expected_result diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index f02ec570..489ab8c2 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -13,10 +13,48 @@ PARTITION_FORM_FILES_KEY = "files" PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page" +PARTITION_FORM_PAGE_RANGE_KEY = "split_pdf_page_range[]" PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number" PARTITION_FORM_CONCURRENCY_LEVEL_KEY = "split_pdf_concurrency_level" +def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int, int]: + """Retrieves the split page range from the given form data. + + If the range is invalid or outside the bounds of the page count, + returns (1, num_pages), i.e. the full range. + + Args: + form_data: The form data containing the page range + key: The key to look for in the form data. + + Returns: + The range of pages to send in the request in the form (start, end) + """ + try: + _page_range = form_data.get(key) + + if _page_range is not None: + page_range = (int(_page_range[0]), int(_page_range[1])) + else: + page_range = (1, max_pages) + + except (ValueError, IndexError): + logger.warning( + "'%s' is not a valid page range. Selecting default range (1 to %d).", + _page_range, + max_pages, + ) + page_range = (1, max_pages) + + if page_range[0] < 1 or page_range[1] > max_pages: + new_page_range = (max(page_range[0], 1), min(page_range[1], max_pages)) + logger.warning(f"Page range {page_range} is out of bounds, setting to {new_page_range}.") + page_range = new_page_range + + return page_range + + def get_starting_page_number(form_data: FormData, key: str, fallback_value: int) -> int: """Retrieves the starting page number from the given form data. From f98193cbba14200ebf3548ccaca7da0820466877 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 10 Jul 2024 11:24:05 -0400 Subject: [PATCH 3/8] Add support for page ranges in pdf split hook --- .../integration/test_decorators.py | 72 +++++++++++++++++++ .../_hooks/custom/form_utils.py | 19 +++-- .../_hooks/custom/pdf_utils.py | 10 +-- .../_hooks/custom/request_utils.py | 2 + .../_hooks/custom/split_pdf_hook.py | 35 +++++---- 5 files changed, 112 insertions(+), 26 deletions(-) diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py index 703c8665..124890de 100644 --- a/_test_unstructured_client/integration/test_decorators.py +++ b/_test_unstructured_client/integration/test_decorators.py @@ -110,3 +110,75 @@ def test_integration_split_pdf_for_file_with_no_name(): ) pytest.raises(ValueError, client.general.partition, req) + + +@pytest.mark.parametrize("starting_page_number", [1, 100]) +@pytest.mark.parametrize( + "page_range, expected_ok, expected_pages", + [ + (["1", "14"], True, (1, 14)), # Valid range, start on boundary + (["4", "16"], True, (4, 16)), # Valid range, end on boundary + (["2", "5"], True, (2, 5)), # Valid range within boundary + # A 1 page doc wouldn't normally be split, + # but this code still needs to return the page range + (["6", "6"], True, (6, 6)), + (["2", "100"], False, None), # End page too high + (["50", "100"], False, None), # Range too high + (["-50", "5"], False, None), # Start page too low + (["-50", "-2"], False, None), # Range too low + (["10", "2"], False, None), # Backwards range + ], +) +def test_integration_split_pdf_with_page_range( + starting_page_number: int, + page_range: list[int], + expected_ok: bool, + expected_pages: tuple[int, int], + caplog, +): + """ + Test that we can split pdfs with an arbitrary page range. Send the selected range to the API and assert that the metadata page numbers are correct. + We should also be able to offset the metadata with starting_page_number. + + Requires unstructured-api running in bg. See Makefile for how to run it. + """ + try: + response = requests.get("http://localhost:8000/general/docs") + assert response.status_code == 200, "The unstructured-api is not running on localhost:8000" + except requests.exceptions.ConnectionError: + assert False, "The unstructured-api is not running on localhost:8000" + + client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000") + + filename = "_sample_docs/layout-parser-paper.pdf" + with open(filename, "rb") as f: + files = shared.Files( + content=f.read(), + file_name=filename, + ) + + req = shared.PartitionParameters( + files=files, + strategy="fast", + split_pdf_page=True, + split_pdf_page_range=page_range, + starting_page_number=starting_page_number, + ) + + try: + resp = client.general.partition(req) + except ValueError as exc: + if not expected_ok: + assert "is out of bounds." in caplog.text + assert "is out of bounds." in str(exc) + return + else: + assert exc is None + + page_numbers = set([e["metadata"]["page_number"] for e in resp.elements]) + + min_page_number = expected_pages[0] + starting_page_number - 1 + max_page_number = expected_pages[1] + starting_page_number - 1 + + assert min(page_numbers) == min_page_number, f"Result should start at page {min_page_number}" + assert max(page_numbers) == max_page_number, f"Result should end at page {max_page_number}" diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index 489ab8c2..e3863a49 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -40,17 +40,16 @@ def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int, page_range = (1, max_pages) except (ValueError, IndexError): - logger.warning( - "'%s' is not a valid page range. Selecting default range (1 to %d).", - _page_range, - max_pages, - ) - page_range = (1, max_pages) + msg = f"{_page_range} is not a valid page range." + logger.error(msg) + raise ValueError(msg) + + start, end = page_range - if page_range[0] < 1 or page_range[1] > max_pages: - new_page_range = (max(page_range[0], 1), min(page_range[1], max_pages)) - logger.warning(f"Page range {page_range} is out of bounds, setting to {new_page_range}.") - page_range = new_page_range + if not (0 < start <= max_pages) or not (0 < end <= max_pages) or not (start <= end): + msg = f"Page range {page_range} is out of bounds. Valid range is (1 - {max_pages})." + logger.error(msg) + raise ValueError(msg) return page_range diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py index e9200030..c03a81ef 100644 --- a/src/unstructured_client/_hooks/custom/pdf_utils.py +++ b/src/unstructured_client/_hooks/custom/pdf_utils.py @@ -1,6 +1,6 @@ import io import logging -from typing import Generator, Tuple +from typing import Generator, Tuple, Optional from pypdf import PdfReader, PdfWriter from pypdf.errors import PdfReadError @@ -12,7 +12,7 @@ def get_pdf_pages( - pdf: PdfReader, split_size: int = 1 + pdf: PdfReader, split_size: int = 1, page_start: int = 1, page_end: Optional[int] = None ) -> Generator[Tuple[io.BytesIO, int, int], None, None]: """Reads given bytes of a pdf file and split it into n file-like objects, each with `split_size` pages. @@ -22,13 +22,15 @@ def get_pdf_pages( split_size: Split size, e.g. if the given file has 10 pages and this value is set to 2 it will yield 5 documents, each containing 2 pages of the original document. By default it will split each page to a separate file. + page_start: Begin splitting at this page number + page_end: If provided, split up to and including this page number Yields: The file contents with their page number and overall pages number of the original document. """ - offset = 0 - offset_end = len(pdf.pages) + offset = page_start - 1 + offset_end = page_end or len(pdf.pages) while offset < offset_end: new_pdf = PdfWriter() diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py index 7b83f7a7..0dde007d 100644 --- a/src/unstructured_client/_hooks/custom/request_utils.py +++ b/src/unstructured_client/_hooks/custom/request_utils.py @@ -16,6 +16,7 @@ from unstructured_client._hooks.custom.form_utils import ( PARTITION_FORM_FILES_KEY, PARTITION_FORM_SPLIT_PDF_PAGE_KEY, + PARTITION_FORM_PAGE_RANGE_KEY, PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, FormData, ) @@ -145,6 +146,7 @@ def prepare_request_payload(form_data: FormData) -> FormData: payload = copy.deepcopy(form_data) payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None) payload.pop(PARTITION_FORM_FILES_KEY, None) + payload.pop(PARTITION_FORM_PAGE_RANGE_KEY, None) payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None) updated_parameters = { PARTITION_FORM_SPLIT_PDF_PAGE_KEY: "false", diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index a983aa91..cd65d555 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -18,6 +18,7 @@ from unstructured_client._hooks.custom.form_utils import ( PARTITION_FORM_CONCURRENCY_LEVEL_KEY, PARTITION_FORM_FILES_KEY, + PARTITION_FORM_PAGE_RANGE_KEY, PARTITION_FORM_SPLIT_PDF_PAGE_KEY, PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, ) @@ -143,7 +144,10 @@ def before_request( key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, fallback_value=DEFAULT_STARTING_PAGE_NUMBER, ) - logger.info("Starting page number set to %d", starting_page_number) + + if starting_page_number > 1: + logger.info("Starting page number set to %d", starting_page_number) + concurrency_level = form_utils.get_split_pdf_concurrency_level_param( form_data, key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY, @@ -154,27 +158,34 @@ def before_request( limiter = asyncio.Semaphore(concurrency_level) pdf = PdfReader(io.BytesIO(file.content)) + + page_range_start, page_range_end = form_utils.get_page_range( + form_data, + key=PARTITION_FORM_PAGE_RANGE_KEY, + max_pages=len(pdf.pages), + ) + + page_count = min(len(pdf.pages), page_range_end - page_range_start + 1) + logger.info(f"Splitting pages {page_range_start} to {page_range_end} ({page_count} total)") + split_size = get_optimal_split_size( - num_pages=len(pdf.pages), concurrency_level=concurrency_level + num_pages=page_count, concurrency_level=concurrency_level ) logger.info("Determined optimal split size of %d pages.", split_size) - if split_size >= len(pdf.pages): + # If the doc is small enough, and we aren't slicing it with a page range: + # do not split, just continue with the original request + if split_size >= page_count and page_count == len(pdf.pages): logger.info( "Document has too few pages (%d) to be split efficiently. Partitioning without split.", - len(pdf.pages), + page_count, ) return request - pages = pdf_utils.get_pdf_pages(pdf, split_size) - logger.info( - "Document split into %d, %d-paged sets.", - math.ceil(len(pdf.pages) / split_size), - split_size, - ) + pages = pdf_utils.get_pdf_pages(pdf, split_size=split_size, page_start=page_range_start, page_end=page_range_end) logger.info( "Partitioning %d, %d-paged sets.", - math.ceil(len(pdf.pages) / split_size), + math.ceil(page_count / split_size), split_size, ) @@ -209,7 +220,7 @@ async def call_api_partial(page): "Partitioning set #%d (pages %d-%d).", set_index, page_number, - min(page_number + split_size, all_pages_number), + min(page_number + split_size - 1, all_pages_number), ) # Check if this set of pages is the last one if page_index + split_size >= all_pages_number: From 80902b510d46264225bb56b2eca5e0da78ad1caa Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 10 Jul 2024 13:11:09 -0400 Subject: [PATCH 4/8] Add split_pdf_page_range parameter, update unit tests --- README.md | 12 +++++++ USAGE.md | 4 +++ .../unit/test_split_pdf_hook.py | 33 +++++++++++++------ docs/models/shared/partitionparameters.md | 1 + overlay_client.yaml | 12 +++++++ .../models/shared/partition_parameters.py | 2 ++ 6 files changed, 54 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index b15d5371..99e00975 100755 --- a/README.md +++ b/README.md @@ -55,6 +55,10 @@ res = s.general.partition(request=operations.PartitionRequest( content='0x2cC94b2FEF'.encode(), file_name='your_file_here', ), + split_pdf_page_range=[ + 1, + 10, + ], strategy=shared.Strategy.AUTO, ), )) @@ -110,6 +114,10 @@ res = s.general.partition(request=operations.PartitionRequest( content='0x2cC94b2FEF'.encode(), file_name='your_file_here', ), + split_pdf_page_range=[ + 1, + 10, + ], strategy=shared.Strategy.AUTO, ), ), @@ -139,6 +147,10 @@ res = s.general.partition(request=operations.PartitionRequest( content='0x2cC94b2FEF'.encode(), file_name='your_file_here', ), + split_pdf_page_range=[ + 1, + 10, + ], strategy=shared.Strategy.AUTO, ), )) diff --git a/USAGE.md b/USAGE.md index 3af1a129..c58c078b 100644 --- a/USAGE.md +++ b/USAGE.md @@ -14,6 +14,10 @@ res = s.general.partition(request=operations.PartitionRequest( content='0x2cC94b2FEF'.encode(), file_name='your_file_here', ), + split_pdf_page_range=[ + 1, + 10, + ], strategy=shared.Strategy.AUTO, ), )) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index 25512466..c7893539 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -404,20 +404,33 @@ def test_unit_get_starting_page_number(starting_page_number, expected_result): @pytest.mark.parametrize( "page_range, expected_result", [ - (["2", "5"], (2, 5)), # Valid range - (["2", "100"], (2, 20)), # End too high - (["-50", "5"], (1, 5)), # Start too low - (None, (1, 20)), # Range not specified - (["foo", "foo"], (1, 20)), # Parse error + (["1", "14"], (1, 14)), # Valid range, start on boundary + (["4", "16"], (4, 16)), # Valid range, end on boundary + (None, (1, 20)), # Range not specified, defaults to full range + (["2", "5"], (2, 5)), # Valid range within boundary + (["2", "100"], None), # End page too high + (["50", "100"], None), # Range too high + (["-50", "5"], None), # Start page too low + (["-50", "-2"], None), # Range too low + (["10", "2"], None), # Backwards range + (["foo", "foo"], None), # Parse error ], ) def test_unit_get_page_range_returns_valid_range(page_range, expected_result): """Test get_page_range method with different inputs. Ranges that are out of bounds for a 20 page doc will be adjusted.""" form_data = {"split_pdf_page_range[]": page_range} - result = form_utils.get_page_range( - form_data, - key=PARTITION_FORM_PAGE_RANGE_KEY, - max_pages=20, - ) + try: + result = form_utils.get_page_range( + form_data, + key=PARTITION_FORM_PAGE_RANGE_KEY, + max_pages=20, + ) + except ValueError as exc: + if not expected_result: + assert "is out of bounds." in str(exc) or "is not a valid page range." in str(exc) + return + else: + assert exc is None + assert result == expected_result diff --git a/docs/models/shared/partitionparameters.md b/docs/models/shared/partitionparameters.md index 7cea8024..2fbfa7ec 100644 --- a/docs/models/shared/partitionparameters.md +++ b/docs/models/shared/partitionparameters.md @@ -28,6 +28,7 @@ | `skip_infer_table_types` | List[*str*] | :heavy_minus_sign: | The document types that you want to skip table extraction with. Default: [] | | | `split_pdf_concurrency_level` | *Optional[int]* | :heavy_minus_sign: | When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend. | | | `split_pdf_page` | *Optional[bool]* | :heavy_minus_sign: | This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend. | | +| `split_pdf_page_range` | List[*int*] | :heavy_minus_sign: | When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. It's an internal parameter for the Python client and is not sent to the backend. | [
1,
10
] | | `starting_page_number` | *Optional[int]* | :heavy_minus_sign: | When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27. | | | `strategy` | [Optional[shared.Strategy]](../../models/shared/strategy.md) | :heavy_minus_sign: | The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto | auto | | `unique_element_ids` | *Optional[bool]* | :heavy_minus_sign: | When `True`, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: `False` | | diff --git a/overlay_client.yaml b/overlay_client.yaml index db1af850..a64130ae 100644 --- a/overlay_client.yaml +++ b/overlay_client.yaml @@ -12,6 +12,18 @@ actions: "description": "This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend.", "default": true, } + - target: $["components"]["schemas"]["partition_parameters"]["properties"] + update: + "split_pdf_page_range": + { + "type": "array", + "title": "Split Pdf Page Range", + "description": "When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. It's an internal parameter for the Python client and is not sent to the backend.", + "items": {"type": "integer"}, + "minItems": 2, + "maxItems": 2, + "example": [1, 10], + } - target: $["components"]["schemas"]["partition_parameters"]["properties"] update: "split_pdf_concurrency_level": diff --git a/src/unstructured_client/models/shared/partition_parameters.py b/src/unstructured_client/models/shared/partition_parameters.py index 485428a6..985aea87 100644 --- a/src/unstructured_client/models/shared/partition_parameters.py +++ b/src/unstructured_client/models/shared/partition_parameters.py @@ -85,6 +85,8 @@ class PartitionParameters: r"""When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend.""" split_pdf_page: Optional[bool] = dataclasses.field(default=True, metadata={'multipart_form': { 'field_name': 'split_pdf_page' }}) r"""This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend.""" + split_pdf_page_range: Optional[List[int]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'split_pdf_page_range' }}) + r"""When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. It's an internal parameter for the Python client and is not sent to the backend.""" starting_page_number: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'starting_page_number' }}) r"""When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27.""" strategy: Optional[Strategy] = dataclasses.field(default=Strategy.AUTO, metadata={'multipart_form': { 'field_name': 'strategy' }}) From 39483e97068d431e3230d35e9d28bcf6de312642 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 10 Jul 2024 13:29:51 -0400 Subject: [PATCH 5/8] Update param docstring --- overlay_client.yaml | 2 +- src/unstructured_client/models/shared/partition_parameters.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/overlay_client.yaml b/overlay_client.yaml index a64130ae..67c35cd5 100644 --- a/overlay_client.yaml +++ b/overlay_client.yaml @@ -18,7 +18,7 @@ actions: { "type": "array", "title": "Split Pdf Page Range", - "description": "When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. It's an internal parameter for the Python client and is not sent to the backend.", + "description": "When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. A ValueError is thrown if the given range is invalid. It's an internal parameter for the Python client and is not sent to the backend.", "items": {"type": "integer"}, "minItems": 2, "maxItems": 2, diff --git a/src/unstructured_client/models/shared/partition_parameters.py b/src/unstructured_client/models/shared/partition_parameters.py index 985aea87..7f2fe724 100644 --- a/src/unstructured_client/models/shared/partition_parameters.py +++ b/src/unstructured_client/models/shared/partition_parameters.py @@ -86,7 +86,7 @@ class PartitionParameters: split_pdf_page: Optional[bool] = dataclasses.field(default=True, metadata={'multipart_form': { 'field_name': 'split_pdf_page' }}) r"""This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend.""" split_pdf_page_range: Optional[List[int]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'split_pdf_page_range' }}) - r"""When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. It's an internal parameter for the Python client and is not sent to the backend.""" + r"""When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. A ValueError is thrown if the given range is invalid. It's an internal parameter for the Python client and is not sent to the backend.""" starting_page_number: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'starting_page_number' }}) r"""When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27.""" strategy: Optional[Strategy] = dataclasses.field(default=Strategy.AUTO, metadata={'multipart_form': { 'field_name': 'strategy' }}) From fe6a6a30be5c8ae951a9c797c81c862958e18a68 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 10 Jul 2024 14:11:28 -0400 Subject: [PATCH 6/8] Update comment --- _test_unstructured_client/unit/test_split_pdf_hook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index c7893539..7853c09e 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -418,7 +418,7 @@ def test_unit_get_starting_page_number(starting_page_number, expected_result): ) def test_unit_get_page_range_returns_valid_range(page_range, expected_result): """Test get_page_range method with different inputs. - Ranges that are out of bounds for a 20 page doc will be adjusted.""" + Ranges that are out of bounds for a 20 page doc will throw a ValueError.""" form_data = {"split_pdf_page_range[]": page_range} try: result = form_utils.get_page_range( From f2ad642e73b5ef93357c5ba8f53bb368013206f8 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 10 Jul 2024 14:31:26 -0400 Subject: [PATCH 7/8] Fix pylint errors --- src/unstructured_client/_hooks/custom/form_utils.py | 6 +++--- src/unstructured_client/_hooks/custom/split_pdf_hook.py | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index e3863a49..ab51e19b 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -39,14 +39,14 @@ def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int, else: page_range = (1, max_pages) - except (ValueError, IndexError): + except (ValueError, IndexError) as exc: msg = f"{_page_range} is not a valid page range." logger.error(msg) - raise ValueError(msg) + raise ValueError(msg) from exc start, end = page_range - if not (0 < start <= max_pages) or not (0 < end <= max_pages) or not (start <= end): + if not 0 < start <= max_pages or not 0 < end <= max_pages or not start <= end: msg = f"Page range {page_range} is out of bounds. Valid range is (1 - {max_pages})." logger.error(msg) raise ValueError(msg) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index cd65d555..cbeacfdb 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -166,7 +166,12 @@ def before_request( ) page_count = min(len(pdf.pages), page_range_end - page_range_start + 1) - logger.info(f"Splitting pages {page_range_start} to {page_range_end} ({page_count} total)") + logger.info( + "Splitting pages %d to %d (%d total)", + page_range_start, + page_range_end, + page_count, + ) split_size = get_optimal_split_size( num_pages=page_count, concurrency_level=concurrency_level From ab11a4d143dcf57d70212fe1f177700bc967ab19 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Thu, 11 Jul 2024 15:59:37 -0400 Subject: [PATCH 8/8] Update some log messages, unit test assertions --- .../integration/test_decorators.py | 10 ++++------ .../unit/test_split_pdf_hook.py | 8 +++----- src/unstructured_client/_hooks/custom/form_utils.py | 2 +- .../_hooks/custom/split_pdf_hook.py | 13 ++++++++++--- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py index 124890de..16d5cdf8 100644 --- a/_test_unstructured_client/integration/test_decorators.py +++ b/_test_unstructured_client/integration/test_decorators.py @@ -168,12 +168,10 @@ def test_integration_split_pdf_with_page_range( try: resp = client.general.partition(req) except ValueError as exc: - if not expected_ok: - assert "is out of bounds." in caplog.text - assert "is out of bounds." in str(exc) - return - else: - assert exc is None + assert not expected_ok + assert "is out of bounds." in caplog.text + assert "is out of bounds." in str(exc) + return page_numbers = set([e["metadata"]["page_number"] for e in resp.elements]) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index 7853c09e..ee364eb9 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -427,10 +427,8 @@ def test_unit_get_page_range_returns_valid_range(page_range, expected_result): max_pages=20, ) except ValueError as exc: - if not expected_result: - assert "is out of bounds." in str(exc) or "is not a valid page range." in str(exc) - return - else: - assert exc is None + assert not expected_result + assert "is out of bounds." in str(exc) or "is not a valid page range." in str(exc) + return assert result == expected_result diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index ab51e19b..b04f5d8b 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -47,7 +47,7 @@ def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int, start, end = page_range if not 0 < start <= max_pages or not 0 < end <= max_pages or not start <= end: - msg = f"Page range {page_range} is out of bounds. Valid range is (1 - {max_pages})." + msg = f"Page range {page_range} is out of bounds. Start and end values should be between 1 and {max_pages}." logger.error(msg) raise ValueError(msg) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index cbeacfdb..1d3a5714 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -165,7 +165,7 @@ def before_request( max_pages=len(pdf.pages), ) - page_count = min(len(pdf.pages), page_range_end - page_range_start + 1) + page_count = page_range_end - page_range_start + 1 logger.info( "Splitting pages %d to %d (%d total)", page_range_start, @@ -189,11 +189,18 @@ def before_request( pages = pdf_utils.get_pdf_pages(pdf, split_size=split_size, page_start=page_range_start, page_end=page_range_end) logger.info( - "Partitioning %d, %d-paged sets.", - math.ceil(page_count / split_size), + "Partitioning %d files with %d page(s) each.", + math.floor(page_count / split_size), split_size, ) + # Log the remainder pages if there are any + if page_count % split_size > 0: + logger.info( + "Partitioning 1 file with %d page(s).", + page_count % split_size, + ) + async def call_api_partial(page): async with httpx.AsyncClient() as client: status_code, json_response = await request_utils.call_api_async(