From e8ec363554f23d69e7f330a9982a2e3bf01a3bab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Wed, 24 Jul 2024 17:59:49 +0200 Subject: [PATCH 1/9] chore: added new parameter description in doc files --- README.md | 15 ++++++ USAGE.md | 1 + docs/models/shared/partitionparameters.md | 62 +++++++++++------------ 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 8f222dec..ff99466d 100755 --- a/README.md +++ b/README.md @@ -109,6 +109,21 @@ req = shared.PartitionParameters( ) ``` +#### Splitting PDF by pages - strict mode + +When `split_pdf_allow_failed=False` (the default), any errors encountered during sending parallel request will break the process and raise an exception. +When `split_pdf_allow_failed=True`, the process will continue even if some requests fail, and the results will be combined at the end (the output from the errored pages will not be included). + +Example: +```python +req = shared.PartitionParameters( + files=files, + strategy="fast", + languages=["eng"], + split_pdf_allow_failed=True, +) +``` + ## Retries diff --git a/USAGE.md b/USAGE.md index c58c078b..b35323a5 100644 --- a/USAGE.md +++ b/USAGE.md @@ -18,6 +18,7 @@ res = s.general.partition(request=operations.PartitionRequest( 1, 10, ], + split_pdf_allow_failed=False, strategy=shared.Strategy.AUTO, ), )) diff --git a/docs/models/shared/partitionparameters.md b/docs/models/shared/partitionparameters.md index 6133821a..00947422 100644 --- a/docs/models/shared/partitionparameters.md +++ b/docs/models/shared/partitionparameters.md @@ -1,35 +1,35 @@ # PartitionParameters - ## Fields -| Field | Type | Required | Description | Example | -| 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `files` | [shared.Files](../../models/shared/files.md) | :heavy_check_mark: | The file to extract | | -| `chunking_strategy` | [Optional[shared.ChunkingStrategy]](../../models/shared/chunkingstrategy.md) | :heavy_minus_sign: | Use one of the supported strategies to chunk the returned elements after partitioning. 
When 'chunking_strategy' is not specified, no chunking is performed and any other chunking parameters provided are ignored. Supported strategies: 'basic', 'by_page', 'by_similarity', or 'by_title' | | -| `combine_under_n_chars` | *Optional[int]* | :heavy_minus_sign: | If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500 | | -| `coordinates` | *Optional[bool]* | :heavy_minus_sign: | If `True`, return coordinates for each element extracted via OCR. Default: `False` | | -| `encoding` | *Optional[str]* | :heavy_minus_sign: | The encoding method used to decode the text input. Default: utf-8 | | -| `extract_image_block_types` | List[*str*] | :heavy_minus_sign: | The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields. | | -| `gz_uncompressed_content_type` | *Optional[str]* | :heavy_minus_sign: | If file is gzipped, use this content type after unzipping. | | -| `hi_res_model_name` | *Optional[str]* | :heavy_minus_sign: | The name of the inference model used when strategy is hi_res | | -| `include_orig_elements` | *Optional[bool]* | :heavy_minus_sign: | When a chunking strategy is specified, each returned chunk will include the elements consolidated to form that chunk as `.metadata.orig_elements`. Default: true. | | -| `include_page_breaks` | *Optional[bool]* | :heavy_minus_sign: | If true, the output will include page breaks if the filetype supports it. Default: false | | -| `languages` | List[*str*] | :heavy_minus_sign: | The languages present in the document, for use in partitioning and/or OCR. See the Tesseract documentation for a full list of languages. | | -| `max_characters` | *Optional[int]* | :heavy_minus_sign: | If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). 
Default: 500 | | -| `multipage_sections` | *Optional[bool]* | :heavy_minus_sign: | If chunking strategy is set, determines if sections can span multiple sections. Default: true | | -| `new_after_n_chars` | *Optional[int]* | :heavy_minus_sign: | If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500 | | -| `ocr_languages` | List[*str*] | :heavy_minus_sign: | Deprecated! The languages present in the document, for use in partitioning and/or OCR | | -| `output_format` | [Optional[shared.OutputFormat]](../../models/shared/outputformat.md) | :heavy_minus_sign: | The format of the response. Supported formats are application/json and text/csv. Default: application/json. | | -| `overlap` | *Optional[int]* | :heavy_minus_sign: | Specifies the length of a string ('tail') to be drawn from each chunk and prefixed to the next chunk as a context-preserving mechanism. By default, this only applies to split-chunks where an oversized element is divided into multiple chunks by text-splitting. Default: 0 | | -| `overlap_all` | *Optional[bool]* | :heavy_minus_sign: | When `True`, apply overlap between 'normal' chunks formed from whole elements and not subject to text-splitting. Use this with caution as it entails a certain level of 'pollution' of otherwise clean semantic chunk boundaries. Default: False | | -| `pdf_infer_table_structure` | *Optional[bool]* | :heavy_minus_sign: | Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents. | | -| `similarity_threshold` | *Optional[float]* | :heavy_minus_sign: | A value between 0.0 and 1.0 describing the minimum similarity two elements must have to be included in the same chunk. 
Note that similar elements may be separated to meet chunk-size criteria; this value can only guarantees that two elements with similarity below the threshold will appear in separate chunks. | | -| `skip_infer_table_types` | List[*str*] | :heavy_minus_sign: | The document types that you want to skip table extraction with. Default: [] | | -| `split_pdf_concurrency_level` | *Optional[int]* | :heavy_minus_sign: | When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend. | | -| `split_pdf_page` | *Optional[bool]* | :heavy_minus_sign: | This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend. | | -| `split_pdf_page_range` | List[*int*] | :heavy_minus_sign: | When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. A ValueError is thrown if the given range is invalid. It's an internal parameter for the Python client and is not sent to the backend. | [
1,
10
] | -| `starting_page_number` | *Optional[int]* | :heavy_minus_sign: | When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27. | | -| `strategy` | [Optional[shared.Strategy]](../../models/shared/strategy.md) | :heavy_minus_sign: | The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto | auto | -| `unique_element_ids` | *Optional[bool]* | :heavy_minus_sign: | When `True`, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: `False` | | -| `xml_keep_tags` | *Optional[bool]* | :heavy_minus_sign: | If `True`, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to XML documents. | | \ No newline at end of file +| Field | Type | Required | Description | Example | +|--------------------------------|------------------------------------------------------------------------------|--------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------| +| `files` | [shared.Files](../../models/shared/files.md) | :heavy_check_mark: | The file to extract | | +| `chunking_strategy` | [Optional[shared.ChunkingStrategy]](../../models/shared/chunkingstrategy.md) | :heavy_minus_sign: | Use one of the supported strategies to chunk the returned elements after partitioning. When 'chunking_strategy' is not specified, no chunking is performed and any other chunking parameters provided are ignored. 
Supported strategies: 'basic', 'by_page', 'by_similarity', or 'by_title' | | +| `combine_under_n_chars` | *Optional[int]* | :heavy_minus_sign: | If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500 | | +| `coordinates` | *Optional[bool]* | :heavy_minus_sign: | If `True`, return coordinates for each element extracted via OCR. Default: `False` | | +| `encoding` | *Optional[str]* | :heavy_minus_sign: | The encoding method used to decode the text input. Default: utf-8 | | +| `extract_image_block_types` | List[*str*] | :heavy_minus_sign: | The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields. | | +| `gz_uncompressed_content_type` | *Optional[str]* | :heavy_minus_sign: | If file is gzipped, use this content type after unzipping. | | +| `hi_res_model_name` | *Optional[str]* | :heavy_minus_sign: | The name of the inference model used when strategy is hi_res | | +| `include_orig_elements` | *Optional[bool]* | :heavy_minus_sign: | When a chunking strategy is specified, each returned chunk will include the elements consolidated to form that chunk as `.metadata.orig_elements`. Default: true. | | +| `include_page_breaks` | *Optional[bool]* | :heavy_minus_sign: | If true, the output will include page breaks if the filetype supports it. Default: false | | +| `languages` | List[*str*] | :heavy_minus_sign: | The languages present in the document, for use in partitioning and/or OCR. See the Tesseract documentation for a full list of languages. | | +| `max_characters` | *Optional[int]* | :heavy_minus_sign: | If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 500 | | +| `multipage_sections` | *Optional[bool]* | :heavy_minus_sign: | If chunking strategy is set, determines if sections can span multiple sections. 
Default: true | | +| `new_after_n_chars` | *Optional[int]* | :heavy_minus_sign: | If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500 | | +| `ocr_languages` | List[*str*] | :heavy_minus_sign: | Deprecated! The languages present in the document, for use in partitioning and/or OCR | | +| `output_format` | [Optional[shared.OutputFormat]](../../models/shared/outputformat.md) | :heavy_minus_sign: | The format of the response. Supported formats are application/json and text/csv. Default: application/json. | | +| `overlap` | *Optional[int]* | :heavy_minus_sign: | Specifies the length of a string ('tail') to be drawn from each chunk and prefixed to the next chunk as a context-preserving mechanism. By default, this only applies to split-chunks where an oversized element is divided into multiple chunks by text-splitting. Default: 0 | | +| `overlap_all` | *Optional[bool]* | :heavy_minus_sign: | When `True`, apply overlap between 'normal' chunks formed from whole elements and not subject to text-splitting. Use this with caution as it entails a certain level of 'pollution' of otherwise clean semantic chunk boundaries. Default: False | | +| `pdf_infer_table_structure` | *Optional[bool]* | :heavy_minus_sign: | Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents. | | +| `similarity_threshold` | *Optional[float]* | :heavy_minus_sign: | A value between 0.0 and 1.0 describing the minimum similarity two elements must have to be included in the same chunk. Note that similar elements may be separated to meet chunk-size criteria; this value can only guarantee that two elements with similarity below the threshold will appear in separate chunks. 
| | +| `skip_infer_table_types` | List[*str*] | :heavy_minus_sign: | The document types that you want to skip table extraction with. Default: [] | | +| `split_pdf_allow_failed` | *Optional[bool]* | :heavy_minus_sign: | When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request sent to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resulting list of Elements will miss the data from errored pages. It's an internal parameter for the Python client and is not sent to the backend. | true | +| `split_pdf_concurrency_level` | *Optional[int]* | :heavy_minus_sign: | When `split_pdf_page` is set to `True`, this parameter specifies the number of workers used for sending requests when the PDF is split on the client side. It's an internal parameter for the Python client and is not sent to the backend. | | +| `split_pdf_page` | *Optional[bool]* | :heavy_minus_sign: | This parameter determines if the PDF file should be split on the client side. It's an internal parameter for the Python client and is not sent to the backend. | | +| `split_pdf_page_range` | List[*int*] | :heavy_minus_sign: | When `split_pdf_page` is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. A ValueError is thrown if the given range is invalid. It's an internal parameter for the Python client and is not sent to the backend. | [
1,
10
] | +| `starting_page_number` | *Optional[int]* | :heavy_minus_sign: | When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27. | | +| `strategy` | [Optional[shared.Strategy]](../../models/shared/strategy.md) | :heavy_minus_sign: | The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto | auto | +| `unique_element_ids` | *Optional[bool]* | :heavy_minus_sign: | When `True`, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: `False` | | +| `xml_keep_tags` | *Optional[bool]* | :heavy_minus_sign: | If `True`, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to XML documents. | | \ No newline at end of file From da289e036c9122817b377096e0f67d9473729599 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Wed, 24 Jul 2024 18:00:36 +0200 Subject: [PATCH 2/9] feat: added allow_failed parameter --- .../_hooks/custom/form_utils.py | 31 ++++++ .../_hooks/custom/request_utils.py | 2 + .../_hooks/custom/split_pdf_hook.py | 99 ++++++++++++++----- .../models/shared/partition_parameters.py | 2 + 4 files changed, 110 insertions(+), 24 deletions(-) diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index b04f5d8b..776dcb0d 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -14,6 +14,7 @@ PARTITION_FORM_FILES_KEY = "files" PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page" PARTITION_FORM_PAGE_RANGE_KEY = "split_pdf_page_range[]" +PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED = "split_pdf_allow_failed" PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number" PARTITION_FORM_CONCURRENCY_LEVEL_KEY = 
"split_pdf_concurrency_level" @@ -89,6 +90,36 @@ def get_starting_page_number(form_data: FormData, key: str, fallback_value: int) return starting_page_number +def get_split_pdf_allow_failed_param( + form_data: FormData, key: str, fallback_value: bool, +) -> bool: + """Retrieves the value for allow failed that should be used for splitting pdf. + + In case given the number is not a "false" or "true" literal, it will use the + default value. + + Args: + form_data: The form data containing the desired concurrency level. + key: The key to look for in the form data. + fallback_value: The default value to use in case of an error. + + Returns: + The concurrency level after validation. + """ + allow_failed = form_data.get(key) + + if allow_failed is None: + return fallback_value + + if allow_failed.lower() not in ["true", "false"]: + logger.warning( + "'%s' is not a valid boolean. Using default value '%s'.", + key, + fallback_value, + ) + return fallback_value + + return allow_failed.lower() == "true" def get_split_pdf_concurrency_level_param( form_data: FormData, key: str, fallback_value: int, max_allowed: int diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py index 0dde007d..678a50b4 100644 --- a/src/unstructured_client/_hooks/custom/request_utils.py +++ b/src/unstructured_client/_hooks/custom/request_utils.py @@ -16,6 +16,7 @@ from unstructured_client._hooks.custom.form_utils import ( PARTITION_FORM_FILES_KEY, PARTITION_FORM_SPLIT_PDF_PAGE_KEY, + PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED, PARTITION_FORM_PAGE_RANGE_KEY, PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, FormData, @@ -145,6 +146,7 @@ def prepare_request_payload(form_data: FormData) -> FormData: """ payload = copy.deepcopy(form_data) payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None) + payload.pop(PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED, None) payload.pop(PARTITION_FORM_FILES_KEY, None) payload.pop(PARTITION_FORM_PAGE_RANGE_KEY, None) 
payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 1d3a5714..0653624b 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -20,6 +20,7 @@ PARTITION_FORM_FILES_KEY, PARTITION_FORM_PAGE_RANGE_KEY, PARTITION_FORM_SPLIT_PDF_PAGE_KEY, + PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED, PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, ) from unstructured_client._hooks.types import ( @@ -35,17 +36,41 @@ logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME) - DEFAULT_STARTING_PAGE_NUMBER = 1 +DEFAULT_ALLOW_FAILED = False DEFAULT_CONCURRENCY_LEVEL = 8 MAX_CONCURRENCY_LEVEL = 15 MIN_PAGES_PER_SPLIT = 2 MAX_PAGES_PER_SPLIT = 20 +async def _order_keeper(index: int, coro: Coroutine) -> Tuple[int, requests.Response]: + response = await coro + return index, response -async def run_tasks(tasks): - return await asyncio.gather(*tasks) +async def run_tasks(coroutines, allow_failed: bool = False) -> list[tuple[int, requests.Response]]: + if allow_failed: + responses = await asyncio.gather(*coroutines, return_exceptions=False) + return list(enumerate(responses, 1)) + else: + # TODO: replace with asyncio.TaskGroup for python >3.11 + tasks = [asyncio.create_task(_order_keeper(index, coro)) for index, coro in enumerate(coroutines, 1)] + results = [] + remaining_tasks = {i: task for i, task in enumerate(tasks, 1)} + for future in asyncio.as_completed(tasks): + index, response = await future + if response.status_code != 200: + # cancel all remaining tasks + for remaining_task in remaining_tasks.values(): + remaining_task.cancel() + results.append((index, response)) + break + else: + results.append((index, response)) + # remove task from remaining_tasks that should be cancelled in case of failure + del remaining_tasks[index] + # return results in the original order + return 
sorted(results, key=lambda x: x[0]) def get_optimal_split_size(num_pages: int, concurrency_level: int) -> int: @@ -78,9 +103,11 @@ def __init__(self) -> None: str, list[Coroutine[Any, Any, requests.Response]] ] = {} self.api_successful_responses: dict[str, list[requests.Response]] = {} + self.api_failed_responses: dict[str, list[requests.Response]] = {} + self.allow_failed: bool = DEFAULT_ALLOW_FAILED def sdk_init( - self, base_url: str, client: requests.Session + self, base_url: str, client: requests.Session ) -> Tuple[str, requests.Session]: """Initializes Split PDF Hook. @@ -95,7 +122,7 @@ def sdk_init( return base_url, client def before_request( - self, hook_ctx: BeforeRequestContext, request: requests.PreparedRequest + self, hook_ctx: BeforeRequestContext, request: requests.PreparedRequest ) -> Union[requests.PreparedRequest, Exception]: """If `splitPdfPage` is set to `true` in the request, the PDF file is split into separate pages. Each page is sent as a separate request in parallel. 
The last @@ -132,9 +159,9 @@ def before_request( logger.info("Preparing to split document for partition.") file = form_data.get(PARTITION_FORM_FILES_KEY) if ( - file is None - or not isinstance(file, shared.Files) - or not pdf_utils.is_pdf(file) + file is None + or not isinstance(file, shared.Files) + or not pdf_utils.is_pdf(file) ): logger.info("Partitioning without split.") return request @@ -144,9 +171,16 @@ def before_request( key=PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, fallback_value=DEFAULT_STARTING_PAGE_NUMBER, ) - if starting_page_number > 1: logger.info("Starting page number set to %d", starting_page_number) + logger.info("Starting page number set to %d", starting_page_number) + + self.allow_failed = form_utils.get_split_pdf_allow_failed_param( + form_data, + key=PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED, + fallback_value=DEFAULT_ALLOW_FAILED, + ) + logger.info("Allow failed set to %d", self.allow_failed) concurrency_level = form_utils.get_split_pdf_concurrency_level_param( form_data, @@ -253,7 +287,7 @@ async def call_api_partial(page): return last_page_prepared_request def _await_elements( - self, operation_id: str, response: requests.Response + self, operation_id: str, response: requests.Response ) -> Optional[list]: """ Waits for the partition requests to complete and returns the flattened @@ -272,34 +306,42 @@ def _await_elements( return None ioloop = asyncio.get_event_loop() - task_responses: list[requests.Response] = ioloop.run_until_complete( - run_tasks(tasks) + task_responses: list[tuple[int, requests.Response]] = ioloop.run_until_complete( + run_tasks(tasks, allow_failed=self.allow_failed) ) if task_responses is None: return None successful_responses = [] + failed_responses = [] elements = [] - for response_number, res in enumerate(task_responses, 1): + for response_number, res in task_responses: request_utils.log_after_split_response(res.status_code, response_number) if res.status_code == 200: successful_responses.append(res) 
elements.append(res.json()) + else: + failed_responses.append(res) - last_response_number = len(task_responses) + 1 - request_utils.log_after_split_response( - response.status_code, last_response_number - ) - if response.status_code == 200: - elements.append(response.json()) + if self.allow_failed or not failed_responses: + last_response_number = len(task_responses) + 1 + request_utils.log_after_split_response( + response.status_code, last_response_number + ) + if response.status_code == 200: + elements.append(response.json()) + successful_responses.append(response) + else: + failed_responses.append(response) self.api_successful_responses[operation_id] = successful_responses + self.api_failed_responses[operation_id] = failed_responses flattened_elements = [element for sublist in elements for element in sublist] return flattened_elements def after_success( - self, hook_ctx: AfterSuccessContext, response: requests.Response + self, hook_ctx: AfterSuccessContext, response: requests.Response ) -> Union[requests.Response, Exception]: """Executes after a successful API request. Awaits all parallel requests and combines the responses into a single response object. 
@@ -320,6 +362,10 @@ def after_success( # we need to pass response, which contains last page, to `_await_elements` method elements = self._await_elements(operation_id, response) + # if fails are disallowed, return the first failed response + if not self.allow_failed and self.api_failed_responses.get(operation_id): + return self.api_failed_responses[operation_id][0] + if elements is None: return response @@ -328,10 +374,10 @@ def after_success( return updated_response def after_error( - self, - hook_ctx: AfterErrorContext, - response: Optional[requests.Response], - error: Optional[Exception], + self, + hook_ctx: AfterErrorContext, + response: Optional[requests.Response], + error: Optional[Exception], ) -> Union[Tuple[Optional[requests.Response], Optional[Exception]], Exception]: """Executes after an unsuccessful API request. Awaits all parallel requests, if at least one request was successful, combines the responses into a single @@ -350,6 +396,11 @@ def after_error( If requests were run in parallel, and at least one was successful, a combined response object; otherwise, the original response and exception. """ + + # if fails are disallowed - return response and error objects immediately + if not self.allow_failed: + return (response, error) + operation_id = hook_ctx.operation_id # We know that this request failed so we pass a failed or empty response to `_await_elements` method # where it checks if at least on of the other requests succeeded diff --git a/src/unstructured_client/models/shared/partition_parameters.py b/src/unstructured_client/models/shared/partition_parameters.py index 7f2fe724..88fae006 100644 --- a/src/unstructured_client/models/shared/partition_parameters.py +++ b/src/unstructured_client/models/shared/partition_parameters.py @@ -87,6 +87,8 @@ class PartitionParameters: r"""This parameter determines if the PDF file should be split on the client side. 
It's an internal parameter for the Python client and is not sent to the backend.""" split_pdf_page_range: Optional[List[int]] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'split_pdf_page_range' }}) r"""When `split_pdf_page is set to `True`, this parameter selects a subset of the pdf to send to the API. The parameter is a list of 2 integers within the range [1, length_of_pdf]. A ValueError is thrown if the given range is invalid. It's an internal parameter for the Python client and is not sent to the backend.""" +    split_pdf_allow_failed: Optional[bool] = dataclasses.field(default=False, metadata={'multipart_form': { 'field_name': 'split_pdf_allow_failed' }}) +    r"""When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request sent to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resulting list of Elements will miss the data from errored pages.""" starting_page_number: Optional[int] = dataclasses.field(default=None, metadata={'multipart_form': { 'field_name': 'starting_page_number' }}) r"""When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. 
Introduced in 1.0.27.""" strategy: Optional[Strategy] = dataclasses.field(default=Strategy.AUTO, metadata={'multipart_form': { 'field_name': 'strategy' }}) From aeaba7ba2bfe9269d9664960ef8ad7c13f423bc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Wed, 24 Jul 2024 18:01:00 +0200 Subject: [PATCH 3/9] test: added basic test for new param --- .../integration/test_decorators.py | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py index 16d5cdf8..0baeaccc 100644 --- a/_test_unstructured_client/integration/test_decorators.py +++ b/_test_unstructured_client/integration/test_decorators.py @@ -180,3 +180,73 @@ def test_integration_split_pdf_with_page_range( assert min(page_numbers) == min_page_number, f"Result should start at page {min_page_number}" assert max(page_numbers) == max_page_number, f"Result should end at page {max_page_number}" + + +@pytest.mark.parametrize("concurrency_level", [3]) +@pytest.mark.parametrize( + ("filename", "expected_ok", "strategy"), + [ + # ("_sample_docs/list-item-example-1.pdf", True, "fast"), # 1 page + # ("_sample_docs/layout-parser-paper-fast.pdf", True, "fast"), # 2 pages + # NOTE(mike): using "fast" strategy fails on this file for unknown reasons + ("_sample_docs/layout-parser-paper.pdf", True, shared.Strategy.HI_RES), # 16 pages + ], +) +def test_integration_split_pdf_strict_mode( + concurrency_level: int, filename: str, expected_ok: bool, strategy: shared.Strategy, caplog +): + """Test strict mode (allow failed = False) + + """ + try: + response = requests.get("http://localhost:8000/general/docs") + assert response.status_code == 200, "The unstructured-api is not running on localhost:8000" + except requests.exceptions.ConnectionError: + assert False, "The unstructured-api is not running on localhost:8000" + + client = UnstructuredClient(api_key_auth=FAKE_KEY, 
server_url="localhost:8000") + + with open(filename, "rb") as f: + files = shared.Files( + content=f.read(), + file_name=filename, + ) + + if not expected_ok: + # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error + files.file_name += ".pdf" + + req = shared.PartitionParameters( + files=files, + strategy=strategy, + languages=["eng"], + split_pdf_page=True, + split_pdf_concurrency_level=concurrency_level, + split_pdf_allow_failed=True, + ) + + try: + resp_split = client.general.partition(req) + except (HTTPValidationError, AttributeError) as exc: + if not expected_ok: + assert "The file does not appear to be a valid PDF." in caplog.text + assert "File does not appear to be a valid PDF" in str(exc) + return + else: + assert exc is None + + req.split_pdf_page = False + resp_single = client.general.partition(req) + + assert len(resp_split.elements) == len(resp_single.elements) + assert resp_split.content_type == resp_single.content_type + assert resp_split.status_code == resp_single.status_code + + diff = DeepDiff( + t1=resp_split.elements, + t2=resp_single.elements, + exclude_regex_paths=[ + r"root\[\d+\]\['metadata'\]\['parent_id'\]", + ], + ) + assert len(diff) == 0 \ No newline at end of file From 0fe8dc0e698ef05c4e042ff1e89ff51bea2481db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Thu, 25 Jul 2024 15:43:52 +0200 Subject: [PATCH 4/9] test: updated and added test cases for strict mode --- .../integration/test_decorators.py | 23 +-- .../unit/test_split_pdf_hook.py | 140 +++++++++++++++--- 2 files changed, 131 insertions(+), 32 deletions(-) diff --git a/_test_unstructured_client/integration/test_decorators.py b/_test_unstructured_client/integration/test_decorators.py index 0baeaccc..8d2343b8 100644 --- a/_test_unstructured_client/integration/test_decorators.py +++ b/_test_unstructured_client/integration/test_decorators.py @@ -182,22 +182,25 @@ def test_integration_split_pdf_with_page_range( 
assert max(page_numbers) == max_page_number, f"Result should end at page {max_page_number}" -@pytest.mark.parametrize("concurrency_level", [3]) +@pytest.mark.parametrize("concurrency_level", [2, 3]) +@pytest.mark.parametrize("allow_failed", [True, False]) @pytest.mark.parametrize( ("filename", "expected_ok", "strategy"), [ - # ("_sample_docs/list-item-example-1.pdf", True, "fast"), # 1 page - # ("_sample_docs/layout-parser-paper-fast.pdf", True, "fast"), # 2 pages - # NOTE(mike): using "fast" strategy fails on this file for unknown reasons + ("_sample_docs/list-item-example-1.pdf", True, "fast"), # 1 page + ("_sample_docs/layout-parser-paper-fast.pdf", True, "fast"), # 2 pages ("_sample_docs/layout-parser-paper.pdf", True, shared.Strategy.HI_RES), # 16 pages ], ) def test_integration_split_pdf_strict_mode( - concurrency_level: int, filename: str, expected_ok: bool, strategy: shared.Strategy, caplog + concurrency_level: int, + allow_failed: bool, + filename: str, + expected_ok: bool, + strategy: shared.Strategy, + caplog ): - """Test strict mode (allow failed = False) - - """ + """Test strict mode (allow failed = False) for split_pdf.""" try: response = requests.get("http://localhost:8000/general/docs") assert response.status_code == 200, "The unstructured-api is not running on localhost:8000" @@ -222,7 +225,7 @@ def test_integration_split_pdf_strict_mode( languages=["eng"], split_pdf_page=True, split_pdf_concurrency_level=concurrency_level, - split_pdf_allow_failed=True, + split_pdf_allow_failed=allow_failed, ) try: @@ -249,4 +252,4 @@ def test_integration_split_pdf_strict_mode( r"root\[\d+\]\['metadata'\]\['parent_id'\]", ], ) - assert len(diff) == 0 \ No newline at end of file + assert len(diff) == 0 diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index ee364eb9..d62b8b57 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ 
b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -1,10 +1,14 @@ +import asyncio import io import logging -from concurrent.futures import Future +from asyncio import Task +from collections import Counter +from typing import Coroutine import pytest import requests from requests_toolbelt import MultipartDecoder, MultipartEncoder + from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils from unstructured_client._hooks.custom.form_utils import ( PARTITION_FORM_CONCURRENCY_LEVEL_KEY, @@ -18,7 +22,7 @@ MAX_PAGES_PER_SPLIT, MIN_PAGES_PER_SPLIT, SplitPdfHook, - get_optimal_split_size, + get_optimal_split_size, run_tasks, ) from unstructured_client.models import shared @@ -224,7 +228,6 @@ def test_unit_parse_form_data(): b"--boundary--\r\n" ) - decoded_data = MultipartDecoder( test_form_data, "multipart/form-data; boundary=boundary", @@ -361,22 +364,22 @@ def test_get_optimal_split_size(num_pages, concurrency_level, expected_split_siz ({}, DEFAULT_CONCURRENCY_LEVEL), # no value ({"split_pdf_concurrency_level": 10}, 10), # valid number ( - # exceeds max value - {"split_pdf_concurrency_level": f"{MAX_CONCURRENCY_LEVEL+1}"}, - MAX_CONCURRENCY_LEVEL, + # exceeds max value + {"split_pdf_concurrency_level": f"{MAX_CONCURRENCY_LEVEL + 1}"}, + MAX_CONCURRENCY_LEVEL, ), ({"split_pdf_concurrency_level": -3}, DEFAULT_CONCURRENCY_LEVEL), # negative value ], ) def test_unit_get_split_pdf_concurrency_level_returns_valid_number(form_data, expected_result): assert ( - form_utils.get_split_pdf_concurrency_level_param( - form_data, - key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY, - fallback_value=DEFAULT_CONCURRENCY_LEVEL, - max_allowed=MAX_CONCURRENCY_LEVEL, - ) - == expected_result + form_utils.get_split_pdf_concurrency_level_param( + form_data, + key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY, + fallback_value=DEFAULT_CONCURRENCY_LEVEL, + max_allowed=MAX_CONCURRENCY_LEVEL, + ) + == expected_result ) @@ -404,16 +407,16 @@ def 
test_unit_get_starting_page_number(starting_page_number, expected_result): @pytest.mark.parametrize( "page_range, expected_result", [ - (["1", "14"], (1, 14)), # Valid range, start on boundary - (["4", "16"], (4, 16)), # Valid range, end on boundary - (None, (1, 20)), # Range not specified, defaults to full range + (["1", "14"], (1, 14)), # Valid range, start on boundary + (["4", "16"], (4, 16)), # Valid range, end on boundary + (None, (1, 20)), # Range not specified, defaults to full range (["2", "5"], (2, 5)), # Valid range within boundary - (["2", "100"], None), # End page too high - (["50", "100"], None), # Range too high - (["-50", "5"], None), # Start page too low - (["-50", "-2"], None), # Range too low - (["10", "2"], None), # Backwards range - (["foo", "foo"], None), # Parse error + (["2", "100"], None), # End page too high + (["50", "100"], None), # Range too high + (["-50", "5"], None), # Start page too low + (["-50", "-2"], None), # Range too low + (["10", "2"], None), # Backwards range + (["foo", "foo"], None), # Parse error ], ) def test_unit_get_page_range_returns_valid_range(page_range, expected_result): @@ -432,3 +435,96 @@ def test_unit_get_page_range_returns_valid_range(page_range, expected_result): return assert result == expected_result + + +async def _request_mock(fails: bool, content: str) -> requests.Response: + response = requests.Response() + response.status_code = 500 if fails else 200 + response._content = content.encode() + return response + + +@pytest.mark.parametrize( + ("allow_failed", "tasks", "expected_responses"), [ + pytest.param( + True, [ + _request_mock(fails=False, content="1"), + _request_mock(fails=False, content="2"), + _request_mock(fails=False, content="3"), + _request_mock(fails=False, content="4"), + ], + ["1", "2", "3", "4"], + id="no failures, fails allowed" + ), + pytest.param( + True, [ + _request_mock(fails=False, content="1"), + _request_mock(fails=True, content="2"), + _request_mock(fails=False, content="3"), 
_request_mock(fails=True, content="4"), + ], + ["1", "2", "3", "4"], + id="failures, fails allowed" + ), + pytest.param( + False, [ + _request_mock(fails=True, content="failure"), + _request_mock(fails=False, content="2"), + _request_mock(fails=True, content="failure"), + _request_mock(fails=False, content="4"), + ], + ["failure"], + id="failures, fails disallowed" + ), + pytest.param( + False, [ + _request_mock(fails=False, content="1"), + _request_mock(fails=False, content="2"), + _request_mock(fails=False, content="3"), + _request_mock(fails=False, content="4"), + ], + ["1", "2", "3", "4"], + id="no failures, fails disallowed" + ), + ] +) +@pytest.mark.asyncio +async def test_unit_disallow_failed_coroutines( + allow_failed: bool, + tasks: list[Task], + expected_responses: list[str], +): + """Test disallow failed coroutines method properly sets the flag to False.""" + responses = await run_tasks(tasks, allow_failed=allow_failed) + response_contents = [response[1].content.decode() for response in responses] + assert response_contents == expected_responses + + +async def _fetch_canceller_error(fails: bool, content: str, cancelled_counter: Counter): + try: + if not fails: + await asyncio.sleep(0.01) + print("Doesn't fail") + else: + print("Fails") + return await _request_mock(fails=fails, content=content) + except asyncio.CancelledError: + cancelled_counter.update(["cancelled"]) + print(cancelled_counter["cancelled"]) + print("Cancelled") + + +@pytest.mark.asyncio +async def test_remaining_tasks_cancelled_when_fails_disallowed(): + cancelled_counter = Counter() + tasks = [ + _fetch_canceller_error(fails=True, content="1", cancelled_counter=cancelled_counter), + *[_fetch_canceller_error(fails=False, content=f"{i}", cancelled_counter=cancelled_counter) + for i in range(2, 200)], + ] + + await run_tasks(tasks, allow_failed=False) + # give some time to actually cancel the tasks in background + await asyncio.sleep(1) + print("Cancelled amount: ", 
cancelled_counter["cancelled"]) + assert len(tasks) > cancelled_counter["cancelled"] > 0 From 95465d374714808de7d24ac3d4405fe115bc89ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Thu, 25 Jul 2024 15:44:29 +0200 Subject: [PATCH 5/9] chore: updated type hints --- src/unstructured_client/_hooks/custom/split_pdf_hook.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 0653624b..f3d4b030 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -5,6 +5,7 @@ import json import logging import math +from collections.abc import Awaitable from typing import Any, Coroutine, Optional, Tuple, Union import httpx @@ -44,11 +45,12 @@ MAX_PAGES_PER_SPLIT = 20 -async def _order_keeper(index: int, coro: Coroutine) -> Tuple[int, requests.Response]: +async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, requests.Response]: response = await coro return index, response -async def run_tasks(coroutines, allow_failed: bool = False) -> list[tuple[int, requests.Response]]: + +async def run_tasks(coroutines: list[Awaitable], allow_failed: bool = False) -> list[tuple[int, requests.Response]]: if allow_failed: responses = await asyncio.gather(*coroutines, return_exceptions=False) return list(enumerate(responses, 1)) From 73404be8e0152a9eb2bf9aae07c3712e3cfc76a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Thu, 25 Jul 2024 15:44:45 +0200 Subject: [PATCH 6/9] chore: added pytest-asyncio and pytest-mock for installation --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f8e23e1c..4744534e 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ DOCKER_IMAGE ?= downloads.unstructured.io/unstructured-io/unstructured-api:lates .PHONY: install-test install-test: - pip install 
pytest requests_mock pypdf deepdiff requests-toolbelt + pip install pytest pytest-asyncio pytest-mock requests_mock pypdf deepdiff requests-toolbelt .PHONY: install-dev install-dev: From 44b9b202693611e810b14e31835f642ff9ffa8dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Fri, 26 Jul 2024 14:04:46 +0200 Subject: [PATCH 7/9] chore: renamed constant name --- src/unstructured_client/_hooks/custom/form_utils.py | 2 +- src/unstructured_client/_hooks/custom/request_utils.py | 4 ++-- src/unstructured_client/_hooks/custom/split_pdf_hook.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index 776dcb0d..bf97faa3 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -14,7 +14,7 @@ PARTITION_FORM_FILES_KEY = "files" PARTITION_FORM_SPLIT_PDF_PAGE_KEY = "split_pdf_page" PARTITION_FORM_PAGE_RANGE_KEY = "split_pdf_page_range[]" -PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED = "split_pdf_allow_failed" +PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY = "split_pdf_allow_failed" PARTITION_FORM_STARTING_PAGE_NUMBER_KEY = "starting_page_number" PARTITION_FORM_CONCURRENCY_LEVEL_KEY = "split_pdf_concurrency_level" diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py index 678a50b4..1512e80b 100644 --- a/src/unstructured_client/_hooks/custom/request_utils.py +++ b/src/unstructured_client/_hooks/custom/request_utils.py @@ -16,7 +16,7 @@ from unstructured_client._hooks.custom.form_utils import ( PARTITION_FORM_FILES_KEY, PARTITION_FORM_SPLIT_PDF_PAGE_KEY, - PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED, + PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY, PARTITION_FORM_PAGE_RANGE_KEY, PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, FormData, @@ -146,7 +146,7 @@ def prepare_request_payload(form_data: FormData) -> FormData: """ payload 
= copy.deepcopy(form_data) payload.pop(PARTITION_FORM_SPLIT_PDF_PAGE_KEY, None) - payload.pop(PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED, None) + payload.pop(PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY, None) payload.pop(PARTITION_FORM_FILES_KEY, None) payload.pop(PARTITION_FORM_PAGE_RANGE_KEY, None) payload.pop(PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, None) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index f3d4b030..2aa4015d 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -21,7 +21,7 @@ PARTITION_FORM_FILES_KEY, PARTITION_FORM_PAGE_RANGE_KEY, PARTITION_FORM_SPLIT_PDF_PAGE_KEY, - PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED, + PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY, PARTITION_FORM_STARTING_PAGE_NUMBER_KEY, ) from unstructured_client._hooks.types import ( @@ -179,7 +179,7 @@ def before_request( self.allow_failed = form_utils.get_split_pdf_allow_failed_param( form_data, - key=PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED, + key=PARTITION_FORM_SPLIT_PDF_ALLOW_FAILED_KEY, fallback_value=DEFAULT_ALLOW_FAILED, ) logger.info("Allow failed set to %d", self.allow_failed) From d3f2014638e0413af8b649779a47fa8bdeb74a64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Fri, 26 Jul 2024 14:32:56 +0200 Subject: [PATCH 8/9] chore: fixed pylint complaints --- .../_hooks/custom/split_pdf_hook.py | 36 +++++++++---------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 2aa4015d..2d8fab8f 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -54,25 +54,23 @@ async def run_tasks(coroutines: list[Awaitable], allow_failed: bool = False) -> if allow_failed: responses = await asyncio.gather(*coroutines, 
return_exceptions=False) return list(enumerate(responses, 1)) - else: - # TODO: replace with asyncio.TaskGroup for python >3.11 - tasks = [asyncio.create_task(_order_keeper(index, coro)) for index, coro in enumerate(coroutines, 1)] - results = [] - remaining_tasks = {i: task for i, task in enumerate(tasks, 1)} - for future in asyncio.as_completed(tasks): - index, response = await future - if response.status_code != 200: - # cancel all remaining tasks - for remaining_task in remaining_tasks.values(): - remaining_task.cancel() - results.append((index, response)) - break - else: - results.append((index, response)) - # remove task from remaining_tasks that should be cancelled in case of failure - del remaining_tasks[index] - # return results in the original order - return sorted(results, key=lambda x: x[0]) + # TODO: replace with asyncio.TaskGroup for python >3.11 # pylint: disable=fixme + tasks = [asyncio.create_task(_order_keeper(index, coro)) for index, coro in enumerate(coroutines, 1)] + results = [] + remaining_tasks = dict(enumerate(tasks, 1)) + for future in asyncio.as_completed(tasks): + index, response = await future + if response.status_code != 200: + # cancel all remaining tasks + for remaining_task in remaining_tasks.values(): + remaining_task.cancel() + results.append((index, response)) + break + results.append((index, response)) + # remove task from remaining_tasks that should be cancelled in case of failure + del remaining_tasks[index] + # return results in the original order + return sorted(results, key=lambda x: x[0]) def get_optimal_split_size(num_pages: int, concurrency_level: int) -> int: From b729aa8cacce36f2b81b39beb2b8b6e73283fee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Tue, 30 Jul 2024 13:33:25 +0200 Subject: [PATCH 9/9] chore: updated overlay_client.yaml with the new parameter --- overlay_client.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/overlay_client.yaml b/overlay_client.yaml index 
 67c35cd5..f36cdc73 100644 --- a/overlay_client.yaml +++ b/overlay_client.yaml @@ -33,6 +33,15 @@ actions: "type": "integer", "default": 5, } + - target: $["components"]["schemas"]["partition_parameters"]["properties"] + update: + "split_pdf_allow_failed": + { + "title": "Split Pdf Allow Failed", + "description": "When `split_pdf_page` is set to `True`, this parameter defines the behavior when some of the parallel requests fail. By default `split_pdf_allow_failed` is set to `False` and any failed request sent to the API will make the whole process break and raise an Exception. If `split_pdf_allow_failed` is set to `True`, the errors encountered while sending parallel requests will not break the processing - the resulting list of Elements will miss the data from errored pages.", + "type": "boolean", + "default": false, + } - target: $["components"]["schemas"]["partition_parameters"]["properties"][*].anyOf[0] description: Add a null default to all optional parameters. Prevents the sdk from sending a default string when param is not specified. update: