From 508574b925af9a2c98689ed52cc66822665cadaf Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 20 Aug 2024 18:09:59 -0400 Subject: [PATCH 1/8] fix: Address mypy typing errors in v2 SDK Update all typing that used requests to take httpx classes instead. Logic changes should be minimal, this is mostly to change type hints where a `httpx.Response` is used instead of a `requests.Response`, etc. Add mypy to `make lint` so that we can catch these errors before merging. The publish job runs a full linter suite, and these changes made it to main but broke the publish job. --- Makefile | 3 +- .../_hooks/custom/form_utils.py | 17 ++-- .../_hooks/custom/pdf_utils.py | 5 +- .../_hooks/custom/request_utils.py | 37 ++++----- .../_hooks/custom/split_pdf_hook.py | 81 ++++++++++--------- 5 files changed, 71 insertions(+), 72 deletions(-) diff --git a/Makefile b/Makefile index 4b039cfd..be811cc7 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ install-test: .PHONY: install-dev install-dev: pip install jupyter - pip install pylint + pip install pylint mypy ## install: installs all test, dev, and experimental requirements .PHONY: install @@ -48,6 +48,7 @@ test-integration-docker: .PHONY: lint lint: pylint --rcfile=pylintrc src + mypy src ############# # Speakeasy # diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index 309b5928..d13f3383 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -3,7 +3,7 @@ import logging from typing import Union -from requests_toolbelt.multipart.decoder import MultipartDecoder +from requests_toolbelt.multipart.decoder import MultipartDecoder # type: ignore from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME from unstructured_client.models import shared @@ -35,7 +35,7 @@ def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int, try: _page_range = 
form_data.get(key) - if _page_range is not None: + if _page_range is not None and isinstance(_page_range, list): page_range = (int(_page_range[0]), int(_page_range[1])) else: page_range = (1, max_pages) @@ -108,7 +108,7 @@ def get_split_pdf_allow_failed_param( """ allow_failed = form_data.get(key) - if allow_failed is None: + if allow_failed is None or not isinstance(allow_failed, str): return fallback_value if allow_failed.lower() not in ["true", "false"]: @@ -121,6 +121,7 @@ def get_split_pdf_allow_failed_param( return allow_failed.lower() == "true" + def get_split_pdf_concurrency_level_param( form_data: FormData, key: str, fallback_value: int, max_allowed: int ) -> int: @@ -140,7 +141,7 @@ def get_split_pdf_concurrency_level_param( """ concurrency_level_str = form_data.get(key) - if concurrency_level_str is None: + if concurrency_level_str is None or not isinstance(concurrency_level_str, str): return fallback_value try: @@ -218,10 +219,12 @@ def parse_form_data(decoded_data: MultipartDecoder) -> FormData: else: content = part.content.decode() if name in form_data: - if isinstance(form_data[name], list): - form_data[name].append(content) + form_data_value = form_data[name] + if isinstance(form_data_value, list): + form_data_value.append(content) else: - form_data[name] = [form_data[name], content] + new_list = [form_data_value, content] + form_data[name] = new_list else: form_data[name] = content diff --git a/src/unstructured_client/_hooks/custom/pdf_utils.py b/src/unstructured_client/_hooks/custom/pdf_utils.py index 27dc5e03..589e367b 100644 --- a/src/unstructured_client/_hooks/custom/pdf_utils.py +++ b/src/unstructured_client/_hooks/custom/pdf_utils.py @@ -1,6 +1,6 @@ import io import logging -from typing import Generator, Tuple, Optional +from typing import cast, Generator, Tuple, Optional from pypdf import PdfReader, PdfWriter from pypdf.errors import PdfReadError @@ -70,7 +70,8 @@ def is_pdf(file: shared.Files) -> bool: return False try: - 
PdfReader(io.BytesIO(file.content), strict=True) + content = cast(bytes, file.content) + PdfReader(io.BytesIO(content), strict=True) except (PdfReadError, UnicodeDecodeError) as exc: logger.error(exc) logger.warning("The file does not appear to be a valid PDF.") diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py index 1512e80b..c805cdc2 100644 --- a/src/unstructured_client/_hooks/custom/request_utils.py +++ b/src/unstructured_client/_hooks/custom/request_utils.py @@ -10,7 +10,7 @@ import httpx import requests from requests.structures import CaseInsensitiveDict -from requests_toolbelt.multipart.encoder import MultipartEncoder +from requests_toolbelt.multipart.encoder import MultipartEncoder # type: ignore from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME from unstructured_client._hooks.custom.form_utils import ( @@ -51,18 +51,6 @@ def create_request_body( return body -def create_httpx_request( - original_request: requests.Request, body: MultipartEncoder -) -> httpx.Request: - headers = prepare_request_headers(original_request.headers) - return httpx.Request( - method="POST", - url=original_request.url or "", - content=body.to_string(), - headers={**headers, "Content-Type": body.content_type}, - ) - - def create_request( request: requests.PreparedRequest, body: MultipartEncoder, @@ -79,21 +67,24 @@ def create_request( async def call_api_async( client: httpx.AsyncClient, page: Tuple[io.BytesIO, int], - original_request: requests.Request, + original_request: httpx.Request, form_data: FormData, filename: str, limiter: asyncio.Semaphore, -) -> tuple[int, dict]: +) -> httpx.Response: page_content, page_number = page body = create_request_body(form_data, page_content, filename, page_number) - new_request = create_httpx_request(original_request, body) + + new_request = httpx.Request( + method="POST", + url=original_request.url or "", + content=body.to_string(), + 
headers={**original_request.headers, "Content-Type": body.content_type}, + ) + async with limiter: - try: - response = await client.send(new_request) - return response.status_code, response.json() - except Exception: - logger.error("Failed to send request for page %d", page_number) - return 500, {} + response = await client.send(new_request) + return response def call_api( @@ -157,7 +148,7 @@ def prepare_request_payload(form_data: FormData) -> FormData: return payload -def create_response(response: requests.Response, elements: list) -> requests.Response: +def create_response(response: httpx.Response, elements: list) -> httpx.Response: """ Creates a modified response object with updated content. diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 584eeb5f..60527c7e 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -2,17 +2,15 @@ import asyncio import io -import json import logging import math from collections.abc import Awaitable -from typing import Any, Coroutine, Optional, Tuple, Union +from typing import Any, Coroutine, Optional, Tuple, Union, cast import httpx -import nest_asyncio -import requests +import nest_asyncio # type: ignore from pypdf import PdfReader -from requests_toolbelt.multipart.decoder import MultipartDecoder +from requests_toolbelt.multipart.decoder import MultipartDecoder # type: ignore from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME @@ -33,6 +31,7 @@ BeforeRequestHook, SDKInitHook, ) +from unstructured_client.httpclient import HttpClient from unstructured_client.models import shared logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME) @@ -45,12 +44,12 @@ MAX_PAGES_PER_SPLIT = 20 -async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, 
requests.Response]: +async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]: response = await coro return index, response -async def run_tasks(coroutines: list[Awaitable], allow_failed: bool = False) -> list[tuple[int, requests.Response]]: +async def run_tasks(coroutines: list[Coroutine], allow_failed: bool = False) -> list[tuple[int, httpx.Response]]: if allow_failed: responses = await asyncio.gather(*coroutines, return_exceptions=False) return list(enumerate(responses, 1)) @@ -103,25 +102,25 @@ class SplitPdfHook(SDKInitHook, BeforeRequestHook, AfterSuccessHook, AfterErrorH """ def __init__(self) -> None: - self.client: Optional[requests.Session] = None + self.client: Optional[HttpClient] = None self.coroutines_to_execute: dict[ - str, list[Coroutine[Any, Any, requests.Response]] + str, list[Coroutine[Any, Any, httpx.Response]] ] = {} - self.api_successful_responses: dict[str, list[requests.Response]] = {} - self.api_failed_responses: dict[str, list[requests.Response]] = {} + self.api_successful_responses: dict[str, list[httpx.Response]] = {} + self.api_failed_responses: dict[str, list[httpx.Response]] = {} self.allow_failed: bool = DEFAULT_ALLOW_FAILED def sdk_init( - self, base_url: str, client: requests.Session - ) -> Tuple[str, requests.Session]: + self, base_url: str, client: HttpClient + ) -> Tuple[str, HttpClient]: """Initializes Split PDF Hook. Args: base_url (str): URL of the API. - client (requests.Session): HTTP Client. + client (HttpClient): HTTP Client. Returns: - Tuple[str, requests.Session]: The initialized SDK options. + Tuple[str, httpx.Session]: The initialized SDK options. """ self.client = client return base_url, client @@ -139,10 +138,10 @@ def before_request( Args: hook_ctx (BeforeRequestContext): The hook context containing information about the operation. - request (requests.PreparedRequest): The request object. + request (httpx.PreparedRequest): The request object. 
Returns: - Union[requests.PreparedRequest, Exception]: If `splitPdfPage` is set to `true`, + Union[httpx.PreparedRequest, Exception]: If `splitPdfPage` is set to `true`, the last page request; otherwise, the original request. """ if self.client is None: @@ -160,11 +159,11 @@ def before_request( content_type = request.headers.get("Content-Type") request_content = request.read() - body = request_content - if not isinstance(body, bytes) or content_type is None: + request_body = request_content + if not isinstance(request_body, bytes) or content_type is None: return request - decoded_body = MultipartDecoder(body, content_type) + decoded_body = MultipartDecoder(request_body, content_type) form_data = form_utils.parse_form_data(decoded_body) split_pdf_page = form_data.get(PARTITION_FORM_SPLIT_PDF_PAGE_KEY) if split_pdf_page is None or split_pdf_page == "false": @@ -206,7 +205,8 @@ def before_request( logger.info("Concurrency level set to %d", concurrency_level) limiter = asyncio.Semaphore(concurrency_level) - pdf = PdfReader(io.BytesIO(file.content)) + content = cast(bytes, file.content) + pdf = PdfReader(io.BytesIO(content)) page_range_start, page_range_end = form_utils.get_page_range( form_data, @@ -252,7 +252,7 @@ def before_request( async def call_api_partial(page): async with httpx.AsyncClient() as client: - status_code, json_response = await request_utils.call_api_async( + response = await request_utils.call_api_async( client=client, original_request=request, form_data=form_data, @@ -261,14 +261,6 @@ async def call_api_partial(page): limiter=limiter, ) - # convert httpx response to requests.Response to preserve - # compatibility with the synchronous SDK generated by speakeasy - response = requests.Response() - response.status_code = status_code - response._content = json.dumps( # pylint: disable=W0212 - json_response - ).encode() - response.headers["Content-Type"] = "application/json" return response self.coroutines_to_execute[operation_id] = [] @@ -297,11 +289,19 
@@ async def call_api_partial(page): body = request_utils.create_request_body( form_data, last_page_content, file.file_name, last_page_number ) - last_page_request = request_utils.create_httpx_request(request, body) + + original_request = request + last_page_request = httpx.Request( + method="POST", + url=original_request.url or "", + content=body.to_string(), + headers={**original_request.headers, "Content-Type": body.content_type}, + ) + return last_page_request def _await_elements( - self, operation_id: str, response: requests.Response + self, operation_id: str, response: httpx.Response ) -> Optional[list]: """ Waits for the partition requests to complete and returns the flattened @@ -309,7 +309,7 @@ def _await_elements( Args: operation_id (str): The ID of the operation. - response (requests.Response): The response object. + response (httpx.Response): The response object. Returns: Optional[list]: The flattened elements if the partition requests are @@ -320,7 +320,8 @@ def _await_elements( return None ioloop = asyncio.get_event_loop() - task_responses: list[tuple[int, requests.Response]] = ioloop.run_until_complete( + # TODO New response type + task_responses: list[tuple[int, httpx.Response]] = ioloop.run_until_complete( run_tasks(tasks, allow_failed=self.allow_failed) ) @@ -363,11 +364,11 @@ def after_success( Args: hook_ctx (AfterSuccessContext): The context object containing information about the hook execution. - response (requests.Response): The response object returned from the API + response (httpx.Response): The response object returned from the API request. Returns: - Union[requests.Response, Exception]: If requests were run in parallel, a + Union[httpx.Response, Exception]: If requests were run in parallel, a combined response object; otherwise, the original response. Can return exception if it ocurred during the execution. 
""" @@ -385,6 +386,8 @@ def after_success( updated_response = request_utils.create_response(response, elements) self._clear_operation(operation_id) + + # TODO return updated_response def after_error( @@ -401,12 +404,12 @@ def after_error( Args: hook_ctx (AfterErrorContext): The AfterErrorContext object containing information about the hook context. - response (Optional[requests.Response]): The Response object representing + response (Optional[httpx.Response]): The Response object representing the response received before the exception occurred. error (Optional[Exception]): The exception object that was thrown. Returns: - Union[Tuple[Optional[requests.Response], Optional[Exception]], Exception]: + Union[Tuple[Optional[httpx.Response], Optional[Exception]], Exception]: If requests were run in parallel, and at least one was successful, a combined response object; otherwise, the original response and exception. """ @@ -418,7 +421,7 @@ def after_error( operation_id = hook_ctx.operation_id # We know that this request failed so we pass a failed or empty response to `_await_elements` method # where it checks if at least on of the other requests succeeded - elements = self._await_elements(operation_id, response or requests.Response()) + elements = self._await_elements(operation_id, response or httpx.Response(status_code=200)) successful_responses = self.api_successful_responses.get(operation_id) if elements is None or successful_responses is None: From 0d92fde2507bea87138487e8fd87500804537d4d Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 20 Aug 2024 18:15:00 -0400 Subject: [PATCH 2/8] Remove the custom code patch CI job --- .github/workflows/speakeasy_sdk_generation.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/speakeasy_sdk_generation.yml b/.github/workflows/speakeasy_sdk_generation.yml index 40aff40e..5803f1d1 100644 --- a/.github/workflows/speakeasy_sdk_generation.yml +++ b/.github/workflows/speakeasy_sdk_generation.yml @@ -24,10 
+24,3 @@ jobs: github_access_token: ${{ secrets.GITHUB_TOKEN }} pypi_token: ${{ secrets.PYPI_TOKEN }} speakeasy_api_key: ${{ secrets.SPEAKEASY_API_KEY }} - patch-custom-code: - runs-on: ubuntu-latest - needs: [generate] - steps: - - name: Patch in custom code after regenerating - run: make patch-custom-code - From 0e98795bdee627f0d23838897ff1b0b123b3b5e8 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 20 Aug 2024 19:18:54 -0400 Subject: [PATCH 3/8] Remove requests import from all src files --- .../_hooks/custom/request_utils.py | 43 ++----------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py index c805cdc2..c6c78f05 100644 --- a/src/unstructured_client/_hooks/custom/request_utils.py +++ b/src/unstructured_client/_hooks/custom/request_utils.py @@ -5,11 +5,9 @@ import io import json import logging -from typing import Optional, Tuple, Any +from typing import Tuple, Any import httpx -import requests -from requests.structures import CaseInsensitiveDict from requests_toolbelt.multipart.encoder import MultipartEncoder # type: ignore from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME @@ -51,19 +49,6 @@ def create_request_body( return body -def create_request( - request: requests.PreparedRequest, - body: MultipartEncoder, -) -> requests.Request: - headers = prepare_request_headers(request.headers) - return requests.Request( - method="POST", - url=request.url or "", - data=body, - headers={**headers, "Content-Type": body.content_type}, - ) - - async def call_api_async( client: httpx.AsyncClient, page: Tuple[io.BytesIO, int], @@ -87,31 +72,9 @@ async def call_api_async( return response -def call_api( - client: Optional[requests.Session], - page: Tuple[io.BytesIO, int], - request: requests.PreparedRequest, - form_data: FormData, - filename: str, -) -> requests.Response: - if client is None: - raise 
RuntimeError("HTTP client not accessible!") - page_content, page_number = page - - body = create_request_body(form_data, page_content, filename, page_number) - new_request = create_request(request, body) - prepared_request = client.prepare_request(new_request) - - try: - return client.send(prepared_request) - except Exception: - logger.error("Failed to send request for page %d", page_number) - return requests.Response() - - def prepare_request_headers( - headers: CaseInsensitiveDict[str], -) -> CaseInsensitiveDict[str]: + headers: dict[str, str], +) -> dict[str, str]: """Prepare the request headers by removing the 'Content-Type' and 'Content-Length' headers. Args: From c9f00ce7503fd95c82ea43b1cdb310daff1dffad Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 20 Aug 2024 19:25:52 -0400 Subject: [PATCH 4/8] Fix uvloop type hints not available in CI --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index be811cc7..5ac17f68 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ install-test: .PHONY: install-dev install-dev: - pip install jupyter + pip install jupyter uvloop pip install pylint mypy ## install: installs all test, dev, and experimental requirements From 6e41c308cfbb8bf3219456881a022e2d00e09032 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 20 Aug 2024 19:40:45 -0400 Subject: [PATCH 5/8] Fix unit tests --- .../unit/test_split_pdf_hook.py | 55 +------------------ 1 file changed, 2 insertions(+), 53 deletions(-) diff --git a/_test_unstructured_client/unit/test_split_pdf_hook.py b/_test_unstructured_client/unit/test_split_pdf_hook.py index e606227d..d83af7eb 100644 --- a/_test_unstructured_client/unit/test_split_pdf_hook.py +++ b/_test_unstructured_client/unit/test_split_pdf_hook.py @@ -126,57 +126,6 @@ def test_unit_create_response(): assert response.headers.get("Content-Length"), expected_content_length -def test_unit_create_request(): - """Test create request method properly sets file, 
Content-Type and Content-Length headers. - List parameters should be flattened in the body.""" - - # Prepare test data - request = requests.PreparedRequest() - request.headers = { - "Content-Type": "application/json", - "Authorization": "Bearer token", - } - form_data = { - "parameter_1": "value_1", - "parameter_2": "value_2", - "list_parameter": ["value_1", "value_2"], - } - page = (io.BytesIO(b"page_content"), 1) - filename = "test_file.pdf" - - # Expected results - expected_page_filename = "test_file.pdf" - expected_body = MultipartEncoder( - fields=[ - ("parameter_1", "value_1"), - ("parameter_2", "value_2"), - ("list_parameter", "value_1"), - ("list_parameter", "value_2"), - ("split_pdf_page", "false"), - ("starting_page_number", "7"), - ("files", ( - expected_page_filename, - page[0], - "application/pdf", - )), - ] - ) - expected_url = "" - - # Create request - body = request_utils.create_request_body(form_data, page[0], filename, 7) - request_obj = request_utils.create_request(request, body) - request_content_type: str = request_obj.headers.get("Content-Type") - # Assert the request object - assert request_obj.method == "POST" - assert request_obj.url == expected_url - - # Validate fields ignoring order - assert set(request_obj.data.fields) == set(expected_body.fields) - - assert request_content_type.startswith("multipart/form-data") - - def test_unit_decode_content_disposition(): """Test decode content disposition method properly decodes Content-Disposition header.""" @@ -362,13 +311,13 @@ def test_get_optimal_split_size(num_pages, concurrency_level, expected_split_siz ("form_data", "expected_result"), [ ({}, DEFAULT_CONCURRENCY_LEVEL), # no value - ({"split_pdf_concurrency_level": 10}, 10), # valid number + ({"split_pdf_concurrency_level": "10"}, 10), # valid number ( # exceeds max value {"split_pdf_concurrency_level": f"{MAX_CONCURRENCY_LEVEL + 1}"}, MAX_CONCURRENCY_LEVEL, ), - ({"split_pdf_concurrency_level": -3}, DEFAULT_CONCURRENCY_LEVEL), # negative 
value + ({"split_pdf_concurrency_level": "-3"}, DEFAULT_CONCURRENCY_LEVEL), # negative value ], ) def test_unit_get_split_pdf_concurrency_level_returns_valid_number(form_data, expected_result): From 38928bae89c7c262e337f1f458e96d11b4991a68 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 20 Aug 2024 19:44:59 -0400 Subject: [PATCH 6/8] Remove temp comments --- src/unstructured_client/_hooks/custom/split_pdf_hook.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index 60527c7e..e4ab6ada 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -320,7 +320,6 @@ def _await_elements( return None ioloop = asyncio.get_event_loop() - # TODO New response type task_responses: list[tuple[int, httpx.Response]] = ioloop.run_until_complete( run_tasks(tasks, allow_failed=self.allow_failed) ) @@ -387,7 +386,6 @@ def after_success( updated_response = request_utils.create_response(response, elements) self._clear_operation(operation_id) - # TODO return updated_response def after_error( From ff157a3ec068c734a69b87812fe1352116ab7490 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 20 Aug 2024 20:10:10 -0400 Subject: [PATCH 7/8] Fix test failures due to incorrect Content-Length --- .../_hooks/custom/request_utils.py | 15 ++++++++------- .../_hooks/custom/split_pdf_hook.py | 4 +++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/unstructured_client/_hooks/custom/request_utils.py b/src/unstructured_client/_hooks/custom/request_utils.py index c6c78f05..09592804 100644 --- a/src/unstructured_client/_hooks/custom/request_utils.py +++ b/src/unstructured_client/_hooks/custom/request_utils.py @@ -59,12 +59,13 @@ async def call_api_async( ) -> httpx.Response: page_content, page_number = page body = create_request_body(form_data, page_content, filename, page_number) + 
original_headers = prepare_request_headers(original_request.headers) new_request = httpx.Request( method="POST", url=original_request.url or "", content=body.to_string(), - headers={**original_request.headers, "Content-Type": body.content_type}, + headers={**original_headers, "Content-Type": body.content_type}, ) async with limiter: @@ -73,8 +74,8 @@ async def call_api_async( def prepare_request_headers( - headers: dict[str, str], -) -> dict[str, str]: + headers: httpx.Headers, +) -> httpx.Headers: """Prepare the request headers by removing the 'Content-Type' and 'Content-Length' headers. Args: @@ -83,10 +84,10 @@ def prepare_request_headers( Returns: The modified request headers. """ - headers = copy.deepcopy(headers) - headers.pop("Content-Type", None) - headers.pop("Content-Length", None) - return headers + new_headers = headers.copy() + new_headers.pop("Content-Type", None) + new_headers.pop("Content-Length", None) + return new_headers def prepare_request_payload(form_data: FormData) -> FormData: diff --git a/src/unstructured_client/_hooks/custom/split_pdf_hook.py b/src/unstructured_client/_hooks/custom/split_pdf_hook.py index e4ab6ada..8627333f 100644 --- a/src/unstructured_client/_hooks/custom/split_pdf_hook.py +++ b/src/unstructured_client/_hooks/custom/split_pdf_hook.py @@ -14,6 +14,7 @@ from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME +from unstructured_client._hooks.custom.request_utils import prepare_request_headers from unstructured_client._hooks.custom.form_utils import ( PARTITION_FORM_CONCURRENCY_LEVEL_KEY, PARTITION_FORM_FILES_KEY, @@ -291,11 +292,12 @@ async def call_api_partial(page): ) original_request = request + original_headers = prepare_request_headers(original_request.headers) last_page_request = httpx.Request( method="POST", url=original_request.url or "", content=body.to_string(), - headers={**original_request.headers, 
"Content-Type": body.content_type}, + headers={**original_headers, "Content-Type": body.content_type}, ) return last_page_request From 39f182d7af0df695f5ea13e25fabc416611aef62 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Wed, 21 Aug 2024 13:55:09 -0400 Subject: [PATCH 8/8] Address pr comments * Fix some verbose isinstance checks * Use single line for install-dev --- Makefile | 3 +-- src/unstructured_client/_hooks/custom/form_utils.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 5ac17f68..0dc74af6 100644 --- a/Makefile +++ b/Makefile @@ -13,8 +13,7 @@ install-test: .PHONY: install-dev install-dev: - pip install jupyter uvloop - pip install pylint mypy + pip install jupyter uvloop pylint mypy ## install: installs all test, dev, and experimental requirements .PHONY: install diff --git a/src/unstructured_client/_hooks/custom/form_utils.py b/src/unstructured_client/_hooks/custom/form_utils.py index d13f3383..54fb06b3 100644 --- a/src/unstructured_client/_hooks/custom/form_utils.py +++ b/src/unstructured_client/_hooks/custom/form_utils.py @@ -35,7 +35,7 @@ def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int, try: _page_range = form_data.get(key) - if _page_range is not None and isinstance(_page_range, list): + if isinstance(_page_range, list): page_range = (int(_page_range[0]), int(_page_range[1])) else: page_range = (1, max_pages) @@ -108,7 +108,7 @@ def get_split_pdf_allow_failed_param( """ allow_failed = form_data.get(key) - if allow_failed is None or not isinstance(allow_failed, str): + if not isinstance(allow_failed, str): return fallback_value if allow_failed.lower() not in ["true", "false"]: @@ -141,7 +141,7 @@ def get_split_pdf_concurrency_level_param( """ concurrency_level_str = form_data.get(key) - if concurrency_level_str is None or not isinstance(concurrency_level_str, str): + if not isinstance(concurrency_level_str, str): return fallback_value try: