From d9332ac9f108f5639f1f661e28f1c7611c49cf14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Thu, 17 Oct 2024 15:37:45 +0200 Subject: [PATCH 1/7] test: added partition_via_api tests removed from unstructured recently --- .../contract/test_partition_via_api.py | 133 ++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 _test_unstructured_client/contract/test_partition_via_api.py diff --git a/_test_unstructured_client/contract/test_partition_via_api.py b/_test_unstructured_client/contract/test_partition_via_api.py new file mode 100644 index 00000000..f40f9a65 --- /dev/null +++ b/_test_unstructured_client/contract/test_partition_via_api.py @@ -0,0 +1,133 @@ +import os +from pathlib import Path + +import httpx +import pytest +from unstructured.partition.api import partition_via_api + +from unstructured_client import UnstructuredClient + + +@pytest.fixture(scope="module") +def client() -> UnstructuredClient: + _client = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), server='free-api') + yield _client + + +@pytest.fixture(scope="module") +def doc_path() -> Path: + return Path(__file__).resolve().parents[2] / "_sample_docs" + + +MOCK_TEXT = """[ + { + "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", + "text": "This is a test email to use for unit tests.", + "type": "NarrativeText", + "metadata": { + "sent_from": [ + "Matthew Robinson " + ], + "sent_to": [ + "Matthew Robinson " + ], + "subject": "Test Email", + "filename": "fake-email.eml", + "filetype": "message/rfc822" + } + } +]""" + + +@pytest.mark.parametrize(("url", "full_url"), [ + ("http://localhost:8000", "http://localhost:8000/general/v0/general"), + ("http://localhost:8000/general/v0/general", "http://localhost:8000/general/v0/general"), +] + ) +def test_partition_via_api_custom_url(httpx_mock, doc_path: Path, url: str, full_url: str): + """ + Assert that we can specify api_url and requests are sent to the right place + """ + + filename = 
"layout-parser-paper-fast.pdf" + + # adding response automatically checks whether a response to a request to given URL was found + httpx_mock.add_response( + method="POST", + url=full_url, + headers={"Content-Type": "application/json"}, + content=MOCK_TEXT.encode(), + ) + + partition_via_api(filename=str(doc_path/filename), api_url=url, metadata_filename=filename) + + + +def test_partition_via_api_pass_list_type_parameters(httpx_mock, doc_path: Path): + url = "http://localhost:8000/general/v0/general" + filename = "layout-parser-paper-fast.pdf" + + httpx_mock.add_response( + method="POST", + headers={"Content-Type": "application/json"}, + content=MOCK_TEXT.encode(), + url=url, + ) + + params = dict( + split_pdf_page=False, + strategy="hi_res", + extract_image_block_types=["image", "table"], + skip_infer_table_types=["pdf", "docx"], + languages=["eng"], + ) + + partition_via_api(filename=str(doc_path / filename), + api_url=url, + metadata_filename=filename, + **params) + + requests = httpx_mock.get_requests() + + assert len(requests) == 1 + + request = requests[0] + + parsed_multipart_form = _parse_multipart_data(request) + assert "coordinates" in parsed_multipart_form + for key, value in params.items(): + assert key in parsed_multipart_form + assert parsed_multipart_form[key] == value + + +def _parse_multipart_data(request: httpx.Request) -> dict: + """Parser for multipart form data in raw format to a dictionary. Omits "files" field + Includes table-like entries. 
+ """ + data = request.content + boundary = request.headers["Content-Type"].split("boundary=")[1] + parts = data.split(f"--{boundary}".encode()) + parts = [part.strip() for part in parts if part.strip()] + parsed_data = {} + for part in parts: + if b"Content-Disposition: form-data" in part: + try: + semicolon_pos = part.find(b";") + contents = part[semicolon_pos + 2:] + if b"name=\"files\"" in contents: + continue + contents = contents.decode() + key, value = contents.split("\r\n\r\n") + key = key.replace("name=", "").strip('"') + if "[]" in key: + key = key.replace("[]", "") + if key not in parsed_data: + parsed_data[key] = [] + parsed_data[key].append(value) + elif value in ["true", "false"]: + parsed_data[key] = value == "true" + else: + parsed_data[key] = value + except Exception as ex: + print(ex) + return parsed_data From d34908e327288e533bc923fa5e08733a507f8ff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Thu, 17 Oct 2024 15:39:13 +0200 Subject: [PATCH 2/7] chore: enabled newly added tests and their dependencies --- .github/workflows/ci.yaml | 19 +++++++++++++++++++ Makefile | 9 +++++++++ pyproject.toml | 5 +++-- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9d96e9f8..fb28e1a8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -63,3 +63,22 @@ jobs: env: UNSTRUCTURED_API_KEY: ${{ secrets.UNSTRUCTURED_API_KEY }} + test_contract: + strategy: + matrix: + python-version: [ "3.9","3.10","3.11", "3.12" ] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + make install + make install-test-contract + - name: Run unit tests + run: | + poetry run make test-contract + diff --git a/Makefile b/Makefile index 8f97a22b..f1065e2d 100644 --- a/Makefile +++ 
b/Makefile @@ -19,6 +19,11 @@ install: install-speakeasy-cli: curl -fsSL https://raw.githubusercontent.com/speakeasy-api/speakeasy/main/install.sh | sh +## install-test: install test requirements as they cannot be put into pyproject.toml due to python version requirements mismatch +.PHONY: install-test-contract +install-test: + pip install unstructured pytest-httpx + ################# # Test and Lint # ################# @@ -30,6 +35,10 @@ test: test-unit test-integration-docker test-unit: PYTHONPATH=. pytest _test_unstructured_client -v -k "unit" +.PHONY: test-contract +test-contract: + PYTHONPATH=. pytest _test_unstructured_client -v -k "contract" + # Assumes you have unstructured-api running on localhost:8000 .PHONY: test-integration test-integration: diff --git a/pyproject.toml b/pyproject.toml index 0acb1067..e4724a80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ include = ["py.typed", "src/unstructured_client/py.typed"] in-project = true [tool.poetry.dependencies] -python = "^3.8" +python = "^3.9" cryptography = ">=3.1" eval-type-backport = "^0.2.0" httpx = ">=0.27.0" @@ -37,9 +37,10 @@ pylint = "==3.2.3" pytest = ">=8.3.3" pytest-asyncio = ">=0.24.0" pytest-mock = ">=3.14.0" +# pytest-httpx = ">=0.32.0" # requires python >= 3.9, the project is pinned to <4.0; >=3.8 (used by contract tests) types-python-dateutil = "^2.9.0.20240316" uvloop = ">=0.20.0" - +# unstructured = ">=0.15.0" # requires python >= 3.9, the project is pinned to <4.0; >=3.8 (used by contract tests) [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" From 626baddeea96a7f675c7c8aecc39822820f6c9c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Thu, 17 Oct 2024 15:56:32 +0200 Subject: [PATCH 3/7] chore: removed comments from pyproject to satisfy ci checks --- pyproject.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e4724a80..0acb1067 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ include = ["py.typed", "src/unstructured_client/py.typed"] in-project = true [tool.poetry.dependencies] -python = "^3.9" +python = "^3.8" cryptography = ">=3.1" eval-type-backport = "^0.2.0" httpx = ">=0.27.0" @@ -37,10 +37,9 @@ pylint = "==3.2.3" pytest = ">=8.3.3" pytest-asyncio = ">=0.24.0" pytest-mock = ">=3.14.0" -# pytest-httpx = ">=0.32.0" # requires python >= 3.9, the project is pinned to <4.0; >=3.8 (used by contract tests) types-python-dateutil = "^2.9.0.20240316" uvloop = ">=0.20.0" -# unstructured = ">=0.15.0" # requires python >= 3.9, the project is pinned to <4.0; >=3.8 (used by contract tests) + [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" From e1a2fe40f379f49f55c4ffd098f605c5b921a83f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Fri, 18 Oct 2024 12:49:48 +0200 Subject: [PATCH 4/7] fix: fixed tests structure and deps installation --- Makefile | 4 ++-- .../contract => _test_contract}/test_partition_via_api.py | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename {_test_unstructured_client/contract => _test_contract}/test_partition_via_api.py (100%) diff --git a/Makefile b/Makefile index f1065e2d..4233335c 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ install-speakeasy-cli: ## install-test: install test requirements as they cannot be put into pyproject.toml due to python version requirements mismatch .PHONY: install-test-contract -install-test: +install-test-contract: pip install unstructured pytest-httpx ################# @@ -37,7 +37,7 @@ test-unit: .PHONY: test-contract test-contract: - PYTHONPATH=. pytest _test_unstructured_client -v -k "contract" + PYTHONPATH=. 
pytest _test_contract -v # Assumes you have unstructured-api running on localhost:8000 .PHONY: test-integration diff --git a/_test_unstructured_client/contract/test_partition_via_api.py b/_test_contract/test_partition_via_api.py similarity index 100% rename from _test_unstructured_client/contract/test_partition_via_api.py rename to _test_contract/test_partition_via_api.py From f1451e19c6da894e535bf6bda1f68a8458cc7633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Fri, 18 Oct 2024 13:05:36 +0200 Subject: [PATCH 5/7] fix: try to fix poetry venv issues while installing non-safe packages --- .github/workflows/ci.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fb28e1a8..2000c2bb 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -68,6 +68,8 @@ jobs: matrix: python-version: [ "3.9","3.10","3.11", "3.12" ] runs-on: ubuntu-latest + env: + POETRY_VIRTUALENVS_IN_PROJECT: "true" steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} @@ -77,7 +79,7 @@ jobs: - name: Install dependencies run: | make install - make install-test-contract + source .venv/bin/activate && make install-test-contract - name: Run unit tests run: | poetry run make test-contract From 7fb3ac89258bf341b892a6e381739e89415a0d2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Fri, 18 Oct 2024 13:11:10 +0200 Subject: [PATCH 6/7] fix: fix a test's path setting --- _test_contract/test_partition_via_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/_test_contract/test_partition_via_api.py b/_test_contract/test_partition_via_api.py index f40f9a65..c837c80b 100644 --- a/_test_contract/test_partition_via_api.py +++ b/_test_contract/test_partition_via_api.py @@ -16,7 +16,9 @@ def client() -> UnstructuredClient: @pytest.fixture(scope="module") def doc_path() -> Path: - return Path(__file__).resolve().parents[2] / 
"_sample_docs" + samples_path = Path(__file__).resolve().parents[1] / "_sample_docs" + assert samples_path.exists() + return samples_path MOCK_TEXT = """[ From e03e3615fab278ad19cf953dd22b553b1fef048c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Kmiecik?= Date: Fri, 18 Oct 2024 14:13:57 +0200 Subject: [PATCH 7/7] chore: style fixes --- _test_contract/test_partition_via_api.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/_test_contract/test_partition_via_api.py b/_test_contract/test_partition_via_api.py index c837c80b..1e367d9d 100644 --- a/_test_contract/test_partition_via_api.py +++ b/_test_contract/test_partition_via_api.py @@ -41,11 +41,12 @@ def doc_path() -> Path: ]""" -@pytest.mark.parametrize(("url", "full_url"), [ - ("http://localhost:8000", "http://localhost:8000/general/v0/general"), - ("http://localhost:8000/general/v0/general", "http://localhost:8000/general/v0/general"), -] - ) +@pytest.mark.parametrize( + ("url", "full_url"), [ + ("http://localhost:8000", "http://localhost:8000/general/v0/general"), + ("http://localhost:8000/general/v0/general", "http://localhost:8000/general/v0/general"), + ] +) def test_partition_via_api_custom_url(httpx_mock, doc_path: Path, url: str, full_url: str): """ Assert that we can specify api_url and requests are sent to the right place @@ -61,8 +62,7 @@ def test_partition_via_api_custom_url(httpx_mock, doc_path: Path, url: str, full content=MOCK_TEXT.encode(), ) - partition_via_api(filename=str(doc_path/filename), api_url=url, metadata_filename=filename) - + partition_via_api(filename=str(doc_path / filename), api_url=url, metadata_filename=filename) def test_partition_via_api_pass_list_type_parameters(httpx_mock, doc_path: Path):