diff --git a/.genignore b/.genignore index ab9191ff..185d456e 100644 --- a/.genignore +++ b/.genignore @@ -1,7 +1,10 @@ # https://www.speakeasyapi.dev/docs/customize-sdks/monkey-patching -# ignore human-written test files -tests/test_utils_retries.py +# ignore human-written files and directories +src/unstructured_client/_unstructured +_jupyter +_sample_docs +_test_unstructured_client # ignore Makefile Makefile diff --git a/.gitignore b/.gitignore index 3cdf3380..83bf0c26 100755 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,6 @@ __pycache__/ .pytest_cache/ .python-version .DS_Store + +# human-added ignore files +.ipynb_checkpoints/ diff --git a/Makefile b/Makefile index 2921f96d..71c41168 100644 --- a/Makefile +++ b/Makefile @@ -6,9 +6,17 @@ ARCH := $(shell uname -m) # Install # ########### -test-install: +.PHONY: install-test + pip install pytest pip install requests_mock +.PHONY: install-dev + pip install jupyter + +## install: installs all test, dev, and experimental requirements +.PHONY: install +install: install-test install-dev + ################# # Test and Lint # ################# @@ -16,4 +24,13 @@ test-install: .PHONY: test test: PYTHONPATH=. pytest \ - tests \ No newline at end of file + _test_unstructured_client + +########### +# Jupyter # +########### + +## run-jupyter: starts jupyter notebook +.PHONY: run-jupyter +run-jupyter: + PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) 
jupyter-notebook --NotebookApp.token='' --NotebookApp.password='' diff --git a/_jupyter/README_example.ipynb b/_jupyter/README_example.ipynb new file mode 100644 index 00000000..51c4f30f --- /dev/null +++ b/_jupyter/README_example.ipynb @@ -0,0 +1,112 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cd4f8056-2015-4c28-8974-d9862db07e84", + "metadata": {}, + "source": [ + "Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b8e5368-6268-4da9-9e8d-5c38637da8a5", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "def get_api_key():\n", + " api_key = os.getenv(\"UNS_API_KEY\")\n", + " if api_key is None:\n", + " raise ValueError(\"\"\"UNS_API_KEY environment variable not set. \n", + "Set it in your current shell session with `export UNS_API_KEY=`\"\"\")\n", + " return api_key" + ] + }, + { + "cell_type": "markdown", + "id": "11822c83-0791-432c-b1fb-05d8e2ae25bb", + "metadata": {}, + "source": [ + "\"Usage\" instructions from README for `unstructured-python-client` (as of 01/29/2023)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c28a39c-ad38-47a5-8247-a2fa1488313c", + "metadata": {}, + "outputs": [], + "source": [ + "from unstructured_client import UnstructuredClient\n", + "from unstructured_client.models import shared\n", + "from unstructured_client.models.errors import SDKError\n", + "\n", + "s = UnstructuredClient(api_key_auth=get_api_key())\n", + "filename = \"../_sample_docs/layout-parser-paper-fast.pdf\"\n", + "\n", + "with open(filename, \"rb\") as f:\n", + " # Note that this currently only supports a single file\n", + " files=shared.Files(\n", + " content=f.read(),\n", + " file_name=filename,\n", + "\t)\n", + "\n", + "req = shared.PartitionParameters(\n", + " files=files,\n", + " # Other partition params\n", + " strategy='ocr_only',\n", + " languages=[\"eng\"],\n", + ")\n", + "\n", + "try:\n", + " resp = s.general.partition(req)\n", + " print(resp.elements[0])\n", + 
"except SDKError as e:\n", + " print(e)\n", + "\n", + "# {\n", + "# 'type': 'UncategorizedText', \n", + "# 'element_id': 'fc550084fda1e008e07a0356894f5816', \n", + "# 'metadata': {\n", + "# 'filename': 'layout-parser-paper-fast.pdf', \n", + "# 'filetype': 'application/pdf', \n", + "# 'languages': ['eng'], \n", + "# 'page_number': 1\n", + "# }\n", + "# }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5dfdb68-ba5d-4d21-98b2-4efe04126b7a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/_sample_docs/layout-parser-paper-fast.pdf b/_sample_docs/layout-parser-paper-fast.pdf new file mode 100644 index 00000000..6a318063 Binary files /dev/null and b/_sample_docs/layout-parser-paper-fast.pdf differ diff --git a/_test_unstructured_client/test_check_url_protocol.py b/_test_unstructured_client/test_check_url_protocol.py new file mode 100644 index 00000000..2bc0b9fd --- /dev/null +++ b/_test_unstructured_client/test_check_url_protocol.py @@ -0,0 +1,70 @@ +import os +import pytest + +from unstructured_client import UnstructuredClient + + +def get_api_key(): + api_key = os.getenv("UNS_API_KEY") + if api_key is None: + raise ValueError("""UNS_API_KEY environment variable not set. 
+Set it in your current shell session with `export UNS_API_KEY=`""") + return api_key + + +@pytest.mark.parametrize( + ("server_url"), + [ + ("https://unstructured-000mock.api.unstructuredapp.io"), # correct url + ("unstructured-000mock.api.unstructuredapp.io"), + ("http://unstructured-000mock.api.unstructuredapp.io/general/v0/general"), + ("https://unstructured-000mock.api.unstructuredapp.io/general/v0/general"), + ("unstructured-000mock.api.unstructuredapp.io/general/v0/general"), + ] +) +def test_clean_server_url_on_paid_api_url(server_url: str): + client = UnstructuredClient( + server_url=server_url, + api_key_auth=get_api_key(), + ) + assert client.general.sdk_configuration.server_url == "https://unstructured-000mock.api.unstructuredapp.io" + + +@pytest.mark.parametrize( + ("server_url"), + [ + ("http://localhost:8000"), # correct url + ("localhost:8000"), + ("localhost:8000/general/v0/general"), + ("http://localhost:8000/general/v0/general"), + ] +) +def test_clean_server_url_on_localhost(server_url: str): + client = UnstructuredClient( + server_url=server_url, + api_key_auth=get_api_key(), + ) + assert client.general.sdk_configuration.server_url == "http://localhost:8000" + + +def test_clean_server_url_on_empty_string(): + client = UnstructuredClient( + server_url="", + api_key_auth=get_api_key(), + ) + assert client.general.sdk_configuration.server_url == "" + +@pytest.mark.parametrize( + ("server_url"), + [ + ("https://unstructured-000mock.api.unstructuredapp.io"), + ("unstructured-000mock.api.unstructuredapp.io/general/v0/general"), + ] +) +def test_clean_server_url_with_positional_arguments(server_url: str): + client = UnstructuredClient( + get_api_key(), + "", + server_url, + ) + assert client.general.sdk_configuration.server_url == "https://unstructured-000mock.api.unstructuredapp.io" diff --git a/tests/test_utils_retries.py b/_test_unstructured_client/test_utils_retries.py similarity index 100% rename from tests/test_utils_retries.py rename to 
_test_unstructured_client/test_utils_retries.py diff --git a/src/unstructured_client/sdk.py b/src/unstructured_client/sdk.py index 18fb5cb1..9358dbac 100644 --- a/src/unstructured_client/sdk.py +++ b/src/unstructured_client/sdk.py @@ -6,6 +6,7 @@ from typing import Callable, Dict, Union from unstructured_client import utils from unstructured_client.models import shared +from unstructured_client.utils._decorators import clean_server_url class UnstructuredClient: r"""Unstructured Pipeline API: Partition documents with the Unstructured library""" @@ -13,6 +14,7 @@ class UnstructuredClient: sdk_configuration: SDKConfiguration + @clean_server_url def __init__(self, api_key_auth: Union[str, Callable[[], str]], server: str = None, diff --git a/src/unstructured_client/utils/_decorators.py b/src/unstructured_client/utils/_decorators.py new file mode 100644 index 00000000..fd891c2a --- /dev/null +++ b/src/unstructured_client/utils/_decorators.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import functools +from typing import cast, Callable, Optional +from typing_extensions import ParamSpec +from urllib.parse import urlparse, urlunparse, ParseResult + + +_P = ParamSpec("_P") + + +def clean_server_url(func: Callable[_P, None]) -> Callable[_P, None]: + + @functools.wraps(func) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: + SERVER_URL_ARG_IDX = 3 + url_is_in_kwargs = True + + server_url: Optional[str] = cast(Optional[str], kwargs.get("server_url")) + + if server_url is None and len(args) > SERVER_URL_ARG_IDX: + server_url = cast(str, args[SERVER_URL_ARG_IDX]) + url_is_in_kwargs = False + + if server_url: + # -- add a url scheme if not present (urllib.parse does not work reliably without it) + if "http" not in server_url: + server_url = "http://" + server_url + + parsed_url: ParseResult = urlparse(server_url) + + if "api.unstructuredapp.io" in server_url: + if parsed_url.scheme != "https": + parsed_url = parsed_url._replace(scheme="https") + + # -- path 
should always be empty + cleaned_url = parsed_url._replace(path="") + + if url_is_in_kwargs: + kwargs["server_url"] = urlunparse(cleaned_url) + else: + args = args[:SERVER_URL_ARG_IDX] + (urlunparse(cleaned_url),) + args[SERVER_URL_ARG_IDX+1:] # type: ignore + + return func(*args, **kwargs) + + return wrapper