Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .genignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# https://www.speakeasyapi.dev/docs/customize-sdks/monkey-patching

# ignore human-written test files
tests/test_utils_retries.py
# ignore human-written files and directories
src/unstructured_client/_unstructured
_jupyter
_sample_docs
_test_unstructured_client

# ignore Makefile
Makefile
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ __pycache__/
.pytest_cache/
.python-version
.DS_Store

# human-added ignore files
.ipynb_checkpoints/
21 changes: 19 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,31 @@ ARCH := $(shell uname -m)
# Install #
###########

test-install:
.PHONY: install-test
pip install pytest
pip install requests_mock

.PHONY: install-dev
pip install jupyter

## install: installs all test, dev, and experimental requirements
.PHONY: install
install: install-test install-dev

#################
# Test and Lint #
#################

.PHONY: test
test:
PYTHONPATH=. pytest \
tests
_test_unstructured_client

###########
# Jupyter #
###########

## run-jupyter: starts jupyter notebook
.PHONY: run-jupyter
run-jupyter:
PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''
112 changes: 112 additions & 0 deletions _jupyter/README_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "cd4f8056-2015-4c28-8974-d9862db07e84",
"metadata": {},
"source": [
"Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b8e5368-6268-4da9-9e8d-5c38637da8a5",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"def get_api_key():\n",
" api_key = os.getenv(\"UNS_API_KEY\")\n",
" if api_key is None:\n",
" raise ValueError(\"\"\"UNS_API_KEY environment variable not set. \n",
"Set it in your current shell session with `export UNS_API_KEY=<api_key>`\"\"\")\n",
" return api_key"
]
},
{
"cell_type": "markdown",
"id": "11822c83-0791-432c-b1fb-05d8e2ae25bb",
"metadata": {},
"source": [
"\"Usage\" instructions from README for `unstructured-python-client` (as of 01/29/2023)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c28a39c-ad38-47a5-8247-a2fa1488313c",
"metadata": {},
"outputs": [],
"source": [
"from unstructured_client import UnstructuredClient\n",
"from unstructured_client.models import shared\n",
"from unstructured_client.models.errors import SDKError\n",
"\n",
"s = UnstructuredClient(api_key_auth=get_api_key())\n",
"filename = \"../_sample_docs/layout-parser-paper-fast.pdf\"\n",
"\n",
"with open(filename, \"rb\") as f:\n",
" # Note that this currently only supports a single filea\n",
" files=shared.Files(\n",
" content=f.read(),\n",
" file_name=filename,\n",
"\t)\n",
"\n",
"req = shared.PartitionParameters(\n",
" files=files,\n",
" # Other partition params\n",
" strategy='ocr_only',\n",
" languages=[\"eng\"],\n",
")\n",
"\n",
"try:\n",
" resp = s.general.partition(req)\n",
" print(resp.elements[0])\n",
"except SDKError as e:\n",
" print(e)\n",
"\n",
"# {\n",
"# 'type': 'UncategorizedText', \n",
"# 'element_id': 'fc550084fda1e008e07a0356894f5816', \n",
"# 'metadata': {\n",
"# 'filename': 'layout-parser-paper-fast.pdf', \n",
"# 'filetype': 'application/pdf', \n",
"# 'languages': ['eng'], \n",
"# 'page_number': 1\n",
"# }\n",
"# }"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f5dfdb68-ba5d-4d21-98b2-4efe04126b7a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file added _sample_docs/layout-parser-paper-fast.pdf
Binary file not shown.
70 changes: 70 additions & 0 deletions _test_unstructured_client/test_check_url_protocol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import os
import pytest

from unstructured_client import UnstructuredClient


def get_api_key():
    """Return the API key stored in the ``UNS_API_KEY`` environment variable.

    Raises:
        ValueError: if ``UNS_API_KEY`` is not set in the environment.
    """
    key = os.environ.get("UNS_API_KEY")
    if key is not None:
        return key

    # -- fail loudly with instructions rather than passing `None` to the client
    raise ValueError("""UNS_API_KEY environment variable not set.
Set it in your current shell session with `export UNS_API_KEY=<api_key>`""")


@pytest.mark.parametrize(
    "server_url",
    [
        # -- already-correct url
        "https://unstructured-000mock.api.unstructuredapp.io",
        # -- missing scheme
        "unstructured-000mock.api.unstructuredapp.io",
        # -- wrong scheme plus endpoint path
        "http://unstructured-000mock.api.unstructuredapp.io/general/v0/general",
        # -- correct scheme plus endpoint path
        "https://unstructured-000mock.api.unstructuredapp.io/general/v0/general",
        # -- missing scheme plus endpoint path
        "unstructured-000mock.api.unstructuredapp.io/general/v0/general",
    ],
)
def test_clean_server_url_on_paid_api_url(server_url: str):
    """Every paid-API url variant is normalized to the bare https base url."""
    expected_url = "https://unstructured-000mock.api.unstructuredapp.io"

    client = UnstructuredClient(
        api_key_auth=get_api_key(),
        server_url=server_url,
    )

    actual_url = client.general.sdk_configuration.server_url
    assert actual_url == expected_url


@pytest.mark.parametrize(
    "server_url",
    [
        # -- already-correct url
        "http://localhost:8000",
        # -- missing scheme
        "localhost:8000",
        # -- missing scheme plus endpoint path
        "localhost:8000/general/v0/general",
        # -- correct scheme plus endpoint path
        "http://localhost:8000/general/v0/general",
    ],
)
def test_clean_server_url_on_localhost(server_url: str):
    """Every localhost url variant is normalized to the bare http base url."""
    expected_url = "http://localhost:8000"

    client = UnstructuredClient(
        api_key_auth=get_api_key(),
        server_url=server_url,
    )

    actual_url = client.general.sdk_configuration.server_url
    assert actual_url == expected_url


def test_clean_server_url_on_empty_string():
    """An empty `server_url` is left as an empty string on the SDK configuration."""
    init_kwargs = {"server_url": "", "api_key_auth": get_api_key()}

    client = UnstructuredClient(**init_kwargs)

    configured_url = client.general.sdk_configuration.server_url
    assert configured_url == ""

@pytest.mark.parametrize(
    "server_url",
    [
        "https://unstructured-000mock.api.unstructuredapp.io",
        "unstructured-000mock.api.unstructuredapp.io/general/v0/general",
    ],
)
def test_clean_server_url_with_positional_arguments(server_url: str):
    """The url is cleaned when it is passed positionally instead of by keyword."""
    expected_url = "https://unstructured-000mock.api.unstructuredapp.io"

    # -- all three arguments are deliberately positional; the empty string is the
    # -- second positional parameter of the client constructor
    client = UnstructuredClient(get_api_key(), "", server_url)

    actual_url = client.general.sdk_configuration.server_url
    assert actual_url == expected_url
2 changes: 2 additions & 0 deletions src/unstructured_client/sdk.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@
from typing import Callable, Dict, Union
from unstructured_client import utils
from unstructured_client.models import shared
from unstructured_client.utils._decorators import clean_server_url

class UnstructuredClient:
r"""Unstructured Pipeline API: Partition documents with the Unstructured library"""
general: General

sdk_configuration: SDKConfiguration

@clean_server_url
def __init__(self,
api_key_auth: Union[str, Callable[[], str]],
server: str = None,
Expand Down
46 changes: 46 additions & 0 deletions src/unstructured_client/utils/_decorators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from __future__ import annotations

import functools
from typing import cast, Callable, Optional
from typing_extensions import ParamSpec
from urllib.parse import urlparse, urlunparse, ParseResult


_P = ParamSpec("_P")


def clean_server_url(func: Callable[_P, None]) -> Callable[_P, None]:

@functools.wraps(func)
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
SERVER_URL_ARG_IDX = 3
url_is_in_kwargs = True

server_url: Optional[str] = cast(Optional[str], kwargs.get("server_url"))

if server_url is None and len(args) > SERVER_URL_ARG_IDX:
server_url = cast(str, args[SERVER_URL_ARG_IDX])
url_is_in_kwargs = False

if server_url:
# -- add a url scheme if not present (urllib.parse does not work reliably without it)
if "http" not in server_url:
server_url = "http://" + server_url

parsed_url: ParseResult = urlparse(server_url)

if "api.unstructuredapp.io" in server_url:
if parsed_url.scheme != "https":
parsed_url = parsed_url._replace(scheme="https")

# -- path should always be empty
cleaned_url = parsed_url._replace(path="")

if url_is_in_kwargs:
kwargs["server_url"] = urlunparse(cleaned_url)
else:
args = args[:SERVER_URL_ARG_IDX] + (urlunparse(cleaned_url),) + args[SERVER_URL_ARG_IDX+1:] # type: ignore

return func(*args, **kwargs)

return wrapper