diff --git a/api-reference/api-services/examples.mdx b/api-reference/api-services/examples.mdx index 029dbf2a..18a39d7e 100644 --- a/api-reference/api-services/examples.mdx +++ b/api-reference/api-services/examples.mdx @@ -99,7 +99,7 @@ The `hi_res` strategy supports different models, and the default is `layout_v1.1 import os from unstructured_client import UnstructuredClient - from unstructured_client.models import shared + from unstructured_client.models import operations, shared from unstructured_client.models.errors import SDKError client = UnstructuredClient( @@ -109,21 +109,23 @@ The `hi_res` strategy supports different models, and the default is `layout_v1.1 filename = "sample-docs/layout-parser-paper.pdf" file = open(filename, "rb") - req = shared.PartitionParameters( - # Note that this currently only supports a single file - files=shared.Files( - content=file.read(), - file_name=filename, - ), - strategy=shared.Strategy.HI_RES, - hi_res_model_name="layout_v1.1.0", - split_pdf_page=True, - split_pdf_allow_failed=True, - split_pdf_concurrency_level=15 + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + # Note that this currently only supports a single file. 
+ files=shared.Files( + content=file.read(), + file_name=filename, + ), + strategy=shared.Strategy.HI_RES, + hi_res_model_name="layout_v1.1.0", + split_pdf_page=True, + split_pdf_allow_failed=True, + split_pdf_concurrency_level=15 + ) ) try: - res = client.general.partition(req) + res = client.general.partition(request=req) print(res.elements[0]) except SDKError as e: print(e) @@ -248,7 +250,7 @@ For better OCR results, you can specify what languages your document is in using import os from unstructured_client import UnstructuredClient - from unstructured_client.models import shared + from unstructured_client.models import operations, shared from unstructured_client.models.errors import SDKError client = UnstructuredClient( @@ -258,21 +260,23 @@ For better OCR results, you can specify what languages your document is in using filename = "sample-docs/korean.png" file = open(filename, "rb") - req = shared.PartitionParameters( - # Note that this currently only supports a single file - files=shared.Files( - content=file.read(), - file_name=filename, - ), - strategy=shared.Strategy.OCR_ONLY, - languages=["kor"], - split_pdf_page=True, - split_pdf_allow_failed=True, - split_pdf_concurrency_level=15 - ) + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + # Note that this currently only supports a single file. 
+ files=shared.Files( + content=file.read(), + file_name=filename, + ), + strategy=shared.Strategy.OCR_ONLY, + languages=["kor"], + split_pdf_page=True, + split_pdf_allow_failed=True, + split_pdf_concurrency_level=15 + ) + ) try: - res = client.general.partition(req) + res = client.general.partition(request=req) print(res.elements[0]) except SDKError as e: print(e) @@ -394,7 +398,7 @@ Set the `coordinates` parameter to `true` to add this field to the elements in t import os from unstructured_client import UnstructuredClient - from unstructured_client.models import shared + from unstructured_client.models import operations, shared from unstructured_client.models.errors import SDKError client = UnstructuredClient( @@ -404,21 +408,23 @@ Set the `coordinates` parameter to `true` to add this field to the elements in t filename = "sample-docs/layout-parser-paper.pdf" file = open(filename, "rb") - req = shared.PartitionParameters( - # Note that this currently only supports a single file - files=shared.Files( - content=file.read(), - file_name=filename, - ), - strategy=shared.Strategy.HI_RES, - coordinates=True, - split_pdf_page=True, - split_pdf_allow_failed=True, - split_pdf_concurrency_level=15 + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + # Note that this currently only supports a single file. 
+ files=shared.Files( + content=file.read(), + file_name=filename, + ), + strategy=shared.Strategy.HI_RES, + coordinates=True, + split_pdf_page=True, + split_pdf_allow_failed=True, + split_pdf_concurrency_level=15 + ) ) try: - res = client.general.partition(req) + res = client.general.partition(request=req) print(res.elements[0]) except SDKError as e: print(e) @@ -543,7 +549,7 @@ This can be helpful if you'd like to use the IDs as a primary key in a database, import os from unstructured_client import UnstructuredClient - from unstructured_client.models import shared + from unstructured_client.models import operations, shared from unstructured_client.models.errors import SDKError client = UnstructuredClient( @@ -553,21 +559,23 @@ This can be helpful if you'd like to use the IDs as a primary key in a database, filename = "sample-docs/layout-parser-paper-fast.pdf" file = open(filename, "rb") - req = shared.PartitionParameters( - # Note that this currently only supports a single file - files=shared.Files( - content=file.read(), - file_name=filename, - ), - unique_element_ids=True, - strategy=shared.Strategy.HI_RES, - split_pdf_page=True, - split_pdf_allow_failed=True, - split_pdf_concurrency_level=15 + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + # Note that this currently only supports a single file. 
+ files=shared.Files( + content=file.read(), + file_name=filename, + ), + unique_element_ids=True, + strategy=shared.Strategy.HI_RES, + split_pdf_page=True, + split_pdf_allow_failed=True, + split_pdf_concurrency_level=15 + ) ) try: - res = client.general.partition(req) + res = client.general.partition(request=req) print(res.elements[0]) except SDKError as e: print(e) @@ -698,7 +706,7 @@ By default, the `chunking_strategy` is set to `None`, and no chunking is perform import os from unstructured_client import UnstructuredClient - from unstructured_client.models import shared + from unstructured_client.models import operations, shared from unstructured_client.models.errors import SDKError client = UnstructuredClient( @@ -708,22 +716,24 @@ By default, the `chunking_strategy` is set to `None`, and no chunking is perform filename = "sample-docs/layout-parser-paper-fast.pdf" file = open(filename, "rb") - req = shared.PartitionParameters( - # Note that this currently only supports a single file - files=shared.Files( - content=file.read(), - file_name=filename, - ), - chunking_strategy="by_title", - max_characters=1024, - strategy=shared.Strategy.HI_RES, - split_pdf_page=True, - split_pdf_allow_failed=True, - split_pdf_concurrency_level=15 + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + # Note that this currently only supports a single file. 
+ files=shared.Files( + content=file.read(), + file_name=filename, + ), + chunking_strategy="by_title", + max_characters=1024, + strategy=shared.Strategy.HI_RES, + split_pdf_page=True, + split_pdf_allow_failed=True, + split_pdf_concurrency_level=15 + ) ) try: - res = client.general.partition(req) + res = client.general.partition(request=req) print(res.elements[0]) except SDKError as e: print(e) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index c70cf183..a00c1c13 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -31,21 +31,56 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv The SDK uses semantic versioning and major bumps could bring breaking changes. It is advised to - pin your installed version. + pin your installed version. See the [migration guide](#migration-guide), later on this page, for breaking change announcements. ## Basics - Let's start with a simple example in which you send a PDF document to be partitioned with the Free Unstructured API: - - - Python SDK Deprecation Warning (>v0.22.0): The legacy method of passing `shared.PartitionParameters` - directly to `client.general.partition()` is currently supported but may be deprecated and - could break in future releases. Users should migrate to the new `shared.PartitionRequest` object - to ensure compatibility with future updates. - + Let's start with a simple example in which you send a PDF document to be partitioned with the Unstructured API. Note that the Python SDK gives you a choice of two interfaces. The functionality is identical, but the Pydantic model syntax may provide completion hints in your IDE, for instance. 
- ```python Python + ```python Python (TypedDict example) + import os, json + + import unstructured_client + from unstructured_client.models import shared + + client = unstructured_client.UnstructuredClient( + api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), + server_url=os.getenv("UNSTRUCTURED_API_URL"), + ) + + filename = "PATH_TO_INPUT_FILE" + + req = { + "partition_parameters": { + "files": { + "content": open(filename, "rb"), + "file_name": filename, + }, + "strategy": shared.Strategy.HI_RES, + "languages": ['eng'], + "split_pdf_page": True, # If True, splits the PDF file into smaller chunks of pages. + "split_pdf_allow_failed": True, # If True, the partitioning continues even if some pages fail. + "split_pdf_concurrency_level": 15 # Set the number of concurrent requests to the maximum value: 15. + } + } + + try: + res = client.general.partition(request=req) + element_dicts = [element for element in res.elements] + + # Print the processed data's first element only. + print(element_dicts[0]) + + # Write the processed data to a local file. + json_elements = json.dumps(element_dicts, indent=2) + + with open("PATH_TO_OUTPUT_FILE", "w") as file: + file.write(json_elements) + except Exception as e: + print(e) + ``` + ```python Python (Pydantic model example) import os, json import unstructured_client @@ -57,13 +92,11 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv ) filename = "PATH_TO_INPUT_FILE" - with open(filename, "rb") as f: - data = f.read() req = operations.PartitionRequest( partition_parameters=shared.PartitionParameters( files=shared.Files( - content=data, + content=open(filename, "rb"), file_name=filename, ), strategy=shared.Strategy.HI_RES, @@ -89,40 +122,69 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv except Exception as e: print(e) ``` - ```python Python (SDK <=v0.22.0) - import os + + +## Async partitioning + + The Python SDK also has a `partition_async` method. 
+ This call is equivalent to `partition` except that it can be used in a non-blocking context. For instance, `asyncio.gather` can be used to concurrently process multiple files at once, as demonstrated here: + + + ```python + import asyncio + import os, json import unstructured_client from unstructured_client.models import shared - from unstructured_client.models.errors import SDKError client = unstructured_client.UnstructuredClient( api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), server_url=os.getenv("UNSTRUCTURED_API_URL"), ) - filename = "PATH_TO_FILE" - with open(filename, "rb") as f: - data = f.read() + async def call_api(filename): + req = { + "partition_parameters": { + "files": { + "content": open(filename, "rb"), + "file_name": filename, + }, + "strategy": shared.Strategy.HI_RES, + } + } - req = shared.PartitionParameters( - files=shared.Files( - content=data, - file_name=filename, - ), - # Other partition parameters - strategy="hi_res", - languages=['eng'], - ) + try: + res = await client.general.partition_async(request=req) - try: - res = client.general.partition(request=req) - print(res.elements[0]) - except SDKError as e: - print(e) + element_dicts = [element for element in res.elements] + + json_elements = json.dumps(element_dicts, indent=2) + + output_filename = filename + ".json" # Save the JSON response alongside the input file. + with open(output_filename, "w") as file: + file.write(json_elements) + + except Exception as e: + print(e) + + async def process_files(): + filenames = [ + "PATH_TO_INPUT_FILE_1", + "PATH_TO_INPUT_FILE_2", + "PATH_TO_INPUT_FILE_3", + ] + + tasks = [] + + for filename in filenames: + tasks.append(call_api(filename)) + + await asyncio.gather(*tasks) + + asyncio.run(process_files()) ``` + ## Page splitting In order to speed up processing of large PDF files, the `split_pdf_page`[*](#parameter-names) parameter is `True` by default. This @@ -230,3 +292,65 @@ the names used in the SDKs are the same across all methods. 
* Refer to the [API parameters](/api-reference/api-services/api-parameters) page for the full list of available parameters. * Refer to the [Examples](/api-reference/api-services/examples) page for some inspiration on using the parameters. +## Migration guide + +There are minor breaking changes in 0.26.0. If you encounter any errors when upgrading, please find the solution below. + +**If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'`** + +Previously, the SDK accepted a `PartitionParameters` object as input to the `sdk.general.partition` function. Now, this object must be wrapped in a `PartitionRequest` object. The old behavior was deprecated in 0.23.0 and removed in 0.26.0. + +```python +# Instead of: +from unstructured_client.models import shared + +req = shared.PartitionParameters( + files=files, +) + +resp = s.general.partition(request=req) + + +# Switch to: +from unstructured_client.models import shared, operations + +req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=files, + ) +) + +resp = s.general.partition(request=req) +``` + +**If you see the error: `TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given`** + +In 0.26.0, the `PartitionRequest` constructor no longer allows for positional arguments. You must specify `partition_parameters` by name. + +```python +# Instead of: +req = operations.PartitionRequest( + shared.PartitionParameters( + files=files, + ) +) + +# Switch to: +req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=files, + ) +) +``` + +**If you see the error: `TypeError: General.partition() takes 1 positional argument but 2 were given`** + +In 0.26.0, the `partition` function no longer allows for positional arguments. You must specify `request` by name. 
+ +```python +# Instead of: +resp = s.general.partition(req) + +# Switch to: +resp = s.general.partition(request=req) +``` diff --git a/snippets/general-shared-text/use-ingest-instead.mdx b/snippets/general-shared-text/use-ingest-instead.mdx index 6e020d7f..f32e914e 100644 --- a/snippets/general-shared-text/use-ingest-instead.mdx +++ b/snippets/general-shared-text/use-ingest-instead.mdx @@ -4,6 +4,6 @@ to you: - You need to work with documents in cloud storage. - - You want faster processing of larger individual files. - - You want to process multiple files in batches. - \ No newline at end of file + - You want to cache the results of processing multiple files in batches. + - You want more precise control over document-processing pipeline stages such as partitioning, chunking, filtering, staging, and embedding. +