diff --git a/api-reference/api-services/examples.mdx b/api-reference/api-services/examples.mdx
index 029dbf2a..18a39d7e 100644
--- a/api-reference/api-services/examples.mdx
+++ b/api-reference/api-services/examples.mdx
@@ -99,7 +99,7 @@ The `hi_res` strategy supports different models, and the default is `layout_v1.1
import os
from unstructured_client import UnstructuredClient
- from unstructured_client.models import shared
+ from unstructured_client.models import operations, shared
from unstructured_client.models.errors import SDKError
client = UnstructuredClient(
@@ -109,21 +109,23 @@ The `hi_res` strategy supports different models, and the default is `layout_v1.1
filename = "sample-docs/layout-parser-paper.pdf"
file = open(filename, "rb")
- req = shared.PartitionParameters(
- # Note that this currently only supports a single file
- files=shared.Files(
- content=file.read(),
- file_name=filename,
- ),
- strategy=shared.Strategy.HI_RES,
- hi_res_model_name="layout_v1.1.0",
- split_pdf_page=True,
- split_pdf_allow_failed=True,
- split_pdf_concurrency_level=15
+ req = operations.PartitionRequest(
+ partition_parameters=shared.PartitionParameters(
+ # Note that this currently only supports a single file.
+ files=shared.Files(
+ content=file.read(),
+ file_name=filename,
+ ),
+ strategy=shared.Strategy.HI_RES,
+ hi_res_model_name="layout_v1.1.0",
+ split_pdf_page=True,
+ split_pdf_allow_failed=True,
+ split_pdf_concurrency_level=15
+ )
)
try:
- res = client.general.partition(req)
+ res = client.general.partition(request=req)
print(res.elements[0])
except SDKError as e:
print(e)
@@ -248,7 +250,7 @@ For better OCR results, you can specify what languages your document is in using
import os
from unstructured_client import UnstructuredClient
- from unstructured_client.models import shared
+ from unstructured_client.models import operations, shared
from unstructured_client.models.errors import SDKError
client = UnstructuredClient(
@@ -258,21 +260,23 @@ For better OCR results, you can specify what languages your document is in using
filename = "sample-docs/korean.png"
file = open(filename, "rb")
- req = shared.PartitionParameters(
- # Note that this currently only supports a single file
- files=shared.Files(
- content=file.read(),
- file_name=filename,
- ),
- strategy=shared.Strategy.OCR_ONLY,
- languages=["kor"],
- split_pdf_page=True,
- split_pdf_allow_failed=True,
- split_pdf_concurrency_level=15
- )
+ req = operations.PartitionRequest(
+ partition_parameters=shared.PartitionParameters(
+ # Note that this currently only supports a single file.
+ files=shared.Files(
+ content=file.read(),
+ file_name=filename,
+ ),
+ strategy=shared.Strategy.OCR_ONLY,
+ languages=["kor"],
+ split_pdf_page=True,
+ split_pdf_allow_failed=True,
+ split_pdf_concurrency_level=15
+ )
+ )
try:
- res = client.general.partition(req)
+ res = client.general.partition(request=req)
print(res.elements[0])
except SDKError as e:
print(e)
@@ -394,7 +398,7 @@ Set the `coordinates` parameter to `true` to add this field to the elements in t
import os
from unstructured_client import UnstructuredClient
- from unstructured_client.models import shared
+ from unstructured_client.models import operations, shared
from unstructured_client.models.errors import SDKError
client = UnstructuredClient(
@@ -404,21 +408,23 @@ Set the `coordinates` parameter to `true` to add this field to the elements in t
filename = "sample-docs/layout-parser-paper.pdf"
file = open(filename, "rb")
- req = shared.PartitionParameters(
- # Note that this currently only supports a single file
- files=shared.Files(
- content=file.read(),
- file_name=filename,
- ),
- strategy=shared.Strategy.HI_RES,
- coordinates=True,
- split_pdf_page=True,
- split_pdf_allow_failed=True,
- split_pdf_concurrency_level=15
+ req = operations.PartitionRequest(
+ partition_parameters=shared.PartitionParameters(
+ # Note that this currently only supports a single file.
+ files=shared.Files(
+ content=file.read(),
+ file_name=filename,
+ ),
+ strategy=shared.Strategy.HI_RES,
+ coordinates=True,
+ split_pdf_page=True,
+ split_pdf_allow_failed=True,
+ split_pdf_concurrency_level=15
+ )
)
try:
- res = client.general.partition(req)
+ res = client.general.partition(request=req)
print(res.elements[0])
except SDKError as e:
print(e)
@@ -543,7 +549,7 @@ This can be helpful if you'd like to use the IDs as a primary key in a database,
import os
from unstructured_client import UnstructuredClient
- from unstructured_client.models import shared
+ from unstructured_client.models import operations, shared
from unstructured_client.models.errors import SDKError
client = UnstructuredClient(
@@ -553,21 +559,23 @@ This can be helpful if you'd like to use the IDs as a primary key in a database,
filename = "sample-docs/layout-parser-paper-fast.pdf"
file = open(filename, "rb")
- req = shared.PartitionParameters(
- # Note that this currently only supports a single file
- files=shared.Files(
- content=file.read(),
- file_name=filename,
- ),
- unique_element_ids=True,
- strategy=shared.Strategy.HI_RES,
- split_pdf_page=True,
- split_pdf_allow_failed=True,
- split_pdf_concurrency_level=15
+ req = operations.PartitionRequest(
+ partition_parameters=shared.PartitionParameters(
+ # Note that this currently only supports a single file.
+ files=shared.Files(
+ content=file.read(),
+ file_name=filename,
+ ),
+ unique_element_ids=True,
+ strategy=shared.Strategy.HI_RES,
+ split_pdf_page=True,
+ split_pdf_allow_failed=True,
+ split_pdf_concurrency_level=15
+ )
)
try:
- res = client.general.partition(req)
+ res = client.general.partition(request=req)
print(res.elements[0])
except SDKError as e:
print(e)
@@ -698,7 +706,7 @@ By default, the `chunking_strategy` is set to `None`, and no chunking is perform
import os
from unstructured_client import UnstructuredClient
- from unstructured_client.models import shared
+ from unstructured_client.models import operations, shared
from unstructured_client.models.errors import SDKError
client = UnstructuredClient(
@@ -708,22 +716,24 @@ By default, the `chunking_strategy` is set to `None`, and no chunking is perform
filename = "sample-docs/layout-parser-paper-fast.pdf"
file = open(filename, "rb")
- req = shared.PartitionParameters(
- # Note that this currently only supports a single file
- files=shared.Files(
- content=file.read(),
- file_name=filename,
- ),
- chunking_strategy="by_title",
- max_characters=1024,
- strategy=shared.Strategy.HI_RES,
- split_pdf_page=True,
- split_pdf_allow_failed=True,
- split_pdf_concurrency_level=15
+ req = operations.PartitionRequest(
+ partition_parameters=shared.PartitionParameters(
+ # Note that this currently only supports a single file.
+ files=shared.Files(
+ content=file.read(),
+ file_name=filename,
+ ),
+ chunking_strategy="by_title",
+ max_characters=1024,
+ strategy=shared.Strategy.HI_RES,
+ split_pdf_page=True,
+ split_pdf_allow_failed=True,
+ split_pdf_concurrency_level=15
+ )
)
try:
- res = client.general.partition(req)
+ res = client.general.partition(request=req)
print(res.elements[0])
except SDKError as e:
print(e)
diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx
index c70cf183..a00c1c13 100644
--- a/api-reference/api-services/sdk-python.mdx
+++ b/api-reference/api-services/sdk-python.mdx
@@ -31,21 +31,56 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv
The SDK uses semantic versioning and major bumps could bring breaking changes. It is advised to
- pin your installed version.
+ pin your installed version. See the [migration guide](#migration-guide), later on this page, for breaking change announcements.
## Basics
- Let's start with a simple example in which you send a PDF document to be partitioned with the Free Unstructured API:
-
-
- Python SDK Deprecation Warning (>v0.22.0): The legacy method of passing `shared.PartitionParameters`
- directly to `client.general.partition()` is currently supported but may be deprecated and
- could break in future releases. Users should migrate to the new `shared.PartitionRequest` object
- to ensure compatibility with future updates.
-
+ Let's start with a simple example in which you send a PDF document to be partitioned with the Unstructured API. Note that the Python SDK gives you a choice of two interfaces for building the request: a TypedDict (plain dictionary) or a Pydantic model. The two are functionally identical, but the Pydantic model syntax may, for instance, provide completion hints in your IDE.
- ```python Python
+ ```python Python (TypedDict example)
+ import os, json
+
+ import unstructured_client
+ from unstructured_client.models import shared
+
+ client = unstructured_client.UnstructuredClient(
+ api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
+ server_url=os.getenv("UNSTRUCTURED_API_URL"),
+ )
+
+ filename = "PATH_TO_INPUT_FILE"
+
+ req = {
+ "partition_parameters": {
+ "files": {
+ "content": open(filename, "rb"),
+ "file_name": filename,
+ },
+ "strategy": shared.Strategy.HI_RES,
+ "languages": ['eng'],
+ "split_pdf_page": True, # If True, splits the PDF file into smaller chunks of pages.
+ "split_pdf_allow_failed": True, # If True, the partitioning continues even if some pages fail.
+ "split_pdf_concurrency_level": 15 # Set the number of concurrent requests to the maximum value: 15.
+ }
+ }
+
+ try:
+ res = client.general.partition(request=req)
+ element_dicts = [element for element in res.elements]
+
+ # Print the processed data's first element only.
+ print(element_dicts[0])
+
+ # Write the processed data to a local file.
+ json_elements = json.dumps(element_dicts, indent=2)
+
+ with open("PATH_TO_OUTPUT_FILE", "w") as file:
+ file.write(json_elements)
+ except Exception as e:
+ print(e)
+ ```
+ ```python Python (Pydantic model example)
import os, json
import unstructured_client
@@ -57,13 +92,11 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv
)
filename = "PATH_TO_INPUT_FILE"
- with open(filename, "rb") as f:
- data = f.read()
req = operations.PartitionRequest(
partition_parameters=shared.PartitionParameters(
files=shared.Files(
- content=data,
+ content=open(filename, "rb"),
file_name=filename,
),
strategy=shared.Strategy.HI_RES,
@@ -89,40 +122,69 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv
except Exception as e:
print(e)
```
- ```python Python (SDK <=v0.22.0)
- import os
+
+
+## Async partitioning
+
+ The Python SDK also has a `partition_async` method. This call is equivalent to `partition`, except that it can be used in a non-blocking context. For instance, `asyncio.gather` can be used to process multiple files concurrently, as demonstrated here:
+
+
+ ```python
+ import asyncio
+ import os, json
import unstructured_client
from unstructured_client.models import shared
- from unstructured_client.models.errors import SDKError
client = unstructured_client.UnstructuredClient(
api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"),
server_url=os.getenv("UNSTRUCTURED_API_URL"),
)
- filename = "PATH_TO_FILE"
- with open(filename, "rb") as f:
- data = f.read()
+ async def call_api(filename):
+ req = {
+ "partition_parameters": {
+ "files": {
+ "content": open(filename, "rb"),
+ "file_name": filename,
+ },
+ "strategy": shared.Strategy.HI_RES,
+ }
+ }
- req = shared.PartitionParameters(
- files=shared.Files(
- content=data,
- file_name=filename,
- ),
- # Other partition parameters
- strategy="hi_res",
- languages=['eng'],
- )
+ try:
+ res = await client.general.partition_async(request=req)
- try:
- res = client.general.partition(request=req)
- print(res.elements[0])
- except SDKError as e:
- print(e)
+ element_dicts = [element for element in res.elements]
+
+ json_elements = json.dumps(element_dicts, indent=2)
+
+ output_filename = filename + ".json" # Save the JSON response alongside the input file.
+ with open(output_filename, "w") as file:
+ file.write(json_elements)
+
+ except Exception as e:
+ print(e)
+
+ async def process_files():
+ filenames = [
+ "PATH_TO_INPUT_FILE_1",
+ "PATH_TO_INPUT_FILE_2",
+ "PATH_TO_INPUT_FILE_3",
+ ]
+
+ tasks = []
+
+ for filename in filenames:
+ tasks.append(call_api(filename))
+
+ await asyncio.gather(*tasks)
+
+ asyncio.run(process_files())
```
+
## Page splitting
In order to speed up processing of large PDF files, the `split_pdf_page`[*](#parameter-names) parameter is `True` by default. This
@@ -230,3 +292,65 @@ the names used in the SDKs are the same across all methods.
* Refer to the [API parameters](/api-reference/api-services/api-parameters) page for the full list of available parameters.
* Refer to the [Examples](/api-reference/api-services/examples) page for some inspiration on using the parameters.
+## Migration guide
+
+There are minor breaking changes in 0.26.0. If you encounter any errors when upgrading, please find the solution below.
+
+**If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'`**
+
+Previously, the SDK accepted a `PartitionParameters` object as input to the `sdk.general.partition` function. Now, this object must be wrapped in a `PartitionRequest` object. The old behavior was deprecated in 0.23.0 and removed in 0.26.0.
+
+```python
+# Instead of:
+from unstructured_client.models import shared
+
+req = shared.PartitionParameters(
+ files=files,
+)
+
+resp = s.general.partition(request=req)
+
+
+# Switch to:
+from unstructured_client.models import shared, operations
+
+req = operations.PartitionRequest(
+ partition_parameters=shared.PartitionParameters(
+ files=files,
+ )
+)
+
+resp = s.general.partition(request=req)
+```
+
+**If you see the error: `TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given`**
+
+In 0.26.0, the `PartitionRequest` constructor no longer allows for positional arguments. You must specify `partition_parameters` by name.
+
+```python
+# Instead of:
+req = operations.PartitionRequest(
+ shared.PartitionParameters(
+ files=files,
+ )
+)
+
+# Switch to:
+req = operations.PartitionRequest(
+ partition_parameters=shared.PartitionParameters(
+ files=files,
+ )
+)
+```
+
+**If you see the error: `TypeError: General.partition() takes 1 positional argument but 2 were given`**
+
+In 0.26.0, the `partition` function no longer allows for positional arguments. You must specify `request` by name.
+
+```python
+# Instead of:
+resp = s.general.partition(req)
+
+# Switch to:
+resp = s.general.partition(request=req)
+```
diff --git a/snippets/general-shared-text/use-ingest-instead.mdx b/snippets/general-shared-text/use-ingest-instead.mdx
index 6e020d7f..f32e914e 100644
--- a/snippets/general-shared-text/use-ingest-instead.mdx
+++ b/snippets/general-shared-text/use-ingest-instead.mdx
@@ -4,6 +4,6 @@
to you:
- You need to work with documents in cloud storage.
- - You want faster processing of larger individual files.
- - You want to process multiple files in batches.
-
\ No newline at end of file
+ - You want to cache the results of processing multiple files in batches.
+ - You want more precise control over document-processing pipeline stages such as partitioning, chunking, filtering, staging, and embedding.
+