From d3961240b137a04d63f68e25d62fee6440a4e660 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 23 Sep 2024 12:04:16 -0400 Subject: [PATCH 01/22] Remove old deprecation warning --- api-reference/api-services/sdk-python.mdx | 39 ----------------------- 1 file changed, 39 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index c70cf183..28c1bbc0 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -37,13 +37,6 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv ## Basics Let's start with a simple example in which you send a PDF document to be partitioned with the Free Unstructured API: - - Python SDK Deprecation Warning (>v0.22.0): The legacy method of passing `shared.PartitionParameters` - directly to `client.general.partition()` is currently supported but may be deprecated and - could break in future releases. Users should migrate to the new `shared.PartitionRequest` object - to ensure compatibility with future updates. 
- - ```python Python import os, json @@ -89,38 +82,6 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv except Exception as e: print(e) ``` - ```python Python (SDK <=v0.22.0) - import os - - import unstructured_client - from unstructured_client.models import shared - from unstructured_client.models.errors import SDKError - - client = unstructured_client.UnstructuredClient( - api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), - server_url=os.getenv("UNSTRUCTURED_API_URL"), - ) - - filename = "PATH_TO_FILE" - with open(filename, "rb") as f: - data = f.read() - - req = shared.PartitionParameters( - files=shared.Files( - content=data, - file_name=filename, - ), - # Other partition parameters - strategy="hi_res", - languages=['eng'], - ) - - try: - res = client.general.partition(request=req) - print(res.elements[0]) - except SDKError as e: - print(e) - ``` ## Page splitting From b2e84c923477cfb5265496e141d3f7fd31614cf4 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 23 Sep 2024 17:50:33 -0400 Subject: [PATCH 02/22] Add an example snippet for the new TypedDict interface --- api-reference/api-services/sdk-python.mdx | 50 +++++++++++++++++++---- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 28c1bbc0..7efee985 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -35,10 +35,50 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv ## Basics - Let's start with a simple example in which you send a PDF document to be partitioned with the Free Unstructured API: + Let's start with a simple example in which you send a PDF document to be partitioned with the Unstructured API. Note that the Python SDK gives you a choice of two interfaces. The functionality is identical, but the Pydantic model syntax may provide completion hints in your IDE, for instance. 
- ```python Python + ```python Python (TypedDict example) + import os, json + + import unstructured_client + from unstructured_client.models import shared + + client = unstructured_client.UnstructuredClient( + api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), + server_url=os.getenv("UNSTRUCTURED_API_URL"), + ) + + filename = "PATH_TO_INPUT_FILE" + + req = { + "partition_parameters": { + "files": { + "content": open(filename, "rb"), + "file_name": filename, + }, + "strategy": shared.Strategy.HI_RES, + "languages": ['eng'], + "split_pdf_page": True, # If True, splits the PDF file into smaller chunks of pages. + } + } + + try: + res = client.general.partition(request=req) + element_dicts = [element for element in res.elements] + + # Print the processed data's first element only. + print(element_dicts[0]) + + # Write the processed data to a local file. + json_elements = json.dumps(element_dicts, indent=2) + + with open("PATH_TO_OUTPUT_FILE", "w") as file: + file.write(json_elements) + except Exception as e: + print(e) + ``` + ```python Python (Pydantic model example) import os, json import unstructured_client @@ -50,20 +90,16 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv ) filename = "PATH_TO_INPUT_FILE" - with open(filename, "rb") as f: - data = f.read() req = operations.PartitionRequest( partition_parameters=shared.PartitionParameters( files=shared.Files( - content=data, + content=open(filename, "rb"), file_name=filename, ), strategy=shared.Strategy.HI_RES, languages=['eng'], split_pdf_page=True, # If True, splits the PDF file into smaller chunks of pages. - split_pdf_allow_failed=True, # If True, the partitioning continues even if some pages fail. - split_pdf_concurrency_level=15 # Set the number of concurrent request to the maximum value: 15. 
), ) From a5a4fd159e0b41d7cad9d121cd4ebf36dcf891aa Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 23 Sep 2024 17:51:03 -0400 Subject: [PATCH 03/22] Add a section about `partition_async` --- api-reference/api-services/sdk-python.mdx | 63 +++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 7efee985..b112be74 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -120,6 +120,69 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv ``` +## Async Partitioning + + The Python SDK also has a `partition_async`. This call is equivalent to `partition` except that it can be used in a non blocking context. For instance, `asyncio.gather` can be used to concurrently process multiple files at once, as in this example: + + + ```python + import asyncio + import os, json + + import unstructured_client + from unstructured_client.models import shared + + client = unstructured_client.UnstructuredClient( + api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), + server_url=os.getenv("UNSTRUCTURED_API_URL"), + ) + + + async def call_api(filename): + req = { + "partition_parameters": { + "files": { + "content": open(filename, "rb"), + "file_name": filename, + }, + "strategy": shared.Strategy.HI_RES, + } + } + + + try: + res = await client.general.partition_async(request=req) + + element_dicts = [element for element in res.elements] + + # Write the processed data to a local file. 
+ json_elements = json.dumps(element_dicts, indent=2) + + # Each input will need its own output file + with open("PATH_TO_OUTPUT_FILE", "w") as file: + file.write(json_elements) + + except Exception as e: + print(e) + + + async def process_files(): + filenames = [ + "PATH_TO_INPUT_FILE_1", + "PATH_TO_INPUT_FILE_2", + "PATH_TO_INPUT_FILE_3", + ] + + tasks = [] + + for filename in filenames: + tasks.append(call_api(filename)) + + await asyncio.gather(*tasks) + + asyncio.run(process_files()) + ``` + ## Page splitting In order to speed up processing of large PDF files, the `split_pdf_page`[*](#parameter-names) parameter is `True` by default. This From ae1171153a722f2a10d34acc43c74761506b694c Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 23 Sep 2024 17:56:34 -0400 Subject: [PATCH 04/22] Remove some whitespace --- api-reference/api-services/sdk-python.mdx | 3 --- 1 file changed, 3 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index b112be74..75ce2027 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -137,7 +137,6 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv server_url=os.getenv("UNSTRUCTURED_API_URL"), ) - async def call_api(filename): req = { "partition_parameters": { @@ -149,7 +148,6 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv } } - try: res = await client.general.partition_async(request=req) @@ -165,7 +163,6 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv except Exception as e: print(e) - async def process_files(): filenames = [ "PATH_TO_INPUT_FILE_1", From 4dfaa1760e0d13cf04bf271b1337f43199124400 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 30 Sep 2024 14:40:44 -0400 Subject: [PATCH 05/22] Update output filename in async example --- api-reference/api-services/sdk-python.mdx | 5 ++--- 1 file
changed, 2 insertions(+), 3 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 75ce2027..1c30ab5f 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -153,11 +153,10 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv element_dicts = [element for element in res.elements] - # Write the processed data to a local file. json_elements = json.dumps(element_dicts, indent=2) - # Each input will need its own output file - with open("PATH_TO_OUTPUT_FILE", "w") as file: + output_filename = filename + ".json" # Save off the json response alongside the input file + with open(output_filename, "w") as file: file.write(json_elements) except Exception as e: From 5bb108e34adff956be706ccf0e8cc0b499411079 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 30 Sep 2024 15:04:13 -0400 Subject: [PATCH 06/22] Add breaking change notes --- api-reference/api-services/sdk-python.mdx | 72 +++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 1c30ab5f..c9c627f7 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -179,6 +179,78 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv asyncio.run(process_files()) ``` + + +### Notes on breaking changes +When upgrading to a newer version of the SDK, please be aware of these breaking changes: + +#### Updating to 0.26.0 + +If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'` + +Older versions of the SDK accepted a `PartitionParameters` object as input to the `sdk.general.partition` function. Now, this object must be wrapped in a `PartitionRequest` object. 
+ +``` +# Instead of: +from unstructured_client.models import shared + +req = shared.PartitionParameters( + files=files, + strategy="fast", +) + +resp = s.general.partition(request=req) + + +# Switch to: +from unstructured_client.models import shared, operations + +req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=files, + strategy="fast", + ) +) + +resp = s.general.partition(request=req) +``` + +If you see the error: `TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given` + +The `PartitionRequest` constructor no longer allows for positional arguments. You must specify `partition_parameters` by name. + +``` +# Instead of: +req = operations.PartitionRequest( + shared.PartitionParameters( + files=files, + strategy="fast", + output_format="text/csv", + ) +) + +# Switch to: +req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=files, + strategy="fast", + output_format="text/csv", + ) +) +``` + +If you see the error: `TypeError: General.partition() takes 1 positional argument but 2 were given` + +Likewise, the `partition` function no longer allows for positional arguments. You must specify `request` by name. + +``` +# Instead of: +resp = s.general.partition(req) + +# Switch to: +resp = s.general.partition(request=req) +``` + ## Page splitting In order to speed up processing of large PDF files, the `split_pdf_page`[*](#parameter-names) parameter is `True` by default. 
This From 1657230a911332b3d63a90a89466f377e150ddcf Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 30 Sep 2024 15:09:43 -0400 Subject: [PATCH 07/22] Try moving section --- api-reference/api-services/sdk-python.mdx | 135 +++++++++++----------- 1 file changed, 65 insertions(+), 70 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index c9c627f7..284b6d12 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -34,6 +34,71 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv pin your installed version. +### Notes on breaking changes in 0.26.0 + +- If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'` + +Older versions of the SDK accepted a PartitionParameters object as input to the sdk.general.partition function. Now, this object must be wrapped in a PartitionRequest object. + +```python +# Instead of: +from unstructured_client.models import shared + +req = shared.PartitionParameters( + files=files, +) + +resp = s.general.partition(request=req) + + +# Switch to: +from unstructured_client.models import shared, operations + +req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=files, + ) +) + +resp = s.general.partition(request=req) +``` + +- If you see the error: `TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given` + +The PartitionRequest constructor no longer allows for positional arguments. You must specify partition_parameters by name. 
+ +```python +# Instead of: +req = operations.PartitionRequest( + shared.PartitionParameters( + files=files, + strategy="fast", + output_format="text/csv", + ) +) + +# Switch to: +req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=files, + strategy="fast", + output_format="text/csv", + ) +) +``` + +- If you see the error: `TypeError: General.partition() takes 1 positional argument but 2 were given` + +Likewise, the partition function no longer allows for positional arguments. You must specify request by name. + +```python +# Instead of: +resp = s.general.partition(req) + +# Switch to: +resp = s.general.partition(request=req) +``` + ## Basics Let's start with a simple example in which you send a PDF document to be partitioned with the Unstructured API. Note that the Python SDK gives you a choice of two interfaces. The functionality is identical, but the Pydantic model syntax may provide completion hints in your IDE, for instance. @@ -181,76 +246,6 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv -### Notes on breaking changes -When upgrading to a newer version of the SDK, please be aware of these breaking changes: - -#### Updating to 0.26.0 - -If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'` - -Older versions of the SDK accepted a `PartitionParameters` object as input to the `sdk.general.partition` function. Now, this object must be wrapped in a `PartitionRequest` object. 
- -``` -# Instead of: -from unstructured_client.models import shared - -req = shared.PartitionParameters( - files=files, - strategy="fast", -) - -resp = s.general.partition(request=req) - - -# Switch to: -from unstructured_client.models import shared, operations - -req = operations.PartitionRequest( - partition_parameters=shared.PartitionParameters( - files=files, - strategy="fast", - ) -) - -resp = s.general.partition(request=req) -``` - -If you see the error: `TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given` - -The `PartitionRequest` constructor no longer allows for positional arguments. You must specify `partition_parameters` by name. - -``` -# Instead of: -req = operations.PartitionRequest( - shared.PartitionParameters( - files=files, - strategy="fast", - output_format="text/csv", - ) -) - -# Switch to: -req = operations.PartitionRequest( - partition_parameters=shared.PartitionParameters( - files=files, - strategy="fast", - output_format="text/csv", - ) -) -``` - -If you see the error: `TypeError: General.partition() takes 1 positional argument but 2 were given` - -Likewise, the `partition` function no longer allows for positional arguments. You must specify `request` by name. - -``` -# Instead of: -resp = s.general.partition(req) - -# Switch to: -resp = s.general.partition(request=req) -``` - ## Page splitting In order to speed up processing of large PDF files, the `split_pdf_page`[*](#parameter-names) parameter is `True` by default. 
This From 719c15bfc0cd73115a68c900af6914875ac54826 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 30 Sep 2024 15:10:02 -0400 Subject: [PATCH 08/22] Minor updates --- api-reference/api-services/sdk-python.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 284b6d12..1db9c09b 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -124,7 +124,7 @@ resp = s.general.partition(request=req) }, "strategy": shared.Strategy.HI_RES, "languages": ['eng'], - "split_pdf_page": True, # If True, splits the PDF file into smaller chunks of pages. + "split_pdf_page": True, } } @@ -164,7 +164,7 @@ resp = s.general.partition(request=req) ), strategy=shared.Strategy.HI_RES, languages=['eng'], - split_pdf_page=True, # If True, splits the PDF file into smaller chunks of pages. + split_pdf_page=True, ), ) @@ -187,7 +187,7 @@ resp = s.general.partition(request=req) ## Async Partitioning - The Python SDK also has a `partition_async`. This call is equivalent to `partition` except that it can be used in a non blocking context. For instance, `asyncio.gather` can be used to concurrently process multiple files at once, as in this example: + The Python SDK also has a `partition_async` method. This call is equivalent to `partition` except that it can be used in a non-blocking context.
For instance, `asyncio.gather` can be used to concurrently process multiple files at once, as demonstrated here: ```python From 61cf08a253c5ffaab93777e221ae0473881a1254 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 30 Sep 2024 15:11:49 -0400 Subject: [PATCH 09/22] Remove unneeded example params --- api-reference/api-services/sdk-python.mdx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 1db9c09b..9fed43a4 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -72,8 +72,6 @@ The PartitionRequest constructor no longer allows for positional arguments. You req = operations.PartitionRequest( shared.PartitionParameters( files=files, - strategy="fast", - output_format="text/csv", ) ) @@ -81,8 +79,6 @@ req = operations.PartitionRequest( req = operations.PartitionRequest( partition_parameters=shared.PartitionParameters( files=files, - strategy="fast", - output_format="text/csv", ) ) ``` From 614da214224900fba52cfcfca032d3c7d1cf9ac1 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 1 Oct 2024 17:17:15 -0400 Subject: [PATCH 10/22] Move breaking change notes --- api-reference/api-services/sdk-python.mdx | 125 +++++++++++----------- 1 file changed, 63 insertions(+), 62 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 9fed43a4..c79978f0 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -31,70 +31,9 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv The SDK uses semantic versioning and major bumps could bring breaking changes. It is advised to - pin your installed version. + pin your installed version. See the [migration guide](#Migration guide) below for breaking change announcements. 
-### Notes on breaking changes in 0.26.0 - -- If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'` - -Older versions of the SDK accepted a PartitionParameters object as input to the sdk.general.partition function. Now, this object must be wrapped in a PartitionRequest object. - -```python -# Instead of: -from unstructured_client.models import shared - -req = shared.PartitionParameters( - files=files, -) - -resp = s.general.partition(request=req) - - -# Switch to: -from unstructured_client.models import shared, operations - -req = operations.PartitionRequest( - partition_parameters=shared.PartitionParameters( - files=files, - ) -) - -resp = s.general.partition(request=req) -``` - -- If you see the error: `TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given` - -The PartitionRequest constructor no longer allows for positional arguments. You must specify partition_parameters by name. - -```python -# Instead of: -req = operations.PartitionRequest( - shared.PartitionParameters( - files=files, - ) -) - -# Switch to: -req = operations.PartitionRequest( - partition_parameters=shared.PartitionParameters( - files=files, - ) -) -``` - -- If you see the error: `TypeError: General.partition() takes 1 positional argument but 2 were given` - -Likewise, the partition function no longer allows for positional arguments. You must specify request by name. - -```python -# Instead of: -resp = s.general.partition(req) - -# Switch to: -resp = s.general.partition(request=req) -``` - ## Basics Let's start with a simple example in which you send a PDF document to be partitioned with the Unstructured API. Note that the Python SDK gives you a choice of two interfaces. The functionality is identical, but the Pydantic model syntax may provide completion hints in your IDE, for instance. @@ -349,3 +288,65 @@ the names used in the SDKs are the same across all methods. 
* Refer to the [API parameters](/api-reference/api-services/api-parameters) page for the full list of available parameters. * Refer to the [Examples](/api-reference/api-services/examples) page for some inspiration on using the parameters. +### Migration guide + +#### Upgrading to 0.26.0 + +- If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'` + +Previously, the SDK has accepted a PartitionParameters object as input to the sdk.general.partition function. Now, this object must be wrapped in a PartitionRequest object. The old behavior was deprecated in 0.23.0 and removed in 0.26.0. + +```python +# Instead of: +from unstructured_client.models import shared + +req = shared.PartitionParameters( + files=files, +) + +resp = s.general.partition(request=req) + + +# Switch to: +from unstructured_client.models import shared, operations + +req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=files, + ) +) + +resp = s.general.partition(request=req) +``` + +- If you see the error: `TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given` + +In 0.26.0 the PartitionRequest constructor no longer allows for positional arguments. You must specify partition_parameters by name. + +```python +# Instead of: +req = operations.PartitionRequest( + shared.PartitionParameters( + files=files, + ) +) + +# Switch to: +req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + files=files, + ) +) +``` + +- If you see the error: `TypeError: General.partition() takes 1 positional argument but 2 were given` + +Likewise, in 0.26.0 the partition function no longer allows for positional arguments. You must specify request by name. 
+ +```python +# Instead of: +resp = s.general.partition(req) + +# Switch to: +resp = s.general.partition(request=req) +``` From 8d15254192e7b5555dce626eb21defca9e8e9d5c Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 1 Oct 2024 17:22:11 -0400 Subject: [PATCH 11/22] Fix anchor link --- api-reference/api-services/sdk-python.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index c79978f0..b45e7cf1 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -31,7 +31,7 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv The SDK uses semantic versioning and major bumps could bring breaking changes. It is advised to - pin your installed version. See the [migration guide](#Migration guide) below for breaking change announcements. + pin your installed version. See the [migration guide](#migration-guide) below for breaking change announcements. ## Basics @@ -288,9 +288,9 @@ the names used in the SDKs are the same across all methods. * Refer to the [API parameters](/api-reference/api-services/api-parameters) page for the full list of available parameters. * Refer to the [Examples](/api-reference/api-services/examples) page for some inspiration on using the parameters. 
-### Migration guide +## Migration guide -#### Upgrading to 0.26.0 +### Upgrading to 0.26.0 - If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'` From 74f538d4ca3a211b99c358dcecdcc2e1e4630d70 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 1 Oct 2024 17:25:30 -0400 Subject: [PATCH 12/22] Fix code indentation --- api-reference/api-services/sdk-python.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index b45e7cf1..ffeafaac 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -301,7 +301,7 @@ Previously, the SDK has accepted a PartitionParameters object as input to the sd from unstructured_client.models import shared req = shared.PartitionParameters( - files=files, + files=files, ) resp = s.general.partition(request=req) From 3c271fa0fab52cce7c0d525d5b62a62c4d26a87c Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 1 Oct 2024 17:38:11 -0400 Subject: [PATCH 13/22] Clean up old SDK usage --- api-reference/api-services/examples.mdx | 144 +++++++++++++----------- 1 file changed, 77 insertions(+), 67 deletions(-) diff --git a/api-reference/api-services/examples.mdx b/api-reference/api-services/examples.mdx index 029dbf2a..9809d1d2 100644 --- a/api-reference/api-services/examples.mdx +++ b/api-reference/api-services/examples.mdx @@ -99,7 +99,7 @@ The `hi_res` strategy supports different models, and the default is `layout_v1.1 import os from unstructured_client import UnstructuredClient - from unstructured_client.models import shared + from unstructured_client.models import operations, shared from unstructured_client.models.errors import SDKError client = UnstructuredClient( @@ -109,21 +109,23 @@ The `hi_res` strategy supports different models, and the default is `layout_v1.1 filename = "sample-docs/layout-parser-paper.pdf" file = open(filename, 
"rb") - req = shared.PartitionParameters( - # Note that this currently only supports a single file - files=shared.Files( - content=file.read(), - file_name=filename, - ), - strategy=shared.Strategy.HI_RES, - hi_res_model_name="layout_v1.1.0", - split_pdf_page=True, - split_pdf_allow_failed=True, - split_pdf_concurrency_level=15 + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + # Note that this currently only supports a single file + files=shared.Files( + content=file.read(), + file_name=filename, + ), + strategy=shared.Strategy.HI_RES, + hi_res_model_name="layout_v1.1.0", + split_pdf_page=True, + split_pdf_allow_failed=True, + split_pdf_concurrency_level=15 + ) ) try: - res = client.general.partition(req) + res = client.general.partition(request=req) print(res.elements[0]) except SDKError as e: print(e) @@ -248,7 +250,7 @@ For better OCR results, you can specify what languages your document is in using import os from unstructured_client import UnstructuredClient - from unstructured_client.models import shared + from unstructured_client.models import operations, shared from unstructured_client.models.errors import SDKError client = UnstructuredClient( @@ -258,21 +260,23 @@ For better OCR results, you can specify what languages your document is in using filename = "sample-docs/korean.png" file = open(filename, "rb") - req = shared.PartitionParameters( - # Note that this currently only supports a single file - files=shared.Files( - content=file.read(), - file_name=filename, - ), - strategy=shared.Strategy.OCR_ONLY, - languages=["kor"], - split_pdf_page=True, - split_pdf_allow_failed=True, - split_pdf_concurrency_level=15 - ) + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + # Note that this currently only supports a single file + files=shared.Files( + content=file.read(), + file_name=filename, + ), + strategy=shared.Strategy.OCR_ONLY, + languages=["kor"], + split_pdf_page=True, + 
split_pdf_allow_failed=True, + split_pdf_concurrency_level=15 + ) + ) try: - res = client.general.partition(req) + res = client.general.partition(request=req) print(res.elements[0]) except SDKError as e: print(e) @@ -394,7 +398,7 @@ Set the `coordinates` parameter to `true` to add this field to the elements in t import os from unstructured_client import UnstructuredClient - from unstructured_client.models import shared + from unstructured_client.models import operations, shared from unstructured_client.models.errors import SDKError client = UnstructuredClient( @@ -404,21 +408,23 @@ Set the `coordinates` parameter to `true` to add this field to the elements in t filename = "sample-docs/layout-parser-paper.pdf" file = open(filename, "rb") - req = shared.PartitionParameters( - # Note that this currently only supports a single file - files=shared.Files( - content=file.read(), - file_name=filename, - ), - strategy=shared.Strategy.HI_RES, - coordinates=True, - split_pdf_page=True, - split_pdf_allow_failed=True, - split_pdf_concurrency_level=15 + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + # Note that this currently only supports a single file + files=shared.Files( + content=file.read(), + file_name=filename, + ), + strategy=shared.Strategy.HI_RES, + coordinates=True, + split_pdf_page=True, + split_pdf_allow_failed=True, + split_pdf_concurrency_level=15 + ) ) try: - res = client.general.partition(req) + res = client.general.partition(request=req) print(res.elements[0]) except SDKError as e: print(e) @@ -543,7 +549,7 @@ This can be helpful if you'd like to use the IDs as a primary key in a database, import os from unstructured_client import UnstructuredClient - from unstructured_client.models import shared + from unstructured_client.models import operations, shared from unstructured_client.models.errors import SDKError client = UnstructuredClient( @@ -553,21 +559,23 @@ This can be helpful if you'd like to use the IDs as a primary 
key in a database, filename = "sample-docs/layout-parser-paper-fast.pdf" file = open(filename, "rb") - req = shared.PartitionParameters( - # Note that this currently only supports a single file - files=shared.Files( - content=file.read(), - file_name=filename, - ), - unique_element_ids=True, - strategy=shared.Strategy.HI_RES, - split_pdf_page=True, - split_pdf_allow_failed=True, - split_pdf_concurrency_level=15 + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + # Note that this currently only supports a single file + files=shared.Files( + content=file.read(), + file_name=filename, + ), + unique_element_ids=True, + strategy=shared.Strategy.HI_RES, + split_pdf_page=True, + split_pdf_allow_failed=True, + split_pdf_concurrency_level=15 + ) ) try: - res = client.general.partition(req) + res = client.general.partition(request=req) print(res.elements[0]) except SDKError as e: print(e) @@ -698,7 +706,7 @@ By default, the `chunking_strategy` is set to `None`, and no chunking is perform import os from unstructured_client import UnstructuredClient - from unstructured_client.models import shared + from unstructured_client.models import operations, shared from unstructured_client.models.errors import SDKError client = UnstructuredClient( @@ -708,22 +716,24 @@ By default, the `chunking_strategy` is set to `None`, and no chunking is perform filename = "sample-docs/layout-parser-paper-fast.pdf" file = open(filename, "rb") - req = shared.PartitionParameters( - # Note that this currently only supports a single file - files=shared.Files( - content=file.read(), - file_name=filename, - ), - chunking_strategy="by_title", - max_characters=1024, - strategy=shared.Strategy.HI_RES, - split_pdf_page=True, - split_pdf_allow_failed=True, - split_pdf_concurrency_level=15 + req = operations.PartitionRequest( + partition_parameters=shared.PartitionParameters( + # Note that this currently only supports a single file + files=shared.Files( + 
content=file.read(), + file_name=filename, + ), + chunking_strategy="by_title", + max_characters=1024, + strategy=shared.Strategy.HI_RES, + split_pdf_page=True, + split_pdf_allow_failed=True, + split_pdf_concurrency_level=15 + ) ) try: - res = client.general.partition(req) + res = client.general.partition(request=req) print(res.elements[0]) except SDKError as e: print(e) From f940a589fab4e8e8f289e0ef47670bd22b039be7 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 1 Oct 2024 17:39:02 -0400 Subject: [PATCH 14/22] Change use-ingest-instead guidance --- snippets/general-shared-text/use-ingest-instead.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/snippets/general-shared-text/use-ingest-instead.mdx b/snippets/general-shared-text/use-ingest-instead.mdx index 6e020d7f..f32e914e 100644 --- a/snippets/general-shared-text/use-ingest-instead.mdx +++ b/snippets/general-shared-text/use-ingest-instead.mdx @@ -4,6 +4,6 @@ to you: - You need to work with documents in cloud storage. - - You want faster processing of larger individual files. - - You want to process multiple files in batches. - \ No newline at end of file + - You want to cache the results of processing multiple files in batches. + - You want more precise control over document-processing pipeline stages such as partitioning, chunking, filtering, staging, and embedding. 
+ From 235e489884ef74f4e4e646bddf6ab1ae58341d86 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 1 Oct 2024 18:43:32 -0400 Subject: [PATCH 15/22] Update api-reference/api-services/sdk-python.mdx Co-authored-by: Paul-Cornell --- api-reference/api-services/sdk-python.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index ffeafaac..220da70a 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -31,7 +31,7 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv The SDK uses semantic versioning and major bumps could bring breaking changes. It is advised to - pin your installed version. See the [migration guide](#migration-guide) below for breaking change announcements. + pin your installed version. See the [migration guide](#migration-guide), later on this page, for breaking change announcements. ## Basics From 8f7bcb42b728995f65535a8ab9986f5dc374d413 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 1 Oct 2024 18:44:58 -0400 Subject: [PATCH 16/22] Update api-reference/api-services/sdk-python.mdx Co-authored-by: Paul-Cornell --- api-reference/api-services/sdk-python.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 220da70a..7a768789 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -294,7 +294,7 @@ the names used in the SDKs are the same across all methods. - If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'` -Previously, the SDK has accepted a PartitionParameters object as input to the sdk.general.partition function. Now, this object must be wrapped in a PartitionRequest object. The old behavior was deprecated in 0.23.0 and removed in 0.26.0. 
+Previously, the SDK accepted a `PartitionParameters` object as input to the `sdk.general.partition` function. Now, this object must be wrapped in a `PartitionRequest` object. The old behavior was deprecated in 0.23.0 and removed in 0.26.0. ```python # Instead of: From 892be56201c42d343d8c7211def9d99e5e266c1d Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Tue, 1 Oct 2024 18:45:18 -0400 Subject: [PATCH 17/22] Update api-reference/api-services/sdk-python.mdx Co-authored-by: Paul-Cornell --- api-reference/api-services/sdk-python.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 7a768789..2efe11a0 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -341,7 +341,7 @@ req = operations.PartitionRequest( - If you see the error: `TypeError: General.partition() takes 1 positional argument but 2 were given` -Likewise, in 0.26.0 the partition function no longer allows for positional arguments. You must specify request by name. +In 0.26.0, the `partition` function no longer allows for positional arguments. You must specify `request` by name. ```python # Instead of: From 9515f6ddac4de505bd08180dafac85fb7eee3de5 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Fri, 4 Oct 2024 11:35:45 -0400 Subject: [PATCH 18/22] Apply suggestions from code review Co-authored-by: Paul-Cornell --- api-reference/api-services/sdk-python.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 2efe11a0..1728c7ed 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -120,7 +120,7 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv ``` -## Async Partitioning +## Async partitioning The Python SDK also has a `partition_async`. 
This call is equivalent to `partition` except that it can be used in a non blocking context. For instance, `asyncio.gather` can be used to concurrently process multiple files at once, as demonstrated here: @@ -155,7 +155,7 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv json_elements = json.dumps(element_dicts, indent=2) - output_filename = filename + ".json" # Save off the json response alongside the input file + output_filename = filename + ".json" # Save the JSON response alongside the input file. with open(output_filename, "w") as file: file.write(json_elements) @@ -321,7 +321,7 @@ resp = s.general.partition(request=req) - If you see the error: `TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given` -In 0.26.0 the PartitionRequest constructor no longer allows for positional arguments. You must specify partition_parameters by name. +In 0.26.0, the `PartitionRequest` constructor no longer allows for positional arguments. You must specify `partition_parameters` by name. ```python # Instead of: From 0646ecc13765d5f732d5bd086a4c5b085061a63b Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Fri, 4 Oct 2024 11:36:12 -0400 Subject: [PATCH 19/22] Add split pdf configuration back to sample code --- api-reference/api-services/sdk-python.mdx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 1728c7ed..be0037ba 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -60,6 +60,9 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv "strategy": shared.Strategy.HI_RES, "languages": ['eng'], "split_pdf_page": True, + "split_pdf_page": True, # If True, splits the PDF file into smaller chunks of pages. + "split_pdf_allow_failed": True, # If True, the partitioning continues even if some pages fail. 
+ "split_pdf_concurrency_level": 15 # Set the number of concurrent request to the maximum value: 15. } } @@ -99,7 +102,9 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv ), strategy=shared.Strategy.HI_RES, languages=['eng'], - split_pdf_page=True, + split_pdf_page=True, # If True, splits the PDF file into smaller chunks of pages. + split_pdf_allow_failed=True, # If True, the partitioning continues even if some pages fail. + split_pdf_concurrency_level=15 # Set the number of concurrent request to the maximum value: 15. ), ) From fe7ee5a11b1f637f3e0ab44a7304858463e89b4a Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Fri, 4 Oct 2024 11:46:00 -0400 Subject: [PATCH 20/22] Remove migration guide sub header and use bold for sections --- api-reference/api-services/sdk-python.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index be0037ba..5f70cc01 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -295,9 +295,9 @@ the names used in the SDKs are the same across all methods. ## Migration guide -### Upgrading to 0.26.0 +There are minor breaking changes in 0.26.0. If you encounter any errors when upgrading, please find the solution below. -- If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'` +**If you see the error: `AttributeError: 'PartitionParameters' object has no attribute 'partition_parameters'`** Previously, the SDK accepted a `PartitionParameters` object as input to the `sdk.general.partition` function. Now, this object must be wrapped in a `PartitionRequest` object. The old behavior was deprecated in 0.23.0 and removed in 0.26.0. 
@@ -324,7 +324,7 @@ req = operations.PartitionRequest( resp = s.general.partition(request=req) ``` -- If you see the error: `TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given` +**If you see the error: `TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given`** In 0.26.0, the `PartitionRequest` constructor no longer allows for positional arguments. You must specify `partition_parameters` by name. @@ -344,7 +344,7 @@ req = operations.PartitionRequest( ) ``` -- If you see the error: `TypeError: General.partition() takes 1 positional argument but 2 were given` +**If you see the error: `TypeError: General.partition() takes 1 positional argument but 2 were given`** In 0.26.0, the `partition` function no longer allows for positional arguments. You must specify `request` by name. From 29dd37150f11d40c9cff313ff0ce98ccfa66c1d0 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Fri, 4 Oct 2024 11:49:30 -0400 Subject: [PATCH 21/22] Add punctuation to code comments --- api-reference/api-services/examples.mdx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/api-reference/api-services/examples.mdx b/api-reference/api-services/examples.mdx index 9809d1d2..18a39d7e 100644 --- a/api-reference/api-services/examples.mdx +++ b/api-reference/api-services/examples.mdx @@ -111,7 +111,7 @@ The `hi_res` strategy supports different models, and the default is `layout_v1.1 file = open(filename, "rb") req = operations.PartitionRequest( partition_parameters=shared.PartitionParameters( - # Note that this currently only supports a single file + # Note that this currently only supports a single file. 
files=shared.Files( content=file.read(), file_name=filename, @@ -262,7 +262,7 @@ For better OCR results, you can specify what languages your document is in using file = open(filename, "rb") req = operations.PartitionRequest( partition_parameters=shared.PartitionParameters( - # Note that this currently only supports a single file + # Note that this currently only supports a single file. files=shared.Files( content=file.read(), file_name=filename, @@ -410,7 +410,7 @@ Set the `coordinates` parameter to `true` to add this field to the elements in t file = open(filename, "rb") req = operations.PartitionRequest( partition_parameters=shared.PartitionParameters( - # Note that this currently only supports a single file + # Note that this currently only supports a single file. files=shared.Files( content=file.read(), file_name=filename, @@ -561,7 +561,7 @@ This can be helpful if you'd like to use the IDs as a primary key in a database, file = open(filename, "rb") req = operations.PartitionRequest( partition_parameters=shared.PartitionParameters( - # Note that this currently only supports a single file + # Note that this currently only supports a single file. files=shared.Files( content=file.read(), file_name=filename, @@ -718,7 +718,7 @@ By default, the `chunking_strategy` is set to `None`, and no chunking is perform file = open(filename, "rb") req = operations.PartitionRequest( partition_parameters=shared.PartitionParameters( - # Note that this currently only supports a single file + # Note that this currently only supports a single file. 
files=shared.Files( content=file.read(), file_name=filename, From c9566eb10764dc18cd0d99d15ed2c7538bc46d1a Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Fri, 4 Oct 2024 12:56:23 -0400 Subject: [PATCH 22/22] Remove duplicated param --- api-reference/api-services/sdk-python.mdx | 1 - 1 file changed, 1 deletion(-) diff --git a/api-reference/api-services/sdk-python.mdx b/api-reference/api-services/sdk-python.mdx index 5f70cc01..a00c1c13 100644 --- a/api-reference/api-services/sdk-python.mdx +++ b/api-reference/api-services/sdk-python.mdx @@ -59,7 +59,6 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv }, "strategy": shared.Strategy.HI_RES, "languages": ['eng'], - "split_pdf_page": True, "split_pdf_page": True, # If True, splits the PDF file into smaller chunks of pages. "split_pdf_allow_failed": True, # If True, the partitioning continues even if some pages fail. "split_pdf_concurrency_level": 15 # Set the number of concurrent request to the maximum value: 15.