From 3466908d1138d2a7bfb3ecd7936e58462d5b5de5 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Fri, 8 Aug 2025 11:32:03 -0700 Subject: [PATCH 1/8] Unstructured API Quickstart --- api-reference/partition/quickstart.mdx | 132 +++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 api-reference/partition/quickstart.mdx diff --git a/api-reference/partition/quickstart.mdx b/api-reference/partition/quickstart.mdx new file mode 100644 index 00000000..59e5e256 --- /dev/null +++ b/api-reference/partition/quickstart.mdx @@ -0,0 +1,132 @@ +--- +title: Unstructured API Quickstart +--- + +The following code shows how to use the [Unstructured Python SDK](/api-reference/partition/sdk-python) +to have Unstructured process one or more local files by using +the [Unstructured Partition Endpoint](/api-reference/partition/overview). + +To run this code, you will need the following: + +- An Unstructured account and an Unstructured API key for your account. [Learn how](/api-reference/partition/overview#get-started). +- Python 3.10 or later installed on your local machine. +- A Python virtual environment is recommended for isolating and versioning Python project code dependencies, but this is not required. + To create and activate a virtual environment, you can use a framework such as + [uv](https://docs.astral.sh/uv/) (recommended). Python provides a built-in framework named + [venv](https://docs.python.org/3/library/venv.html). +- You must install the Unstructured Python SDK on your local machine, for example by running one of the + following commands: + + - For `uv`, run `uv add unstructured-client` + - For `venv` (or for no virtual environment), run `pip install unstructured-client` + +- Add the following code to a Python file on your local machine; make the following code changes; and then run the code file to see the results. + + - Replace `` with your Unstructured API key. 
+ - To process all files within a directory, change `None` for `input_dir` to a string that contains the path to the directory. + - To process specific files within a directory or across multiple directories, change `None` for `input_file` to a string that contains + a comma-separated list of filepaths, for example `"./input/2507.13305v1.pdf,./input2/table-multi-row-column-cells.pdf"`. These filepaths + can be relative to the following code file, or absolute references. + + + If `input_dir` and `input_file` are both set to something other than `None`, then the `input_dir` setting takes precedence, and the `input_file` setting is ignored. + + + - For the `output_dir` parameter, specify a string that contains the path to the directory that you want Unstructured to send its JSON output files. This code assumes + a directory named `output` exists in the same directory as the code file. You can change this to any relative or absolute + directory path. + +```python Python SDK +import asyncio +import os +import json +import unstructured_client +from unstructured_client.models import shared, errors + +client = unstructured_client.UnstructuredClient( + api_key_auth="" +) + +async def partition_file_via_api(filename): + req = { + "partition_parameters": { + "files": { + "content": open(filename, "rb"), + "file_name": os.path.basename(filename), + }, + "strategy": shared.Strategy.AUTO, + "vlm_model": "gpt-4o", + "vlm_model_provider": "openai", + "languages": ['eng'], + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + } + + try: + res = await client.general.partition_async(request=req) + return res.elements + except errors.UnstructuredClientError as e: + print(f"Error partitioning {filename}: {e.message}") + return [] + +async def process_file_and_save_result(input_filename, output_dir): + elements = await partition_file_via_api(input_filename) + + if elements: + results_name = f"{os.path.basename(input_filename)}.json" + 
output_filename = os.path.join(output_dir, results_name) + + with open(output_filename, "w") as f: + json.dump(elements, f) + +def load_filenames_in_directory(input_dir): + filenames = [] + for root, _, files in os.walk(input_dir): + for file in files: + if not file.endswith('.json'): + filenames.append(os.path.join(root, file)) + + return filenames + +async def process_files(): + # Initialize with either a directory name, to process everything in the dir, + # or a comma-separated list of filepaths. + input_dir = None # "path/to/input/directory" + input_files = None # "path/to/file,path/to/file,path/to/file" + + # Set to the directory for output json files. This dir + # will be created if needed. + output_dir = "./output/" + + if input_dir: + filenames = load_filenames_in_directory(input_dir) + else: + filenames = input_files.split(",") + + os.makedirs(output_dir, exist_ok=True) + + tasks = [] + for filename in filenames: + tasks.append( + process_file_and_save_result(filename, output_dir) + ) + + await asyncio.gather(*tasks) + +if __name__ == "__main__": + asyncio.run(process_files()) +``` + +## Next steps + +This quickstart shows how to use the Unstructured Partition Endpoint, which is intended for rapid prototyping of +Unstructured's various [partitioning](/ui/partitioning) strategies, with limited support for [chunking](/ui/chunking). +It is designed to work only with processing of local files. + +Take your code to the next level by switching over to the [Unstructured Workflow Endpoint](/api-reference/workflow/overview) +for production-level scenarios, file processing in batches, files and data in remote locations, full support for chunking, +generating [embeddings](/ui/embeddings), applying post-transform [enrichments](/ui/enriching/overview), +using the latest and highest-performing models, and much more. +[Get started](/api-reference/workflow/overview). 
\ No newline at end of file From f0be7383b4cecce9fc85423d01b9bb3dfb9ca494 Mon Sep 17 00:00:00 2001 From: Paul-Cornell Date: Fri, 8 Aug 2025 13:37:10 -0700 Subject: [PATCH 2/8] Apply suggestions from code review --- api-reference/partition/quickstart.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/api-reference/partition/quickstart.mdx b/api-reference/partition/quickstart.mdx index 59e5e256..ef6bafe0 100644 --- a/api-reference/partition/quickstart.mdx +++ b/api-reference/partition/quickstart.mdx @@ -32,8 +32,8 @@ To run this code, you will need the following: If `input_dir` and `input_file` are both set to something other than `None`, then the `input_dir` setting takes precedence, and the `input_file` setting is ignored. - - For the `output_dir` parameter, specify a string that contains the path to the directory that you want Unstructured to send its JSON output files. This code assumes - a directory named `output` exists in the same directory as the code file. You can change this to any relative or absolute + - For the `output_dir` parameter, specify a string that contains the path to the directory that you want Unstructured to send its JSON output files. If the specified directory does not exist at that location, + the code will create the missing directory for you. You can change this to any relative or absolute directory path. ```python Python SDK From 088aff0977927e36f49af624d9e586b9ca1c2cdf Mon Sep 17 00:00:00 2001 From: Paul-Cornell Date: Fri, 8 Aug 2025 13:37:58 -0700 Subject: [PATCH 3/8] Apply suggestions from code review --- api-reference/partition/quickstart.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api-reference/partition/quickstart.mdx b/api-reference/partition/quickstart.mdx index ef6bafe0..0e831fa5 100644 --- a/api-reference/partition/quickstart.mdx +++ b/api-reference/partition/quickstart.mdx @@ -9,7 +9,7 @@ the [Unstructured Partition Endpoint](/api-reference/partition/overview). 
To run this code, you will need the following: - An Unstructured account and an Unstructured API key for your account. [Learn how](/api-reference/partition/overview#get-started). -- Python 3.10 or later installed on your local machine. +- Python installed on your local machine. - A Python virtual environment is recommended for isolating and versioning Python project code dependencies, but this is not required. To create and activate a virtual environment, you can use a framework such as [uv](https://docs.astral.sh/uv/) (recommended). Python provides a built-in framework named From 37886d835ebe88108d3cc9235d5d7757a314bf7c Mon Sep 17 00:00:00 2001 From: Paul-Cornell Date: Fri, 8 Aug 2025 13:40:40 -0700 Subject: [PATCH 4/8] Apply suggestions from code review --- api-reference/partition/quickstart.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api-reference/partition/quickstart.mdx b/api-reference/partition/quickstart.mdx index 0e831fa5..6a7cb735 100644 --- a/api-reference/partition/quickstart.mdx +++ b/api-reference/partition/quickstart.mdx @@ -9,7 +9,7 @@ the [Unstructured Partition Endpoint](/api-reference/partition/overview). To run this code, you will need the following: - An Unstructured account and an Unstructured API key for your account. [Learn how](/api-reference/partition/overview#get-started). -- Python installed on your local machine. +- Python 3.9 or higher installed on your local machine. - A Python virtual environment is recommended for isolating and versioning Python project code dependencies, but this is not required. To create and activate a virtual environment, you can use a framework such as [uv](https://docs.astral.sh/uv/) (recommended). 
Python provides a built-in framework named From 1f5454c1d85ccabebd07bfb190b6c0d9fd8b076b Mon Sep 17 00:00:00 2001 From: Paul-Cornell Date: Fri, 8 Aug 2025 13:45:12 -0700 Subject: [PATCH 5/8] Apply suggestions from code review --- api-reference/partition/quickstart.mdx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/api-reference/partition/quickstart.mdx b/api-reference/partition/quickstart.mdx index 6a7cb735..f8d141be 100644 --- a/api-reference/partition/quickstart.mdx +++ b/api-reference/partition/quickstart.mdx @@ -2,6 +2,8 @@ title: Unstructured API Quickstart --- +Just need to copy the sample code? [Skip ahead](#sample-code) to it now! + The following code shows how to use the [Unstructured Python SDK](/api-reference/partition/sdk-python) to have Unstructured process one or more local files by using the [Unstructured Partition Endpoint](/api-reference/partition/overview). @@ -35,6 +37,8 @@ To run this code, you will need the following: - For the `output_dir` parameter, specify a string that contains the path to the directory that you want Unstructured to send its JSON output files. If the specified directory does not exist at that location, the code will create the missing directory for you. You can change this to any relative or absolute directory path. 
+ +## Sample code ```python Python SDK import asyncio From 9263382f3411a7871a2f98a1bd5b61d9f4799a60 Mon Sep 17 00:00:00 2001 From: Paul-Cornell Date: Fri, 8 Aug 2025 13:55:39 -0700 Subject: [PATCH 6/8] Apply suggestions from code review --- api-reference/partition/quickstart.mdx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/api-reference/partition/quickstart.mdx b/api-reference/partition/quickstart.mdx index f8d141be..a10ae3db 100644 --- a/api-reference/partition/quickstart.mdx +++ b/api-reference/partition/quickstart.mdx @@ -25,18 +25,16 @@ To run this code, you will need the following: - Add the following code to a Python file on your local machine; make the following code changes; and then run the code file to see the results. - Replace `` with your Unstructured API key. - - To process all files within a directory, change `None` for `input_dir` to a string that contains the path to the directory. + - To process all files within a directory, change `None` for `input_dir` to a string that contains the path to the directory on your local machine. This can be a relative or absolute path. - To process specific files within a directory or across multiple directories, change `None` for `input_file` to a string that contains - a comma-separated list of filepaths, for example `"./input/2507.13305v1.pdf,./input2/table-multi-row-column-cells.pdf"`. These filepaths - can be relative to the following code file, or absolute references. + a comma-separated list of filepaths on your local machine, for example `"./input/2507.13305v1.pdf,./input2/table-multi-row-column-cells.pdf"`. These filepaths + can be relative or absolute. If `input_dir` and `input_file` are both set to something other than `None`, then the `input_dir` setting takes precedence, and the `input_file` setting is ignored. - - For the `output_dir` parameter, specify a string that contains the path to the directory that you want Unstructured to send its JSON output files. 
If the specified directory does not exist at that location, - the code will create the missing directory for you. You can change this to any relative or absolute - directory path. + - For the `output_dir` parameter, specify a string that contains the path to the directory on your local machine that you want Unstructured to send its JSON output files. If the specified directory does not exist at that location, the code will create the missing directory for you. This path can be relative or absolute. ## Sample code From 5a9758b2cc96a4f0087f23cf3fdbea31c4872bf1 Mon Sep 17 00:00:00 2001 From: Paul-Cornell Date: Fri, 8 Aug 2025 14:18:37 -0700 Subject: [PATCH 7/8] Apply suggestions from code review --- api-reference/partition/quickstart.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api-reference/partition/quickstart.mdx b/api-reference/partition/quickstart.mdx index a10ae3db..115b32f1 100644 --- a/api-reference/partition/quickstart.mdx +++ b/api-reference/partition/quickstart.mdx @@ -34,7 +34,7 @@ To run this code, you will need the following: If `input_dir` and `input_file` are both set to something other than `None`, then the `input_dir` setting takes precedence, and the `input_file` setting is ignored. - - For the `output_dir` parameter, specify a string that contains the path to the directory on your local machine that you want Unstructured to send its JSON output files. If the specified directory does not exist at that location, the code will create the missing directory for you. This path can be relative or absolute. + - For the `output_dir` parameter, specify a string that contains the path to the directory on your local machine that you want Unstructured to send its JSON output files to. If the specified directory does not exist at that location, the code will create the missing directory for you. This path can be relative or absolute. 
## Sample code @@ -124,11 +124,11 @@ if __name__ == "__main__": ## Next steps This quickstart shows how to use the Unstructured Partition Endpoint, which is intended for rapid prototyping of -Unstructured's various [partitioning](/ui/partitioning) strategies, with limited support for [chunking](/ui/chunking). +some of Unstructured's [partitioning](/api-reference/partition/partitioning) strategies, with limited support for [chunking](/api-reference/partition/chunking). It is designed to work only with processing of local files. Take your code to the next level by switching over to the [Unstructured Workflow Endpoint](/api-reference/workflow/overview) -for production-level scenarios, file processing in batches, files and data in remote locations, full support for chunking, +for production-level scenarios, file processing in batches, files and data in remote locations, full support for [chunking](/ui/chunking), generating [embeddings](/ui/embeddings), applying post-transform [enrichments](/ui/enriching/overview), using the latest and highest-performing models, and much more. [Get started](/api-reference/workflow/overview). \ No newline at end of file From 04e99394ed52d8b4661506047ec30bb20ab9f1c5 Mon Sep 17 00:00:00 2001 From: Paul-Cornell Date: Fri, 8 Aug 2025 14:23:09 -0700 Subject: [PATCH 8/8] Apply suggestions from code review --- api-reference/partition/quickstart.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api-reference/partition/quickstart.mdx b/api-reference/partition/quickstart.mdx index 115b32f1..67bceda4 100644 --- a/api-reference/partition/quickstart.mdx +++ b/api-reference/partition/quickstart.mdx @@ -129,6 +129,6 @@ It is designed to work only with processing of local files. 
Take your code to the next level by switching over to the [Unstructured Workflow Endpoint](/api-reference/workflow/overview) for production-level scenarios, file processing in batches, files and data in remote locations, full support for [chunking](/ui/chunking), -generating [embeddings](/ui/embeddings), applying post-transform [enrichments](/ui/enriching/overview), +generating [embeddings](/ui/embedding), applying post-transform [enrichments](/ui/enriching/overview), using the latest and highest-performing models, and much more. [Get started](/api-reference/workflow/overview). \ No newline at end of file