From 713e885264a55ce534d16ed289fe361945aa5dfd Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Thu, 27 Feb 2025 17:57:15 -0800 Subject: [PATCH 1/4] Move Ingest-related content from Partition Endpoint docs over into Ingest docs --- .../how-to}/embedding.mdx | 0 ingestion/how-to/examples.mdx | 318 ++++++++++++++++++ .../how-to/extract-image-block-types.mdx | 29 ++ .../how-to}/filter-files.mdx | 0 ingestion/how-to/get-chunked-elements.mdx | 64 ++++ ingestion/how-to/get-elements.mdx | 151 +++++++++ ingestion/how-to/overview.mdx | 0 .../how-to/speed-up-large-files-batches.mdx | 77 +++++ ingestion/how-to/text-as-html.mdx | 27 ++ mint.json | 24 +- open-source/core-functionality/embedding.mdx | 2 +- platform-api/partition-api/api-parameters.mdx | 4 +- .../partition-api/api-validation-errors.mdx | 4 +- platform-api/partition-api/examples.mdx | 270 +-------------- .../extract-image-block-types.mdx | 17 +- .../partition-api/get-chunked-elements.mdx | 17 +- platform-api/partition-api/get-elements.mdx | 137 +------- platform-api/partition-api/sdk-jsts.mdx | 26 +- platform-api/partition-api/sdk-python.mdx | 5 +- .../speed-up-large-files-batches.mdx | 76 +---- platform-api/partition-api/text-as-html.mdx | 17 +- 21 files changed, 725 insertions(+), 540 deletions(-) rename {platform-api/partition-api => ingestion/how-to}/embedding.mdx (100%) create mode 100644 ingestion/how-to/examples.mdx create mode 100644 ingestion/how-to/extract-image-block-types.mdx rename {platform-api/partition-api => ingestion/how-to}/filter-files.mdx (100%) create mode 100644 ingestion/how-to/get-chunked-elements.mdx create mode 100644 ingestion/how-to/get-elements.mdx create mode 100644 ingestion/how-to/overview.mdx create mode 100644 ingestion/how-to/speed-up-large-files-batches.mdx create mode 100644 ingestion/how-to/text-as-html.mdx diff --git a/platform-api/partition-api/embedding.mdx b/ingestion/how-to/embedding.mdx similarity index 100% rename from platform-api/partition-api/embedding.mdx rename to 
ingestion/how-to/embedding.mdx diff --git a/ingestion/how-to/examples.mdx b/ingestion/how-to/examples.mdx new file mode 100644 index 00000000..9c9c2480 --- /dev/null +++ b/ingestion/how-to/examples.mdx @@ -0,0 +1,318 @@ +--- +title: Examples +description: This page provides some examples of accessing Unstructured by using the Unstructured Ingest CLI and the Unstructured Ingest Python library. +--- + +These examples assume that you have already followed the instructions to set up the +[Unstructured Ingest CLI](/ingestion/ingest-cli) and the [Unstructured Ingest Python library](/ingestion/python-ingest). + +### Changing partition strategy for a PDF + +Here's how you can modify partition strategy for a PDF file, and select an alternative model to use with Unstructured API. +The `hi_res` strategy supports different models, and the default is `layout_v1.1.0`. + + + + + + ```bash CLI + unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --output-dir $LOCAL_FILE_OUTPUT_DIR \ + --strategy hi_res \ + --hi-res-model-name layout_v1.1.0 \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" + ``` + + + ```python Python + import os + + from unstructured_ingest.v2.pipeline.pipeline import Pipeline + from unstructured_ingest.v2.interfaces import ProcessorConfig + from unstructured_ingest.v2.processes.connectors.local import ( + LocalIndexerConfig, + LocalDownloaderConfig, + LocalConnectionConfig, + LocalUploaderConfig + ) + from unstructured_ingest.v2.processes.partitioner import PartitionerConfig + + if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), + downloader_config=LocalDownloaderConfig(), + source_connection_config=LocalConnectionConfig(), + 
partitioner_config=PartitionerConfig( + strategy="hi_res", + hi_res_model_name="layout_v1.0.0", + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) + ).run() + ``` + + + +If you have a local deployment of the Unstructured API, you can use other supported models, such as `yolox`. + +### Specifying the language of a document for better OCR results + +For better OCR results, you can specify what languages your document is in using the `languages` parameter. +[View the list of available languages](https://github.com/tesseract-ocr/tessdata). + + + + ```bash CLI + unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --output-dir $LOCAL_FILE_OUTPUT_DIR \ + --strategy ocr_only \ + --ocr-languages kor \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" + ``` + + + ```python Python + import os + + from unstructured_ingest.v2.pipeline.pipeline import Pipeline + from unstructured_ingest.v2.interfaces import ProcessorConfig + from unstructured_ingest.v2.processes.connectors.local import ( + LocalIndexerConfig, + LocalDownloaderConfig, + LocalConnectionConfig, + LocalUploaderConfig + ) + from unstructured_ingest.v2.processes.partitioner import PartitionerConfig + + if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), + downloader_config=LocalDownloaderConfig(), + source_connection_config=LocalConnectionConfig(), + partitioner_config=PartitionerConfig( + strategy="ocr_only", 
+ ocr_languages=["kor"], + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) + ).run() + ``` + + + +### Saving bounding box coordinates + +When elements are extracted from PDFs or images, it may be useful to get their bounding boxes as well. +Set the `coordinates` parameter to `true` to add this field to the elements in the response. + + + + ```bash CLI + unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --output-dir $LOCAL_FILE_OUTPUT_DIR \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --strategy hi_res \ + --additional-partition-args="{\"coordinates\":\"true\", \"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" + ``` + + + ```python Python + import os + + from unstructured_ingest.v2.pipeline.pipeline import Pipeline + from unstructured_ingest.v2.interfaces import ProcessorConfig + from unstructured_ingest.v2.processes.connectors.local import ( + LocalIndexerConfig, + LocalDownloaderConfig, + LocalConnectionConfig, + LocalUploaderConfig + ) + from unstructured_ingest.v2.processes.partitioner import PartitionerConfig + + if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), + downloader_config=LocalDownloaderConfig(), + source_connection_config=LocalConnectionConfig(), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + strategy="hi_res", + additional_partition_args={ + "coordinates": True, + "split_pdf_page": True, + 
"split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) + ).run() + ``` + + + +### Returning unique element IDs + +By default, the element ID is a SHA-256 hash of the element text. This is to ensure that +the ID is deterministic. One downside is that the ID is not guaranteed to be unique. +Different elements with the same text will have the same ID, and there could also be hash collisions. +To use UUIDs in the output instead, set `unique_element_ids=true`. Note: this means that the element IDs +will be random, so with every partition of the same file, you will get different IDs. +This can be helpful if you'd like to use the IDs as a primary key in a database, for example. + + + + ```bash CLI + unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --output-dir $LOCAL_FILE_OUTPUT_DIR \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --strategy hi_res \ + --additional-partition-args="{\"unique_element_ids\":\"true\", \"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" + ``` + + + ```python Python + import os + + from unstructured_ingest.v2.pipeline.pipeline import Pipeline + from unstructured_ingest.v2.interfaces import ProcessorConfig + from unstructured_ingest.v2.processes.connectors.local import ( + LocalIndexerConfig, + LocalDownloaderConfig, + LocalConnectionConfig, + LocalUploaderConfig + ) + from unstructured_ingest.v2.processes.partitioner import PartitionerConfig + + if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), + downloader_config=LocalDownloaderConfig(), + source_connection_config=LocalConnectionConfig(), + partitioner_config=PartitionerConfig( + partition_by_api=True, + 
api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + strategy="hi_res", + additional_partition_args={ + "unique_element_ids": True, + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) + ).run() + ``` + + + +### Adding the chunking step after partitioning + +You can combine partitioning and subsequent chunking in a single request by setting the `chunking_strategy` parameter. +By default, the `chunking_strategy` is set to `None`, and no chunking is performed. + +[//]: # (TODO: add a link to the concepts section about chunking strategies. Need to create the shared Concepts section first) + + + + ```bash CLI + unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --output-dir $LOCAL_FILE_OUTPUT_DIR \ + --chunking-strategy by_title \ + --chunk-max-characters 1024 \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --strategy hi_res \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" + ``` + + + ```python Python + import os + + from unstructured_ingest.v2.pipeline.pipeline import Pipeline + from unstructured_ingest.v2.interfaces import ProcessorConfig + from unstructured_ingest.v2.processes.connectors.local import ( + LocalIndexerConfig, + LocalDownloaderConfig, + LocalConnectionConfig, + LocalUploaderConfig + ) + from unstructured_ingest.v2.processes.partitioner import PartitionerConfig + from unstructured_ingest.v2.processes.chunker import ChunkerConfig + + if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), + downloader_config=LocalDownloaderConfig(), + source_connection_config=LocalConnectionConfig(), + 
partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + strategy="hi_res", + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig( + chunking_strategy="by_title", + chunk_max_characters=1024 + ), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) + ).run() + ``` + + \ No newline at end of file diff --git a/ingestion/how-to/extract-image-block-types.mdx b/ingestion/how-to/extract-image-block-types.mdx new file mode 100644 index 00000000..76741bd9 --- /dev/null +++ b/ingestion/how-to/extract-image-block-types.mdx @@ -0,0 +1,29 @@ +--- +title: Extract images and tables from documents +--- + +## Task + +You want to get, decode, and show elements, such as images and tables, that are embedded in a PDF document. + +## Approach + +Extract the Base64-encoded representation of specific elements, such as images and tables, in the document. +For each of these extracted elements, decode the Base64-encoded representation of the element into its original visual representation +and then show it. + +## To run this example + +You will need a document that is one of the document types supported by the `extract_image_block_types` argument. +See the `extract_image_block_types` entry in [API Parameters](/platform-api/partition-api/api-parameters). +This example uses a PDF file with embedded images and tables. 
+ +import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; +import ExtractImageBlockTypesIngestPy from '/snippets/how-to-api/extract_image_block_types_ingest.py.mdx'; + +## Code + +For the [Unstructured Ingest Python library](/ingestion/python-ingest), you can use the standard Python +[json.load](https://docs.python.org/3/library/json.html#json.load) function to load into a Python dictionary the contents of a JSON +file that the Ingest Python library outputs after the processing is complete. + \ No newline at end of file diff --git a/platform-api/partition-api/filter-files.mdx b/ingestion/how-to/filter-files.mdx similarity index 100% rename from platform-api/partition-api/filter-files.mdx rename to ingestion/how-to/filter-files.mdx diff --git a/ingestion/how-to/get-chunked-elements.mdx b/ingestion/how-to/get-chunked-elements.mdx new file mode 100644 index 00000000..a0ece277 --- /dev/null +++ b/ingestion/how-to/get-chunked-elements.mdx @@ -0,0 +1,64 @@ +--- +title: Get chunked elements +--- + +## Task + +You want to get, and print or save, the contents of elements that have been chunked. + +## Approach + +Chunked elements are typically represented in this format: + +```json +{ + "type": "CompositeElement", + "element_id": "3800f7ff047e97cbf0a5f7df7ad52c80", + "text": "4,5 Exception noted in \u201cSpecial Considerations for Promotional Labeling and Advertising Material\u201d\n\n3", + "metadata": { + "filetype": "application/pdf", + "languages": ["eng"], + "page_number": 6, + "orig_elements": "eJy ... Full content omitted for brevity ... x8=", + "filename": "Portable-Document-Format-Specifications.pdf" + } +} +``` + +```json +{ + "type": "Table", + "element_id": "758127b42c51b93b59abf08640d1ccab", + "text": "Gels and karyotypes High pressure liquid chromatography 600 dpi (8 bit grayscale depth) 300 dpi", + "metadata": { + "text_as_html": "
Gels and karyotypes600 dpi (8 bit grayscale depth)
High pressure liquid chromatography300
", + "filetype": "application/pdf", + "languages": ["eng"], + "page_number": 8, + "orig_elements": "eJy ... Full content omitted for brevity ... MnD", + "filename": "Portable-Document-Format-Specifications.pdf" +} +``` + +To get the element's chunked content, extract the contents of the element's `text` field. + +The chunked content might not contain all of its associated content. To get all of the elements that were used to derive this chunked content, extract the contents of the element's `orig_elements` field, which is nested inside of its parent `metadata` field. + +The contents of the `orig_elements` field is in compressed Base64 gzipped format. To get the content, Base64-decode the bytes, decompress them, and then decode them using UTF-8. + +## To run this example + +You will need to chunk a document during processing. This example uses a PDF file chunked into 200- to 300-character elements. + +## Code + +import GetChunkedElementsIngestPy from '/snippets/how-to-api/get_chunked_elements_ingest.py.mdx'; + +For the [Unstructured Ingest Python library](/ingestion/python-ingest), you can use the standard Python +[json.load](https://docs.python.org/3/library/json.html#json.load) function to load into a Python dictionary the contents of a JSON +file that the Ingest Python library outputs after the processing is complete. + + +## See also + +- [Recovering chunk elements](/open-source/core-functionality/chunking#recovering-chunk-elements) \ No newline at end of file diff --git a/ingestion/how-to/get-elements.mdx b/ingestion/how-to/get-elements.mdx new file mode 100644 index 00000000..bebd5161 --- /dev/null +++ b/ingestion/how-to/get-elements.mdx @@ -0,0 +1,151 @@ +--- +title: Get element contents +--- + +## Task + +You want to get, manipulate, and print or save, the contents of the document elements and metadata from the processed data that Unstructured returns. 
+ +## Approach + +Each element in the document elements contains fields for that element's type, its ID, the extracted text, and associated metadata. + +The programmatic approach you take to get these document elements will depend on which tool, SDK, or library you use: + + + + For the [Unstructured Ingest CLI](/ingestion/overview#unstructured-ingest-cli), you can use a tool such as [jq](https://jqlang.github.io/jq/) + to work with a JSON file that the CLI outputs after the processing is complete. + + For example, the following script uses `jq` to access and print each element's ID, text, and originating file name: + + ```bash Shell + #!/usr/bin/env bash + + JSON_FILE="local-ingest-output/my-file.json" + + jq -r '.[] | "ID: \(.element_id)\nText: \(.text)\nFilename: \(.metadata.filename)\n"' \ + "$JSON_FILE" + ``` + + + For the [Unstructured Ingest Python library](/ingestion/python-ingest), you can use the standard Python + [json.load](https://docs.python.org/3/library/json.html#json.load) function to load into a Python dictionary the contents of a JSON + file that the Ingest Python library outputs after the processing is complete. + + For example, the following code example uses standard Python to access and print each element's ID, text, and originating file name: + + ```python Python + import json + + def parse_json_file(input_file_path: str): + with open(input_file_path, 'r') as file: + file_elements = json.load(file) + + for element in file_elements: + print(f"ID: {element['element_id']}") + print(f"Text: {element['text']}") + print(f"Filename: {element['metadata']['filename']}\n") + + if __name__ == "__main__": + parse_json_file( + input_file_path="local-ingest-output/my-file.json" + ) + ``` + + + For the [Unstructured open-source library](/open-source/introduction/overview), calling the `partition_via_api` function returns a list of elements (`list[Element]`). For example: + + ```python Python + # ... 
+ + elements = partition_via_api( + filename=input_filepath, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + api_url=os.getenv("UNSTRUCTURED_API_URL"), + strategy="hi_res" + ) + + # ... + ``` + + You can use standard Python [list operations](https://docs.python.org/3/tutorial/datastructures.html#more-on-lists) on this list. + + You can also use standard Python [looping techniques](https://docs.python.org/3/tutorial/datastructures.html#looping-techniques) on this list to access each element in this list. + + Each individual element has the following attributes: + + - `.text` provides the element's `text` field value as a `str`. See [Element example](/open-source/concepts/document-elements#element-example). + - `.metadata` provides the element's `metadata` field as an `ElementMetadata` object. See [Metadata](/open-source/concepts/document-elements#metadata). + - `.category` provides the element's `type` field value as a `str`. See [Element type](/open-source/concepts/document-elements#element-type). + - `.id` provides the element's `element_id` value as a `str`. See [Element ID](/open-source/concepts/document-elements#element-id). + + In addition, the following methods are available: + + - `.convert_coordinates_to_new_system()` converts the element's location coordinates, if any, to a new coordinate system. See [Element's coordinates](/open-source/concepts/document-elements#elements-coordinates). + - `.to_dict()` gets the element's content as a standard Python key-value dictionary (`dict[str, Any]`). + + For example: + + ```python Python + # ... 
+ + elements = partition_via_api( + filename=input_filepath, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + api_url=os.getenv("UNSTRUCTURED_API_URL"), + strategy="hi_res" + ) + + for element in elements: + # Do something with each element, for example: + save_element_to_database(f"{element.id}") + save_element_to_database(f"{element.text}") + save_element_to_database(f"{element.metadata.filename}") + + ``` + + To serialize this list as a Python dictionary, you can use the `elements_to_dicts` method, for example: + + ```python Python + from unstructured.staging.base import elements_to_dicts + + # ... + + elements = partition_via_api( + filename=input_filepath, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + api_url=os.getenv("UNSTRUCTURED_API_URL"), + strategy="hi_res" + ) + + elements_dicts = elements_to_dicts(elements) + ``` + + To serialize this list as JSON, you can use the `elements_to_json` function to convert the list of elements (`Iterable[Element]`) into a JSON-formatted string and then print or save that string. For example: + + ```python Python + from unstructured.staging.base import elements_to_json + + # ... 
+ + elements = partition_via_api( + filename=input_filepath, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + api_url=os.getenv("UNSTRUCTURED_API_URL"), + strategy="hi_res" + ) + + json_elements = elements_to_json( + elements=elements, + indent=2 + ) + + elements_to_json( + elements=elements, + indent=2, + filename=output_filepath + ) + ``` + + \ No newline at end of file diff --git a/ingestion/how-to/overview.mdx b/ingestion/how-to/overview.mdx new file mode 100644 index 00000000..e69de29b diff --git a/ingestion/how-to/speed-up-large-files-batches.mdx b/ingestion/how-to/speed-up-large-files-batches.mdx new file mode 100644 index 00000000..fd561382 --- /dev/null +++ b/ingestion/how-to/speed-up-large-files-batches.mdx @@ -0,0 +1,77 @@ +--- +title: Speed up processing of large files and batches +--- + +When you use Unstructured, here are some techniques that you can try to help speed up the processing of large files and large batches of files. + +- Choose your partitioning strategy wisely. For example, if you have simple PDFs that don't have images and tables, you might be able to use the `fast` strategy. Try the `fast` strategy on a few of your documents before you try using the `hi_res` strategy. [Learn more](/ingestion/ingest-configuration/partition-configuration). +- For processing large numbers of documents, use [ingestion](/ingestion/overview) and [add CPUs](#adding-cpus). + +## Adding CPUs + +For speeding up file processing during [ingestion](/ingestion/overview), the Unstructured CLI and Unstructured Python Ingest enable you to instruct Unstructured to use additional local CPUs where applicable. + +Using additional local CPUs applies only to pipeline steps that Unstructured logs as being processed across CPUs. It does not apply to pipeline steps that are logged as being processed asynchronously. 
To get a list of which operations are processed where, look for the following log messages when you run an ingest pipeline: + +- Steps that are processed across CPUs correspond to log messages that read: `processing content across processes`. These steps might benefit by setting a higher number of local CPUs to be used. +- Steps that are processed asynchronously correspond to log messages that read: `processing content async`. Any settings to use a higher number of local CPUs are ignored for these steps. + +For the Unstructured CLI, you can set `--num-processes` to the maximum number of available local CPUs that you want to use where applicable, for example: + +```bash +unstructured-ingest \ + local \ + --num-processes <number-of-cpus> + # ... +``` + +To get the maximum number of available local logical CPUs that can be used where applicable, see your operating system's documentation. + +For Unstructured Python Ingest, you can set the `ProcessorConfig` object's `num_processes` parameter to the maximum number of available local CPUs that you want to use where applicable, for example: + + + ```python Python Ingest v2 + from unstructured_ingest.v2.interfaces import ProcessorConfig + + # ... + + if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig( + num_processes=<number-of-cpus>, + # ... + ), + # ... + ).run() + ``` + + ```python Python Ingest v1 + from unstructured_ingest.interfaces import ( + ProcessorConfig, + # ... + ) + from unstructured_ingest.runner import LocalRunner + + # ... + + if __name__ == "__main__": + runner = LocalRunner( + processor_config=ProcessorConfig( + num_processes=<number-of-cpus>, + # ... + ), + # ... + ).run() + ``` + + +In Python, to specify the maximum number of available local logical CPUs that can be used where applicable, you can call functions such as [os.cpu_count](https://docs.python.org/3/library/os.html#os.cpu_count) and [multiprocessing.cpu_count](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.cpu_count). 
+ +## PDF files + +To speed up PDF file processing, the [Unstructured Ingest CLI](/ingestion/overview#unstructured-ingest-cli) and the [Unstructured Ingest Python library](/ingestion/python-ingest) provide the following parameters to help speed up processing a large PDF file: + +- `split_pdf_page`, when set to true, splits the PDF file on the client side before sending it as batches to Unstructured for processing. The number of pages in each batch is determined internally. Batches can contain between 2 and 20 pages. +- `split_pdf_concurrency_level` is an integer that specifies the number of parallel requests. The default is 5. The maximum is 15. This behavior is ignored unless `split_pdf_page` is also set to true. +- `split_pdf_allow_failed`, when set to true, allows partitioning to continue even if some pages fail. +- `split_pdf_page_range` is a list of two integers that specify the beginning and ending page numbers of the PDF file to be sent. A `ValueError` is raised if the specified range is not valid. This behavior is ignored unless `split_pdf_page` is also set to true. diff --git a/ingestion/how-to/text-as-html.mdx b/ingestion/how-to/text-as-html.mdx new file mode 100644 index 00000000..cacf2d30 --- /dev/null +++ b/ingestion/how-to/text-as-html.mdx @@ -0,0 +1,27 @@ +--- +title: Extract tables as HTML +--- + +## Task + +You want to get, save, or show the contents of elements that are represented as HTML, such as tables that are embedded in a PDF document. + +## Approach + +Extract the contents of an element's `text_as_html` JSON object, which is nested inside of its parent `metadata` object. + +## To run this example + +You will need a document that is one of the document types that can output the `text_as_html` JSON object. For the list of applicable document types, see the entries in the table at the beginning of [Partitioning](/open-source/core-functionality/partitioning) where "Table Support" is "Yes." + +This example uses a PDF file with an embedded table. 
+ +import ExtractTextAsHTMLIngestPy from '/snippets/how-to-api/extract_text_as_html_ingest.py.mdx'; + +## Code + +For the [Unstructured Ingest Python library](/ingestion/python-ingest), you can use the standard Python +[json.load](https://docs.python.org/3/library/json.html#json.load) function to load into a Python dictionary the contents of a JSON +file that the Ingest Python library outputs after the processing is complete. + + \ No newline at end of file diff --git a/mint.json b/mint.json index ef43f394..d5a486b9 100644 --- a/mint.json +++ b/mint.json @@ -345,8 +345,6 @@ "platform-api/partition-api/choose-partitioning-strategy", "platform-api/partition-api/choose-hi-res-model", "platform-api/partition-api/get-elements", - "platform-api/partition-api/filter-files", - "platform-api/partition-api/embedding", "platform-api/partition-api/text-as-html", "platform-api/partition-api/extract-image-block-types", "platform-api/partition-api/get-chunked-elements", @@ -496,6 +494,20 @@ "ingestion/destination-connectors/vectara", "ingestion/destination-connectors/weaviate" ] + }, + { + "group": "How-to", + "pages": [ + "ingestion/how-to/overview", + "ingestion/how-to/examples", + "ingestion/how-to/speed-up-large-files-batches", + "ingestion/how-to/get-elements", + "ingestion/how-to/filter-files", + "ingestion/how-to/embedding", + "ingestion/how-to/text-as-html", + "ingestion/how-to/extract-image-block-types", + "ingestion/how-to/get-chunked-elements" + ] } ] }, @@ -606,6 +618,14 @@ { "source": "/platform/api/:slug*", "destination": "/platform-api/api/:slug*" + }, + { + "source": "/platform-api/partition-api/embedding", + "destination": "/ingestion/how-to/embedding" + }, + { + "source": "/platform-api/partition-api/filter-files", + "destination": "/ingestion/how-to/filter-files" } ], "analytics": { diff --git a/open-source/core-functionality/embedding.mdx b/open-source/core-functionality/embedding.mdx index 1a84313e..e2623f3c 100644 --- 
a/open-source/core-functionality/embedding.mdx +++ b/open-source/core-functionality/embedding.mdx @@ -5,7 +5,7 @@ title: Embedding The Unstructured open-source library does not offer built-in support for calling embedding providers to obtain embeddings for pieces of text. Alternatively, the [Unstructured Ingest CLI](/ingestion/overview#unstructured-ingest-cli) and the [Unstructured Ingest Python library](/ingestion/python-ingest) -offer built-in support for calling embedding providers as part of an ingest pipeline. [Learn how](/platform-api/partition-api/embedding). +offer built-in support for calling embedding providers as part of an ingest pipeline. [Learn how](/ingestion/how-to/embedding). Also, you can use common third-party tools and libraries to get embeddings for document elements' text within JSON files that are produced by calling the Unstructured open-source library. For example, the following sample Python script: diff --git a/platform-api/partition-api/api-parameters.mdx b/platform-api/partition-api/api-parameters.mdx index b61f665f..3edfc586 100644 --- a/platform-api/partition-api/api-parameters.mdx +++ b/platform-api/partition-api/api-parameters.mdx @@ -27,8 +27,8 @@ The only required parameter is `files` - the file you wish to process. | `starting_page_number` (_int_) | `startingPageNumber` (_number_) | The page number to be be assigned to the first page in the document. This information will be included in elements' metadata and can be be especially useful when partitioning a document that is part of a larger document. | | `strategy` (_str_) | `strategy` (_string_) | The strategy to use for partitioning PDF and image files. Options are `auto`, `vlm`, `hi_res`, `fast`, and `ocr_only`. Default: `auto`. [Learn more](/platform-api/partition-api/partitioning). | | `unique_element_ids` (_bool_) | `uniqueElementIds` (_boolean_) | True to assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). 
Otherwise a SHA-256 of the element's text is used. Default: false. | -| `vlm_model` (_str_) (POST only) | | Applies only when `strategy` is `vlm`. The name of the vision language model (VLM) provider to use for partitioning. `vlm_model_provider` must also be specified. For a list of allowed values, see the end of this article. | -| `vlm_model_provider` (_str_) (POST only) | | Applies only when `strategy` is `vlm`. The name of the vision language model (VLM) to use for partitioning. `vlm_model` must also be specified. For a list of allowed values, see the end of this article. | +| `vlm_model` (_str_) | (Not yet available) | Applies only when `strategy` is `vlm`. The name of the vision language model (VLM) to use for partitioning. `vlm_model_provider` must also be specified. For a list of allowed values, see the end of this article. | +| `vlm_model_provider` (_str_) | (Not yet available) | Applies only when `strategy` is `vlm`. The name of the vision language model (VLM) provider to use for partitioning. `vlm_model` must also be specified. For a list of allowed values, see the end of this article. | | `xml_keep_tags` (_bool_) | `xmlKeepTags` (_boolean_) | True to retain the XML tags in the output. Otherwise it will just extract the text from within the tags. Only applies to XML documents. | The following parameters only apply when a chunking strategy is specified. Otherwise, they are ignored. [Learn more](/platform-api/partition-api/chunking). diff --git a/platform-api/partition-api/api-validation-errors.mdx b/platform-api/partition-api/api-validation-errors.mdx index 04f6e500..4afb0aab 100644 --- a/platform-api/partition-api/api-validation-errors.mdx +++ b/platform-api/partition-api/api-validation-errors.mdx @@ -1,6 +1,6 @@ --- -title: API validation errors -description: This section details the structure of HTTP validation errors returned by the API. 
+title: Endpoint validation errors +description: This section details the structure of HTTP validation errors returned by the Unstructured Platform Partition Endpoint. --- ## HTTPValidationError diff --git a/platform-api/partition-api/examples.mdx b/platform-api/partition-api/examples.mdx index b173fd62..82d45bbc 100644 --- a/platform-api/partition-api/examples.mdx +++ b/platform-api/partition-api/examples.mdx @@ -1,19 +1,16 @@ --- title: Examples -description: This page provides some examples of accessing Unstructured API via different methods. +description: This page provides some examples of accessing the Unstructured Platform Partition Endpoint via different methods. --- -For each of these examples, you'll need: +To use these examples, you'll first need to set an environment variable named `UNSTRUCTURED_API_KEY`, +representing your Unstructured API key. [Get your API key](/platform-api/partition-api/overview). -import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; +For the POST and Unstructured JavaScript/TypeScript SDK examples, you'll also need to set an environment variable named `UNSTRUCTURED_API_URL` to the +value `https://api.unstructuredapp.io/general/v0/general`. - - -import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serverless-api.mdx'; - - - -import UseIngestOrPlatformInstead from '/snippets/general-shared-text/use-ingest-or-platform-instead.mdx'; +For the Unstructured Python SDK, you do not need to set an environment variable named `UNSTRUCTURED_API_URL`, as the Python SDK uses the API URL of +`https://api.unstructuredapp.io/general/v0/general` by default. (The Unstructured JavaScript/TypeScript SDK does not have this feature yet; you must always specify the API URL.) 
### Changing partition strategy for a PDF @@ -31,56 +28,6 @@ The `hi_res` strategy supports different models, and the default is `layout_v1.1 > - - ```bash CLI - unstructured-ingest \ - local \ - --input-path $LOCAL_FILE_INPUT_DIR \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --strategy hi_res \ - --hi-res-model-name layout_v1.1.0 \ - --partition-by-api \ - --api-key $UNSTRUCTURED_API_KEY \ - --partition-endpoint $UNSTRUCTURED_API_URL \ - --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" - ``` - - - ```python Python - import os - - from unstructured_ingest.v2.pipeline.pipeline import Pipeline - from unstructured_ingest.v2.interfaces import ProcessorConfig - from unstructured_ingest.v2.processes.connectors.local import ( - LocalIndexerConfig, - LocalDownloaderConfig, - LocalConnectionConfig, - LocalUploaderConfig - ) - from unstructured_ingest.v2.processes.partitioner import PartitionerConfig - - if __name__ == "__main__": - Pipeline.from_configs( - context=ProcessorConfig(), - indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), - downloader_config=LocalDownloaderConfig(), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig( - strategy="hi_res", - hi_res_model_name="layout_v1.0.0", - partition_by_api=True, - api_key=os.getenv("UNSTRUCTURED_API_KEY"), - partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), - additional_partition_args={ - "split_pdf_page": True, - "split_pdf_allow_failed": True, - "split_pdf_concurrency_level": 15 - } - ), - uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) - ).run() - ``` - ```bash POST @@ -250,56 +197,6 @@ For better OCR results, you can specify what languages your document is in using [View the list of available languages](https://github.com/tesseract-ocr/tessdata). 
- - ```bash CLI - unstructured-ingest \ - local \ - --input-path $LOCAL_FILE_INPUT_DIR \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --strategy ocr_only \ - --ocr-languages kor \ - --partition-by-api \ - --api-key $UNSTRUCTURED_API_KEY \ - --partition-endpoint $UNSTRUCTURED_API_URL \ - --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" - ``` - - - ```python Python - import os - - from unstructured_ingest.v2.pipeline.pipeline import Pipeline - from unstructured_ingest.v2.interfaces import ProcessorConfig - from unstructured_ingest.v2.processes.connectors.local import ( - LocalIndexerConfig, - LocalDownloaderConfig, - LocalConnectionConfig, - LocalUploaderConfig - ) - from unstructured_ingest.v2.processes.partitioner import PartitionerConfig - - if __name__ == "__main__": - Pipeline.from_configs( - context=ProcessorConfig(), - indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), - downloader_config=LocalDownloaderConfig(), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig( - strategy="ocr_only", - ocr_languages=["kor"], - partition_by_api=True, - api_key=os.getenv("UNSTRUCTURED_API_KEY"), - partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), - additional_partition_args={ - "split_pdf_page": True, - "split_pdf_allow_failed": True, - "split_pdf_concurrency_level": 15 - } - ), - uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) - ).run() - ``` - ```bash POST @@ -467,55 +364,6 @@ When elements are extracted from PDFs or images, it may be useful to get their b Set the `coordinates` parameter to `true` to add this field to the elements in the response. 
- - ```bash CLI - unstructured-ingest \ - local \ - --input-path $LOCAL_FILE_INPUT_DIR \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --partition-by-api \ - --api-key $UNSTRUCTURED_API_KEY \ - --partition-endpoint $UNSTRUCTURED_API_URL \ - --strategy hi_res \ - --additional-partition-args="{\"coordinates\":\"true\", \"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" - ``` - - - ```python Python - import os - - from unstructured_ingest.v2.pipeline.pipeline import Pipeline - from unstructured_ingest.v2.interfaces import ProcessorConfig - from unstructured_ingest.v2.processes.connectors.local import ( - LocalIndexerConfig, - LocalDownloaderConfig, - LocalConnectionConfig, - LocalUploaderConfig - ) - from unstructured_ingest.v2.processes.partitioner import PartitionerConfig - - if __name__ == "__main__": - Pipeline.from_configs( - context=ProcessorConfig(), - indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), - downloader_config=LocalDownloaderConfig(), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig( - partition_by_api=True, - api_key=os.getenv("UNSTRUCTURED_API_KEY"), - partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), - strategy="hi_res", - additional_partition_args={ - "coordinates": True, - "split_pdf_page": True, - "split_pdf_allow_failed": True, - "split_pdf_concurrency_level": 15 - } - ), - uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) - ).run() - ``` - ```bash POST @@ -687,55 +535,6 @@ will be random, so with every partition of the same file, you will get different This can be helpful if you'd like to use the IDs as a primary key in a database, for example. 
- - ```bash CLI - unstructured-ingest \ - local \ - --input-path $LOCAL_FILE_INPUT_DIR \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --partition-by-api \ - --api-key $UNSTRUCTURED_API_KEY \ - --partition-endpoint $UNSTRUCTURED_API_URL \ - --strategy hi_res \ - --additional-partition-args="{\"unique_element_ids\":\"true\", \"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" - ``` - - - ```python Python - import os - - from unstructured_ingest.v2.pipeline.pipeline import Pipeline - from unstructured_ingest.v2.interfaces import ProcessorConfig - from unstructured_ingest.v2.processes.connectors.local import ( - LocalIndexerConfig, - LocalDownloaderConfig, - LocalConnectionConfig, - LocalUploaderConfig - ) - from unstructured_ingest.v2.processes.partitioner import PartitionerConfig - - if __name__ == "__main__": - Pipeline.from_configs( - context=ProcessorConfig(), - indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), - downloader_config=LocalDownloaderConfig(), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig( - partition_by_api=True, - api_key=os.getenv("UNSTRUCTURED_API_KEY"), - partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), - strategy="hi_res", - additional_partition_args={ - "unique_element_ids": True, - "split_pdf_page": True, - "split_pdf_allow_failed": True, - "split_pdf_concurrency_level": 15 - } - ), - uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) - ).run() - ``` - ```bash POST @@ -904,61 +703,6 @@ By default, the `chunking_strategy` is set to `None`, and no chunking is perform [//]: # (TODO: add a link to the concepts section about chunking strategies. 
Need to create the shared Concepts section first) - - ```bash CLI - unstructured-ingest \ - local \ - --input-path $LOCAL_FILE_INPUT_DIR \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --chunking-strategy by_title \ - --chunk-max-characters 1024 \ - --partition-by-api \ - --api-key $UNSTRUCTURED_API_KEY \ - --partition-endpoint $UNSTRUCTURED_API_URL \ - --strategy hi_res \ - --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" - ``` - - - ```python Python - import os - - from unstructured_ingest.v2.pipeline.pipeline import Pipeline - from unstructured_ingest.v2.interfaces import ProcessorConfig - from unstructured_ingest.v2.processes.connectors.local import ( - LocalIndexerConfig, - LocalDownloaderConfig, - LocalConnectionConfig, - LocalUploaderConfig - ) - from unstructured_ingest.v2.processes.partitioner import PartitionerConfig - from unstructured_ingest.v2.processes.chunker import ChunkerConfig - - if __name__ == "__main__": - Pipeline.from_configs( - context=ProcessorConfig(), - indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), - downloader_config=LocalDownloaderConfig(), - source_connection_config=LocalConnectionConfig(), - partitioner_config=PartitionerConfig( - partition_by_api=True, - api_key=os.getenv("UNSTRUCTURED_API_KEY"), - partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), - strategy="hi_res", - additional_partition_args={ - "split_pdf_page": True, - "split_pdf_allow_failed": True, - "split_pdf_concurrency_level": 15 - } - ), - chunker_config=ChunkerConfig( - chunking_strategy="by_title", - chunk_max_characters=1024 - ), - uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")) - ).run() - ``` - ```bash POST diff --git a/platform-api/partition-api/extract-image-block-types.mdx b/platform-api/partition-api/extract-image-block-types.mdx index fa930698..9e601f8b 100644 --- 
a/platform-api/partition-api/extract-image-block-types.mdx +++ b/platform-api/partition-api/extract-image-block-types.mdx @@ -18,25 +18,14 @@ You will need a document that is one of the document types supported by the `ext See the `extract_image_block_types` entry in [API Parameters](/platform-api/partition-api/api-parameters). This example uses a PDF file with embedded images and tables. -import ExtractImageBlockTypesIngestPy from '/snippets/how-to-api/extract_image_block_types_ingest.py.mdx'; import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; import ExtractImageBlockTypesPy from '/snippets/how-to-api/extract_image_block_types.py.mdx'; ## Code - - - For the [Unstructured Ingest Python library](/ingestion/python-ingest), you can use the standard Python - [json.load](https://docs.python.org/3/library/json.html#json.load) function to load into a Python dictionary the contents of a JSON - file that the Ingest Python library outputs after the processing is complete. - - - - For the [Unstructured Python SDK](/platform-api/partition-api/sdk-python), you'll need: - - - - +For the [Unstructured Python SDK](/platform-api/partition-api/sdk-python), you'll need: + + ## See also diff --git a/platform-api/partition-api/get-chunked-elements.mdx b/platform-api/partition-api/get-chunked-elements.mdx index 020d8933..efaee4d7 100644 --- a/platform-api/partition-api/get-chunked-elements.mdx +++ b/platform-api/partition-api/get-chunked-elements.mdx @@ -52,23 +52,12 @@ You will need to chunk a document during processing. 
This example uses a PDF fil ## Code -import GetChunkedElementsIngestPy from '/snippets/how-to-api/get_chunked_elements_ingest.py.mdx'; import GetChunkedElementsPy from '/snippets/how-to-api/get_chunked_elements.py.mdx'; import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; - - - For the [Unstructured Ingest Python library](/ingestion/python-ingest), you can use the standard Python - [json.load](https://docs.python.org/3/library/json.html#json.load) function to load into a Python dictionary the contents of a JSON - file that the Ingest Python library outputs after the processing is complete. - - - - For the [Unstructured Python SDK](/platform-api/partition-api/sdk-python), you'll need: - - - - +For the [Unstructured Python SDK](/platform-api/partition-api/sdk-python), you'll need: + + ## See also diff --git a/platform-api/partition-api/get-elements.mdx b/platform-api/partition-api/get-elements.mdx index 212a49c5..89e75afb 100644 --- a/platform-api/partition-api/get-elements.mdx +++ b/platform-api/partition-api/get-elements.mdx @@ -10,49 +10,9 @@ You want to get, manipulate, and print or save, the contents of the [document el Each element in the document elements contains fields for that element's type, its ID, the extracted text, and associated metadata. -The programmatic approach you take to get these document elements will depend on which tool, SDK, or library you use: +The programmatic approach you take to get these document elements will depend on which SDK you use: - - For the [Unstructured Ingest CLI](/ingestion/overview#unstructured-ingest-cli), you can use a tool such as [jq](https://jqlang.github.io/jq/) - to work with a JSON file that the CLI outputs after the processing is complete. 
- - For example, the following script uses `jq` to access and print each element's ID, text, and originating file name: - - ```bash Shell - #!/usr/bin/env bash - - JSON_FILE="local-ingest-output/my-file.json" - - jq -r '.[] | "ID: \(.element_id)\nText: \(.text)\nFilename: \(.metadata.filename)\n"' \ - "$JSON_FILE" - ``` - - - For the [Unstructured Ingest Python library](/ingestion/python-ingest), you can use the standard Python - [json.load](https://docs.python.org/3/library/json.html#json.load) function to load into a Python dictionary the contents of a JSON - file that the Ingest Python library outputs after the processing is complete. - - For example, the following code example uses standard Python to access and print each element's ID, text, and originating file name: - - ```python Python - import json - - def parse_json_file(input_file_path: str): - with open(input_file_path, 'r') as file: - file_elements = json.load(file) - - for element in file_elements: - print(f"ID: {element["element_id"]}") - print(f"Text: {element["text"]}") - print(f"Filename: {element["metadata"]["filename"]}\n") - - if __name__ == "__main__": - parse_json_file( - input_file_path="local-ingest-output/my-file.json" - ) - ``` - For the [Unstructured Python SDK](/platform-api/partition-api/sdk-python), calling an `UnstructuredClient` object's `general.partition_async` method returns a `PartitionResponse` object. @@ -199,99 +159,4 @@ The programmatic approach you take to get these document elements will depend on } // ... ``` - - For the [Unstructured open-source library](/open-source/introduction/overview), calling the `partition_via_api` function returns a list of elements (`list[Element]`). For example: - - ```python Python - # ... - - elements = partition_via_api( - filename=input_filepath, - api_key=os.getenv("UNSTRUCTURED_API_KEY"), - api_url=os.getenv("UNSTRUCTURED_API_URL"), - strategy="hi_res" - ) - - # ... 
- ``` - - You can use standard Python [list operations](https://docs.python.org/3/tutorial/datastructures.html#more-on-lists) on this list. - - You can also use standard Python [looping techniques](https://docs.python.org/3/tutorial/datastructures.html#looping-techniques) on this list to access each element in this list. - - Each individual element has the following attributes: - - - `.text` provides the element's `text` field value as a `str`. See [Element example](/open-source/concepts/document-elements#element-example). - - `.metadata` provides the element's `metadata` field as an `ElementMetadata` object. See [Metadata](/open-source/concepts/document-elements#metadata). - - `.category` provides the element's `type` field value as a `str`. See [Element type](/open-source/concepts/document-elements#element-type). - - `.id` provides the element's `element_id` value as a `str`. See [Element ID](/open-source/concepts/document-elements#element-id). - - In addition, the following methods are available: - - - `.convert_coordinates_to_new_system()` converts the element's location coordinates, if any, to a new coordinate system. See [Element's coordinates](/open-source/concepts/document-elements#elements-coordinates). - - `.to_dict()` gets the element's content as a standard Python key-value dictionary (`dict[str, Any]`). - - For example: - - ```python Python - # ... - - elements = partition_via_api( - filename=input_filepath, - api_key=os.getenv("UNSTRUCTURED_API_KEY"), - api_url=os.getenv("UNSTRUCTURED_API_URL"), - strategy="hi_res" - ) - - for element in elements: - # Do something with each element, for example: - save_element_to_database(f"{element.id}") - save_element_to_database(f"{element.text}") - save_element_to_database(f"{element.metadata.filename}") - - ``` - - To serialize this list as a Python dictionary, you can use the `elements_to_dicts` method, for example: - - ```python Python - from unstructured.staging.base import elements_to_dicts - - # ... 
- - elements = partition_via_api( - filename=input_filepath, - api_key=os.getenv("UNSTRUCTURED_API_KEY"), - api_url=os.getenv("UNSTRUCTURED_API_URL"), - strategy="hi_res" - ) - - elements_dicts = elements_to_dicts(elements) - ``` - - To serialize this list as JSON, you can use the `elements_to_json` function to convert the list of elements (`Iterable[Element]`) into a JSON-formatted string and then print or save that string. For example: - - ```python Python - from unstructured.staging.base import elements_to_json - - # ... - - elements = partition_via_api( - filename=input_filepath, - api_key=os.getenv("UNSTRUCTURED_API_KEY"), - api_url=os.getenv("UNSTRUCTURED_API_URL"), - strategy="hi_res" - ) - - json_elements = elements_to_json( - elements=elements, - indent=2 - ) - - elements_to_json( - elements=elements, - indent=2, - filename=output_filepath - ) - ``` - \ No newline at end of file diff --git a/platform-api/partition-api/sdk-jsts.mdx b/platform-api/partition-api/sdk-jsts.mdx index 6981beb7..f5fead98 100644 --- a/platform-api/partition-api/sdk-jsts.mdx +++ b/platform-api/partition-api/sdk-jsts.mdx @@ -1,23 +1,11 @@ --- -title: Process an individual file by using the Unstructured JavaScript/TypeScript SDK -sidebarTitle: JavaScript/TypeScript SDK +title: JavaScript/TypeScript SDK --- -The [Unstructured JavaScript/TypeScript SDK](https://github.com/Unstructured-IO/unstructured-js-client) client allows you to send an individual file for processing by the Unstructured Platform Partiiton API. +The [Unstructured JavaScript/TypeScript SDK](https://github.com/Unstructured-IO/unstructured-js-client) client allows you to send one file at a time for processing by the Unstructured Platform Partition API. 
-import UseIngestOrPlatformInstead from '/snippets/general-shared-text/use-ingest-or-platform-instead.mdx'; - - - -To use the JavaScript/TypeScript SDK, you'll need: - -import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; - - - -import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serverless-api.mdx'; - - +To use the JavaScript/TypeScript SDK, you'll first need to set an environment variable named `UNSTRUCTURED_API_KEY`, +representing your Unstructured API key. [Get your API key](/platform-api/partition-api/overview). ## Installation @@ -35,7 +23,7 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv ## Basics - Let's start with a simple example in which you send a PDF document to be partitioned with the free Unstructured API: + Let's start with a simple example in which you send a PDF document to the Unstructured Platform Partition Endpoint to be partitioned by Unstructured. The JavaScript/TypeScript SDK has the following breaking changes in v0.11.0: @@ -294,8 +282,8 @@ import NoURLForServerlessAPI from '/snippets/general-shared-text/no-url-for-serv ## Parameters & examples -The parameter names used in this document are for the JavaScript/TypeScript SDK, which follow camelCase -convention. The Python SDK uses snake_case convention. Other than this difference in naming convention, +The parameter names used in this document are for the JavaScript/TypeScript SDK, which follows the `camelCase` +convention. The Python SDK follows the `snake_case` convention. Other than this difference in naming convention, the names used in the SDKs are the same across all methods. * Refer to the [API parameters](/platform-api/partition-api/api-parameters) page for the full list of available parameters. 
diff --git a/platform-api/partition-api/sdk-python.mdx b/platform-api/partition-api/sdk-python.mdx index 29d2cdd2..347eec8b 100644 --- a/platform-api/partition-api/sdk-python.mdx +++ b/platform-api/partition-api/sdk-python.mdx @@ -1,9 +1,8 @@ --- -title: Process an individual file by using the Unstructured Python SDK -sidebarTitle: Python SDK +title: Python SDK --- -The [Unstructured Python SDK](https://github.com/Unstructured-IO/unstructured-python-client) client allows you to send an individual file for processing by +The [Unstructured Python SDK](https://github.com/Unstructured-IO/unstructured-python-client) client allows you to send one file at a time for processing by the [Unstructured Platform Partition Endpoint](/platform-api/partition-api/overview). To use the Python SDK, you'll first need to set an environment variable named `UNSTRUCTURED_API_KEY`, diff --git a/platform-api/partition-api/speed-up-large-files-batches.mdx b/platform-api/partition-api/speed-up-large-files-batches.mdx index 696c0377..9cd3704b 100644 --- a/platform-api/partition-api/speed-up-large-files-batches.mdx +++ b/platform-api/partition-api/speed-up-large-files-batches.mdx @@ -4,77 +4,13 @@ title: Speed up processing of large files and batches When you use Unstructured, here are some techniques that you can try to help speed up the processing of large files and large batches of files. -- Choose your partitioning strategy wisely. For example, if you have simple PDFs that don't have images and tables, you might be able to use the `fast` strategy. Try the `fast` strategy on a few of your documents before you try using the `hi_res` strategy. [Learn more](/platform-api/partition-api/partitioning). -- For processing large numbers of documents, use [ingestion](/ingestion/overview) and [add CPUs](#adding-cpus). -- For processing large individual PDF files with the Unstructured SDKs, [use PDF splitting parameters](#pdf-files). +Choose your partitioning strategy wisely. 
For example, if you have simple PDFs that don't have images and tables, you might be able to use the `fast` strategy. Try the `fast` strategy on a few of your documents before you try using the `hi_res` strategy. [Learn more](/platform-api/partition-api/partitioning). -## Adding CPUs +To speed up PDF file processing, the [Unstructured SDK for Python](/platform-api/partition-api/sdk-python) and the [Unstructured SDK for JavaScript/TypeScript](/platform-api/partition-api/sdk-jsts) provide the following parameters to help speed up processing a large PDF file: -For speeding up file processing during [ingestion](/ingestion/overview), the Unstructured CLI and Unstructured Python Ingest enable you to instruct Unstructured to use additional local CPUs where applicable. - -Using additional local CPUs applies only to pipeline steps that Unstructured logs as being processed across CPUs. It does not apply to pipeline steps that are logged as being processed asynchronously. To get a list of which operations are processed where, look for the following log messages when you run an ingest pipeline: - -- Steps that are processed across CPUs correspond to log messages that read: `processing content across processes`. These steps might benefit by setting a higher number of local CPUs to be used. -- Steps that are processed asynchronously correspond to log messages that read: `processing content across processes`. Any settings to use a higher number of local CPUs are ignored for these steps. - -For the Unstructured CLI, you can set `--num-processes` to the maximum number of available local CPUs that you want to use where applicable, for example: - -```bash -unstructured-ingest \ - local \ - --num-processes - # ... -``` - -To get the maximum number of available local logical CPUs that can be used where applicable, see your operating system's documentation. 
- -For Unstructured Python Ingest, you can set the `ProcessorConfig` object's `num_processes` parameter to the maximum number of available local CPUs that you want to use where applicable, for example: - - - ```python Python Ingest v2 - from unstructured_ingest.v2.interfaces import ProcessorConfig - - # ... - - if __name__ == "__main__": - Pipeline.from_configs( - context=ProcessorConfig( - num_processes=, - # ... - ), - # ... - ).run() - ``` - - ```python Python Ingest v1 - from unstructured_ingest.interfaces import ( - ProcessorConfig, - # ... - ) - from unstructured_ingest.runner import LocalRunner - - # ... - - if __name__ == "__main__": - runner = LocalRunner( - processor_config=ProcessorConfig( - num_processes=, - # ... - ), - # ... - ).run() - ``` - - -In Python, to specify the maximum number of available local logical CPUs that can be used where applicable, you can call functions such as [os.cpu_count](https://docs.python.org/3/library/os.html#os.cpu_count) and [multiprocessing.cpu_count](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.cpu_count). - -## PDF files - -To speed up PDF file processing, the [Unstructured Ingest CLI](/ingestion/overview#unstructured-ingest-cli), the [Unstructured Ingest Python library](/ingestion/python-ingest), the [Unstructured SDK for Python](/platform-api/partition-api/sdk-python), and the [Unstructured SDK for JavaScript/TypeScript](/platform-api/partition-api/sdk-jsts) provide the following parameters to help speed up processing a large PDF file: - -- `split_pdf_page` (CLI/Python) or `splitPdfPage` (JavaScript/TypeScript), when set to true, splits the PDF file on the client side before sending it as batches to Unstructured for processing. The number of pages in each batch is determined internally. Batches can contain between 2 and 20 pages. -- `split_pdf_concurrency_level` (CLI/Python) or `splitPdfConcurrencyLevel` (JavaScript/TypeScript) is an integer that specifies the number of parallel requests. 
The default is 5. The maximum is 15. This behavior is ignored unless `split_pdf_page` (CLI/Python) or `splitPdfPage` (JavaScript/TypeScript) is also set to true. -- `split_pdf_allow_failed` (CLI/Python) or splitPdfAllowFailed` (JavaScript/TypeScript), when set to true, allows partitioning to continue even if some pages fail. -- `split_pdf_page_range` (CLI/Python only) is a list of two integers that specify the beginning and ending page numbers of the PDF file to be sent. A `ValueError` is raised if the specified range is not valid. This behavior is ignored unless `split_pdf_page` is also set to true. +- `split_pdf_page` (Python) or `splitPdfPage` (JavaScript/TypeScript), when set to true, splits the PDF file on the client side before sending it as batches to Unstructured for processing. The number of pages in each batch is determined internally. Batches can contain between 2 and 20 pages. +- `split_pdf_concurrency_level` (Python) or `splitPdfConcurrencyLevel` (JavaScript/TypeScript) is an integer that specifies the number of parallel requests. The default is 5. The maximum is 15. This behavior is ignored unless `split_pdf_page` (Python) or `splitPdfPage` (JavaScript/TypeScript) is also set to true. +- `split_pdf_allow_failed` (Python) or `splitPdfAllowFailed` (JavaScript/TypeScript), when set to true, allows partitioning to continue even if some pages fail. +- `split_pdf_page_range` (Python only) is a list of two integers that specify the beginning and ending page numbers of the PDF file to be sent. A `ValueError` is raised if the specified range is not valid. This behavior is ignored unless `split_pdf_page` is also set to true. [Learn more](/platform-api/partition-api/sdk-python#page-splitting). 
diff --git a/platform-api/partition-api/text-as-html.mdx b/platform-api/partition-api/text-as-html.mdx index 7601ce4c..82f83b70 100644 --- a/platform-api/partition-api/text-as-html.mdx +++ b/platform-api/partition-api/text-as-html.mdx @@ -16,25 +16,14 @@ You will need a document that is one of the document types that can output the ` This example uses a PDF file with an embedded table. -import ExtractTextAsHTMLIngestPy from '/snippets/how-to-api/extract_text_as_html_ingest.py.mdx'; import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; import ExtractTextAsHTMLPy from '/snippets/how-to-api/extract_text_as_html.py.mdx'; ## Code - - - For the [Unstructured Ingest Python library](/ingestion/python-ingest), you can use the standard Python - [json.load](https://docs.python.org/3/library/json.html#json.load) function to load into a Python dictionary the contents of a JSON - file that the Ingest Python library outputs after the processing is complete. - - - - For the [Unstructured Python SDK](/platform-api/partition-api/sdk-python), you'll need: - - - - +For the [Unstructured Python SDK](/platform-api/partition-api/sdk-python), you'll need: + + ## See also From fb41e0de7a15a303abcc8108a5dabcda01a2ff3a Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Thu, 27 Feb 2025 18:03:49 -0800 Subject: [PATCH 2/4] Add missing VLM strategy to partitioning page --- snippets/concepts/partitioning-strategies.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/snippets/concepts/partitioning-strategies.mdx b/snippets/concepts/partitioning-strategies.mdx index ff51e45f..c7dfe211 100644 --- a/snippets/concepts/partitioning-strategies.mdx +++ b/snippets/concepts/partitioning-strategies.mdx @@ -15,6 +15,7 @@ To give you an example, the `fast` strategy is roughly 100x faster than leading * `fast`: The "rule-based" strategy leverages traditional NLP extraction techniques to quickly pull all the text elements. "Fast" strategy is not recommended for image-based file types. 
* `hi_res`: The "model-based" strategy identifies the layout of the document. The advantage of "hi_res" is that it uses the document layout to gain additional information about document elements. We recommend using this strategy if your use case is highly sensitive to correct classifications for document elements. * `ocr_only`: Another "model-based" strategy that leverages Optical Character Recognition to extract text from the image-based files. +* `vlm`: Uses a vision language model (VLM) to extract text from these file types: `.bmp`, `.gif`, `.heic`, `.jpeg`, `.jpg`, `.pdf`, `.png`, `.tiff`, and `.webp`. **These strategies are available on the following partition functions:** From 191d7805767280226d9296542308a7a49989cd3c Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Thu, 27 Feb 2025 18:10:27 -0800 Subject: [PATCH 3/4] Removed blank overview page, not needed after all --- ingestion/how-to/overview.mdx | 0 mint.json | 1 - 2 files changed, 1 deletion(-) delete mode 100644 ingestion/how-to/overview.mdx diff --git a/ingestion/how-to/overview.mdx b/ingestion/how-to/overview.mdx deleted file mode 100644 index e69de29b..00000000 diff --git a/mint.json b/mint.json index d5a486b9..b612ea69 100644 --- a/mint.json +++ b/mint.json @@ -498,7 +498,6 @@ { "group": "How-to", "pages": [ - "ingestion/how-to/overview", "ingestion/how-to/examples", "ingestion/how-to/speed-up-large-files-batches", "ingestion/how-to/get-elements", From f3f4803c859b80c49ebc095377f83533fd99eee5 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Fri, 28 Feb 2025 08:23:39 -0800 Subject: [PATCH 4/4] Add VLM to remaining Python and POST Partition Endpoint examples --- mint.json | 12 +++-- .../partition-api/choose-hi-res-model.mdx | 43 ----------------- .../choose-partitioning-strategy.mdx | 47 ------------------- platform-api/partition-api/examples.mdx | 43 ++++++++--------- .../extract_image_block_types.py.mdx | 4 +- .../how-to-api/extract_text_as_html.py.mdx | 4 +- 
.../how-to-api/get_chunked_elements.py.mdx | 4 +- 7 files changed, 40 insertions(+), 117 deletions(-) delete mode 100644 platform-api/partition-api/choose-hi-res-model.mdx delete mode 100644 platform-api/partition-api/choose-partitioning-strategy.mdx diff --git a/mint.json b/mint.json index b612ea69..e05ead5a 100644 --- a/mint.json +++ b/mint.json @@ -342,8 +342,6 @@ "platform-api/partition-api/partitioning", "platform-api/partition-api/chunking", "platform-api/partition-api/speed-up-large-files-batches", - "platform-api/partition-api/choose-partitioning-strategy", - "platform-api/partition-api/choose-hi-res-model", "platform-api/partition-api/get-elements", "platform-api/partition-api/text-as-html", "platform-api/partition-api/extract-image-block-types", @@ -618,6 +616,14 @@ "source": "/platform/api/:slug*", "destination": "/platform-api/api/:slug*" }, + { + "source": "/platform-api/partition-api/choose-hi-res-model", + "destination": "/platform-api/partition-api/partitioning" + }, + { + "source": "/platform-api/partition-api/choose-partitioning-strategy", + "destination": "/platform-api/partition-api/partitioning" + }, { "source": "/platform-api/partition-api/embedding", "destination": "/ingestion/how-to/embedding" @@ -625,7 +631,7 @@ { "source": "/platform-api/partition-api/filter-files", "destination": "/ingestion/how-to/filter-files" - } + } ], "analytics": { "ga4": { diff --git a/platform-api/partition-api/choose-hi-res-model.mdx b/platform-api/partition-api/choose-hi-res-model.mdx deleted file mode 100644 index f30b2c99..00000000 --- a/platform-api/partition-api/choose-hi-res-model.mdx +++ /dev/null @@ -1,43 +0,0 @@ ---- -title: Choose a hi-res model ---- - -## Task - -You want to specify a high-resolution object detection model to be used when processing image files, or PDFs with embedded images or tables, but you are not sure which model to specify. - -## Approach - -Use the following decision-maker to help you determine which model to specify. 
- - - - - If **Yes**, then continue with **Step 2**. - - If **No**, then Unstructured will not use a high-resolution object detection model when processing your files. Set the command's `--strategy` option (CLI) or `strategy` parameter (Python/JavaScript/TypeScript) to `fast`. See also [Choose a partitioning strategy](/platform-api/partition-api/choose-partitioning-strategy). - - - If you already have your scripts or code in place and just need help in choosing a model, then skip ahead to **Step 3**. Otherwise: - - - To have Unstructured make its best choice on your behalf about the model to use, set the command's `--strategy` option (CLI) or `strategy` parameter (Python/JavaScript/TypeScript) to `auto`. You have completed this decision-maker. See also [Auto partitioning strategy logic](/platform-api/partition-api/choose-partitioning-strategy#auto-partitioning-strategy-logic). - - To specify a specific model, set `--strategy` or `strategy` to `hi_res`. Then set `--hi-res-model-name` (CLI), `hi_res_model_name` (Python), or `hiResModelName` (JavaScript/TypeScript) to one of the models in **Step 3**. - - - - `layout_v1.1.0` generally performs better than `yolox` at bounding box definitions and element classification. `layout_v1.1.0` is a proprietary Unstructured object detection model and is used by default, as applicable, if `--hi-res-model-name`, `hi_res_model_name`, or `hiResModelName` is not specified. - - `yolox` is also provided for backwards compatibility and originally was the replacement for `detectron2_onnx`. - - `detectron2_onnx` generally underperforms the preceding models. However, it is still accessible to maintain backwards compatibility. - - - - - -## Code examples - -See [Changing partition strategy for a PDF](/platform-api/partition-api/examples#changing-partition-strategy-for-a-pdf). 
\ No newline at end of file diff --git a/platform-api/partition-api/choose-partitioning-strategy.mdx b/platform-api/partition-api/choose-partitioning-strategy.mdx deleted file mode 100644 index fad7d583..00000000 --- a/platform-api/partition-api/choose-partitioning-strategy.mdx +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: Choose a partitioning strategy ---- - -## Task - -You want to use the fastest, highest-precision, yet most cost-effective [partitioning strategy](/platform-api/partition-api/partitioning) overall, but you are not sure which one to specify for your particular files. - -## Approach - -The `--strategy` command option (CLI) or `strategy` parameter (Python/JavaScript/TypeScript) specifies the partitioning strategy to use. - -Use the following decision-maker to help you determine what to set `--strategy` or `strategy` to. - - - - - If **All**, you can increase precision for processing images and tables by specifying `hi_res` for `--strategy` or `strategy`. For additional benefits, skip ahead to **Step 2**. - - If **Some**, you can increase precision for processing images and tables by specifying `auto` for `--strategy` or `strategy`. This will leave the decision up to Unstructured on a file-by-file basis about the best partitioning strategy to use. You have completed this decision-maker. See also [Auto partitioning strategy logic](#auto-partitioning-strategy-logic). - - If **None**, you can increase performance and decrease cost by specifying `fast` for `--strategy` or `strategy`. You have completed this decision-maker. - - For OCR, you can also set `--strategy` or `strategy` to `ocr_only`. Unstructured will use the Tesseract OCR agent as applicable. You cannot specify any other OCR agent to be used instead. - - - - If **No**, then specify `auto` for `--strategy` or `strategy`. This will decrease performance compared to answering **Yes**. You have completed this decision-maker. 
See also [Auto partitioning strategy logic](#auto-partitioning-strategy-logic). - - If **Yes**, then set `--hi-res-model-name` (CLI), `hi_res_model_name` (Python), or `hiResModelName`(JavaScript/TypeScript) to the available model's name. [Learn about the available models](/platform-api/partition-api/choose-hi-res-model). - - - - - -## Code example - -See [Changing partition strategy for a PDF](/platform-api/partition-api/examples#changing-partition-strategy-for-a-pdf). - -## Auto partitioning strategy logic - -Setting `--strategy` or `strategy` to `auto` leaves the decision up to Unstructured on a page-by-page basis about which partitioning strategy to use. - -If `--strategy` or `strategy` is not specified, the `auto` strategy is used by default. diff --git a/platform-api/partition-api/examples.mdx b/platform-api/partition-api/examples.mdx index 82d45bbc..e41ef17e 100644 --- a/platform-api/partition-api/examples.mdx +++ b/platform-api/partition-api/examples.mdx @@ -15,17 +15,6 @@ For the Unstructured Python SDK, you do not need to set an environment variable ### Changing partition strategy for a PDF Here's how you can modify partition strategy for a PDF file, and select an alternative model to use with Unstructured API. -The `hi_res` strategy supports different models, and the default is `layout_v1.1.0`. 
- - @@ -36,8 +25,9 @@ The `hi_res` strategy supports different models, and the default is `layout_v1.1 -H 'Content-Type: multipart/form-data' \ -H 'unstructured-api-key: $UNSTRUCTURED_API_KEY' \ -F 'files=@sample-docs/layout-parser-paper.pdf' \ - -F 'strategy=hi_res' \ - -F 'hi_res_model_name=layout_v1.1.0' + -F 'strategy=vlm' \ + -F 'vlm_model_provider=openai' \ + -F 'vlm_model=gpt-4o' ``` @@ -60,8 +50,9 @@ The `hi_res` strategy supports different models, and the default is `layout_v1.1 "content": open(filename, "rb"), "file_name": os.path.basename(filename), }, - "strategy": shared.Strategy.HI_RES, - "hi_res_model_name": "layout_v1.1.0", + "strategy": shared.Strategy.VLM, + "vlm_model": "gpt-4o", + "vlm_model_provider": "openai", "split_pdf_page": True, "split_pdf_allow_failed": True, "split_pdf_concurrency_level": 15 @@ -205,8 +196,9 @@ For better OCR results, you can specify what languages your document is in using -H 'Content-Type: multipart/form-data' \ -H 'unstructured-api-key: $UNSTRUCTURED_API_KEY' \ -F 'files=@sample-docs/korean.png' \ - -F 'strategy=ocr_only' \ - -F 'languages=kor' + -F 'strategy=vlm' \ + -F 'vlm_model_provider=openai' \ + -F 'vlm_model=gpt-4o' -F 'languages=kor' ``` @@ -543,7 +535,10 @@ This can be helpful if you'd like to use the IDs as a primary key in a database, -H 'Content-Type: multipart/form-data' \ -H 'unstructured-api-key: $UNSTRUCTURED_API_KEY' \ -F 'files=@sample-docs/layout-parser-paper-fast.pdf' \ - -F 'unique_element_ids=true' + -F 'unique_element_ids=true' \ + -F 'strategy=vlm' \ + -F 'vlm_model_provider=openai' \ + -F 'vlm_model=gpt-4o' ``` @@ -566,7 +561,9 @@ This can be helpful if you'd like to use the IDs as a primary key in a database, "content": open(filename, "rb"), "file_name": os.path.basename(filename), }, - "strategy": shared.Strategy.HI_RES, + "strategy": shared.Strategy.VLM, + "vlm_model": "gpt-4o", + "vlm_model_provider": "openai", "unique_element_ids": True, "split_pdf_page": True, 
"split_pdf_allow_failed": True, @@ -713,7 +710,9 @@ By default, the `chunking_strategy` is set to `None`, and no chunking is perform -F 'files=@sample-docs/layout-parser-paper-fast.pdf' \ -F 'chunking_strategy=by_title' \ -F 'max_characters=1024' \ - -F 'strategy=hi_res' + -F 'strategy=vlm' \ + -F 'vlm_model_provider=openai' \ + -F 'vlm_model=gpt-4o' ``` @@ -738,7 +737,9 @@ By default, the `chunking_strategy` is set to `None`, and no chunking is perform }, "chunking_strategy": "by_title", "max_characters": 1024, - "strategy": shared.Strategy.HI_RES, + "strategy": shared.Strategy.VLM, + "vlm_model": "gpt-4o", + "vlm_model_provider": "openai", "split_pdf_page": True, "split_pdf_allow_failed": True, "split_pdf_concurrency_level": 15 diff --git a/snippets/how-to-api/extract_image_block_types.py.mdx b/snippets/how-to-api/extract_image_block_types.py.mdx index 30cea8ff..3f414540 100644 --- a/snippets/how-to-api/extract_image_block_types.py.mdx +++ b/snippets/how-to-api/extract_image_block_types.py.mdx @@ -28,7 +28,9 @@ if __name__ == "__main__": request = operations.PartitionRequest( shared.PartitionParameters( files=files, - strategy=shared.Strategy.HI_RES, + strategy=shared.Strategy.VLM, + vlm_model="gpt-4o", + vlm_model_provider="openai", split_pdf_page=True, split_pdf_allow_failed=True, split_pdf_concurrency_level=15, diff --git a/snippets/how-to-api/extract_text_as_html.py.mdx b/snippets/how-to-api/extract_text_as_html.py.mdx index e0b7a54e..e0669325 100644 --- a/snippets/how-to-api/extract_text_as_html.py.mdx +++ b/snippets/how-to-api/extract_text_as_html.py.mdx @@ -27,7 +27,9 @@ if __name__ == "__main__": request = operations.PartitionRequest( shared.PartitionParameters( files=files, - strategy=shared.Strategy.HI_RES, + strategy=shared.Strategy.VLM, + vlm_model="gpt-4o", + vlm_model_provider="openai", split_pdf_page=True, split_pdf_allow_failed=True, split_pdf_concurrency_level=15 diff --git a/snippets/how-to-api/get_chunked_elements.py.mdx 
b/snippets/how-to-api/get_chunked_elements.py.mdx index 8e8914ae..673edff3 100644 --- a/snippets/how-to-api/get_chunked_elements.py.mdx +++ b/snippets/how-to-api/get_chunked_elements.py.mdx @@ -32,7 +32,9 @@ with open(input_filepath, "rb") as f: req = operations.PartitionRequest( shared.PartitionParameters( files=files, - strategy=shared.Strategy.HI_RES, + strategy=shared.Strategy.VLM, + vlm_model="gpt-4o", + vlm_model_provider="openai", split_pdf_page=True, split_pdf_allow_failed=True, split_pdf_concurrency_level=15,