From e8aa63d03c5270a62f5bcb893b518905c6aa60fa Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Mon, 23 Sep 2024 11:24:01 -0400 Subject: [PATCH] chore: update openapi.json Copy over the latest API spec from https://api.unstructured.io/general/openapi.json, for use in our API playground. --- openapi.json | 755 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 503 insertions(+), 252 deletions(-) diff --git a/openapi.json b/openapi.json index 38275f60..e2526564 100644 --- a/openapi.json +++ b/openapi.json @@ -1,264 +1,515 @@ { - "openapi": "3.1.0", - "info": { - "title": "Unstructured Pipeline API", - "version": "0.0.1", - "summary": "Partition documents with the Unstructured library" - }, - "servers": [ - { - "url": "https://api.unstructured.io", - "description": "Hosted API", - "x-speakeasy-server-id": "prod" + "openapi": "3.1.0", + "info": { + "title": "Unstructured Pipeline API", + "version": "1.0.48" }, - { - "url": "http://localhost:8000", - "description": "Development server", - "x-speakeasy-server-id": "local" - } - ], - "x-speakeasy-retries": { - "strategy": "backoff", - "backoff": { - "initialInterval": 500, - "maxInterval": 60000, - "maxElapsedTime": 900000, - "exponent": 1.5 - }, - "statusCodes": ["5xx"], - "retryConnectionErrors": true - }, - "security": [{ "ApiKeyAuth": [] }], - "tags": [{ "name": "general" }], - "paths": { - "/general/v0/general": { - "post": { - "tags": ["general"], - "summary": "Pipeline 1", - "operationId": "partition", - "x-speakeasy-name-override": "partition", - "requestBody": { - "content": { - "multipart/form-data": { - "schema": { "$ref": "#/components/schemas/partition_parameters" } - } - } + "servers": [ + { + "url": "https://api.unstructuredapp.io", + "description": "Serverless SaaS API", + "x-speakeasy-server-id": "saas-api" }, - "responses": { - "200": { - "description": "Successful Response", - "content": { - "application/json": { - "schema": { "$ref": "#/components/schemas/Elements" } - } - } - }, - "422": { - "description": "Validation Error", - "content": { - "application/json": { - "schema": { "$ref": "#/components/schemas/HTTPValidationError" } - } - } - } + { + "url": "https://api.unstructured.io", + "description": "Hosted API Free", + "x-speakeasy-server-id": "free-api" }, - "x-codeSamples": [ - { - "lang": "python", - "label": "partition", - "source": "import unstructured_client\nfrom unstructured_client.models import shared\n\ns = unstructured_client.UnstructuredClient(\n api_key_auth=\"YOUR_API_KEY\",\n)\n\nreq = shared.PartitionParameters(\n chunking_strategy='by_title',\n combine_under_n_chars=500,\n encoding='utf-8',\n extract_image_block_types=[\n 'image',\n 'table',\n ],\n gz_uncompressed_content_type='application/pdf',\n hi_res_model_name='yolox',\n languages=[\n '[',\n 'e',\n 'n',\n 'g',\n ']',\n ],\n max_characters=1500,\n new_after_n_chars=1500,\n output_format='application/json',\n overlap=25,\n strategy='hi_res',\n)\n\nres = s.partition(req)\n\nif res.elements is not None:\n # handle response\n pass" - }, - { - "lang": "typescript", - "label": "partition", - "source": "import { openAsBlob } from \"node:fs\";\nimport { UnstructuredClient } from \"unstructured-client\";\n\nasync function run() {\n const sdk = new UnstructuredClient({\n security: {\n apiKeyAuth: \"YOUR_API_KEY\",\n },\n });\n\n const result = await sdk.partition({\n chunkingStrategy: \"by_title\",\n combineUnderNChars: 500,\n encoding: \"utf-8\",\n extractImageBlockTypes: [\n \"image\",\n \"table\",\n ],\n files: await openAsBlob(\"./sample-file\"),\n gzUncompressedContentType: \"application/pdf\",\n hiResModelName: \"yolox\",\n languages: [\n \"[\",\n \"e\",\n \"n\",\n \"g\",\n \"]\",\n ],\n maxCharacters: 1500,\n newAfterNChars: 1500,\n outputFormat: \"application/json\",\n overlap: 25,\n skipInferTableTypes: [\n \"pdf\",\n ],\n strategy: \"hi_res\",\n });\n\n // Handle the result\n console.log(result)\n}\n\nrun();" - } - ] - } - } - }, - "components": { - "securitySchemes": { - "ApiKeyAuth": { - "type": "apiKey", - "name": "unstructured-api-key", - "in": "header", - "x-speakeasy-example": "YOUR_API_KEY" - } - }, - "schemas": { - "Elements": { - "type": "array", - "items": { - "Element": { - "type": "object", - "properties": { - "type": {}, - "element_id": {}, - "metadata": {}, - "text": {} + { + "url": "http://localhost:8000", + "description": "Development server", + "x-speakeasy-server-id": "development" + } + ], + "paths": { + "/general/v0/general": { + "post": { + "tags": [ + "general" + ], + "summary": "Summary", + "description": "Description", + "operationId": "partition", + "parameters": [ + { + "name": "unstructured-api-key", + "in": "header", + "required": false, + "schema": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Unstructured-Api-Key" + } + } + ], + "requestBody": { + "required": true, + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/partition_parameters" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "items": { + "$ref": "#/components/schemas/Element" + }, + "description": "A list of element dictionaries extracted from the file", + "title": "Elements", + "type": "array" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + }, + "5XX": { + "description": "Server Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ServerError" + } + } + } + } + }, + "x-speakeasy-name-override": "partition" } - } } - }, - "partition_parameters": { - "properties": { - "files": { - "type": "string", - "format": "binary", - "description": "The file to extract", - "required": "true", - "example": { - "summary": "File to be partitioned", - "externalValue": "https://github.com/Unstructured-IO/unstructured-ingest/blob/main/example-docs/pdf/layout-parser-paper.pdf" + }, + "components": { + "schemas": { + "HTTPValidationError": { + "type": "object", + "properties": { + "detail": { + "oneOf": [ + { + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + }, + { + "type": "string" + } + ] + } + }, + "example": { + "detail": [ + { + "type": "int_parsing", + "loc": [ + "body", + "combine_under_n_chars" + ], + "msg": "Input should be a valid integer, unable to parse string as an integer", + "input": "forty" + } + ] + } + }, + "ValidationError": { + "properties": { + "loc": { + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + }, + "type": "array", + "title": "Location" + }, + "msg": { + "type": "string", + "title": "Message" + }, + "type": { + "type": "string", + "title": "Error Type" + } + }, + "type": "object", + "required": [ + "loc", + "msg", + "type" + ], + "title": "ValidationError" + }, + "partition_parameters": { + "properties": { + "files": { + "type": "string", + "format": "binary", + "description": "The file to extract", + "required": "true", + "examples": [ + { + "summary": "File to be partitioned", + "externalValue": "https://github.com/Unstructured-IO/unstructured/blob/98d3541909f64290b5efb65a226fc3ee8a7cc5ee/example-docs/layout-parser-paper.pdf" + } + ] + }, + "coordinates": { + "type": "boolean", + "title": "Coordinates", + "description": "If `True`, return coordinates for each element extracted via OCR. Default: `False`", + "default": false + }, + "content_type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Content type", + "description": "A hint about the content type to use (such as text/markdown), when there are problems processing a specific file. This value is a MIME type in the format type/subtype." + }, + "encoding": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Encoding", + "description": "The encoding method used to decode the text input. Default: utf-8" + }, + "extract_image_block_types": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Image block types to extract", + "description": "The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields.", + "default": [] + }, + "gz_uncompressed_content_type": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Uncompressed Content Type", + "description": "If file is gzipped, use this content type after unzipping." + }, + "hi_res_model_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "title": "Hi Res Model Name", + "description": "The name of the inference model used when strategy is hi_res" + }, + "include_page_breaks": { + "type": "boolean", + "title": "Include Page Breaks", + "description": "If true, the output will include page breaks if the filetype supports it. Default: false", + "default": false + }, + "languages": { + "items": { + "type": "string" + }, + "type": "array", + "title": "OCR Languages", + "description": "The languages present in the document, for use in partitioning and/or OCR. See the Tesseract documentation for a full list of languages.", + "default": [] + }, + "ocr_languages": { + "items": { + "type": "string" + }, + "type": "array", + "title": "OCR Languages", + "description": "Deprecated! The languages present in the document, for use in partitioning and/or OCR", + "default": [] + }, + "output_format": { + "type": "string", + "enum": [ + "application/json", + "text/csv" + ], + "title": "Output Format", + "description": "The format of the response. Supported formats are application/json and text/csv. Default: application/json.", + "default": "application/json", + "x-speakeasy-unknown-values": "allow" + }, + "pdf_infer_table_structure": { + "type": "boolean", + "title": "Pdf Infer Table Structure", + "description": "Deprecated! Use skip_infer_table_types to opt out of table extraction for any file type. If False and strategy=hi_res, no Table Elements will be extracted from pdf files regardless of skip_infer_table_types contents.", + "default": true + }, + "skip_infer_table_types": { + "items": { + "type": "string" + }, + "type": "array", + "title": "Skip Infer Table Types", + "description": "The document types that you want to skip table extraction with. Default: []", + "default": [] + }, + "starting_page_number": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "PDF Starting Page Number", + "description": "When PDF is split into pages before sending it into the API, providing this information will allow the page number to be assigned correctly. Introduced in 1.0.27." + }, + "strategy": { + "type": "string", + "enum": [ + "fast", + "hi_res", + "auto", + "ocr_only" + ], + "title": "Strategy", + "description": "The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: hi_res", + "default": "hi_res", + "examples": [ + "auto", + "hi_res" + ], + "x-speakeasy-unknown-values": "allow" + }, + "unique_element_ids": { + "type": "boolean", + "title": "unique_element_ids", + "description": "When `True`, assign UUIDs to element IDs, which guarantees their uniqueness (useful when using them as primary keys in database). Otherwise a SHA-256 of element text is used. Default: `False`", + "default": false + }, + "xml_keep_tags": { + "type": "boolean", + "title": "Xml Keep Tags", + "description": "If `True`, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to XML documents.", + "default": false + }, + "chunking_strategy": { + "anyOf": [ + { + "type": "string", + "enum": [ + "basic", + "by_page", + "by_similarity", + "by_title" + ] + }, + { + "type": "null" + } + ], + "title": "Chunking Strategy", + "description": "Use one of the supported strategies to chunk the returned elements after partitioning. When 'chunking_strategy' is not specified, no chunking is performed and any other chunking parameters provided are ignored. Supported strategies: 'basic', 'by_page', 'by_similarity', or 'by_title'", + "examples": [ + "by_title", + "basic" + ], + "x-speakeasy-unknown-values": "allow" + }, + "combine_under_n_chars": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Combine Under N Chars", + "description": "If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: 500" + }, + "include_orig_elements": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "title": "Include original elements in chunks", + "description": "When a chunking strategy is specified, each returned chunk will include the elements consolidated to form that chunk as `.metadata.orig_elements`. Default: true." + }, + "max_characters": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "Max Characters", + "description": "If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 500" + }, + "multipage_sections": { + "type": "boolean", + "title": "Multipage Sections", + "description": "If chunking strategy is set, determines if sections can span multiple sections. Default: true", + "default": true + }, + "new_after_n_chars": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "title": "New after n chars", + "description": "If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: 1500" + }, + "overlap": { + "type": "integer", + "title": "Overlap", + "description": "Specifies the length of a string ('tail') to be drawn from each chunk and prefixed to the next chunk as a context-preserving mechanism. By default, this only applies to split-chunks where an oversized element is divided into multiple chunks by text-splitting. Default: 0", + "default": 0 + }, + "overlap_all": { + "type": "boolean", + "title": "Overlap all", + "description": "When `True`, apply overlap between 'normal' chunks formed from whole elements and not subject to text-splitting. Use this with caution as it entails a certain level of 'pollution' of otherwise clean semantic chunk boundaries. Default: False", + "default": false + }, + "similarity_threshold": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "title": "similarity-threshold", + "description": "A value between 0.0 and 1.0 describing the minimum similarity two elements must have to be included in the same chunk. Note that similar elements may be separated to meet chunk-size criteria; this value can only guarantees that two elements with similarity below the threshold will appear in separate chunks." + }, + "include_slide_notes": { + "type": "boolean", + "title": "include_slide_notes", + "description": "When `True`, slide notes from .ppt and .pptx files will be included in the response. Default: `True`", + "default": true + } + }, + "type": "object", + "required": [ + "files" + ], + "title": "Partition Parameters" + }, + "Element": { + "title": "Element", + "type": "object", + "additionalProperties": true, + "example": { + "type": "Title", + "element_id": "6aa0ff22f91bbe7e26e8e25ca8052acd", + "text": "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis", + "metadata": { + "languages": [ + "eng" + ], + "page_number": 1, + "filename": "layout-parser-paper.pdf", + "filetype": "application/pdf" + } + } + }, + "ServerError": { + "type": "object", + "properties": { + "detail": { + "type": "string" + } + }, + "example": { + "detail": "An error occurred" + } } - }, - "strategy": { - "type": "string", - "title": "Strategy", - "description": "The strategy to use for partitioning PDF/image. Options are fast, hi_res, auto. Default: auto", - "example": "hi_res" - }, - "gz_uncompressed_content_type": { - "type": "string", - "title": "Uncompressed Content Type", - "description": "If file is gzipped, use this content type after unzipping", - "example": "application/pdf" - }, - "output_format": { - "type": "string", - "title": "Output Format", - "description": "The format of the response. Supported formats are application/json and text/csv. Default: application/json.", - "example": "application/json" - }, - "coordinates": { - "type": "boolean", - "title": "Coordinates", - "description": "If true, return coordinates for each element. Default: false" - }, - "encoding": { - "type": "string", - "title": "Encoding", - "description": "The encoding method used to decode the text input. Default: utf-8", - "example": "utf-8" - }, - "hi_res_model_name": { - "type": "string", - "title": "Hi Res Model Name", - "description": "The name of the inference model used when strategy is hi_res", - "example": "yolox" - }, - "include_page_breaks": { - "type": "boolean", - "title": "Include Page Breaks", - "description": "If True, the output will include page breaks if the filetype supports it. Default: false" - }, - "languages": { - "items": { "type": "string", "example": "eng" }, - "type": "array", - "title": "OCR Languages", - "default": [], - "description": "The languages present in the document, for use in partitioning and/or OCR", - "example": "[eng]" - }, - "pdf_infer_table_structure": { - "type": "boolean", - "title": "Pdf Infer Table Structure", - "description": "If True and strategy=hi_res, any Table Elements extracted from a PDF will include an additional metadata field, 'text_as_html', where the value (string) is a just a transformation of the data into an HTML ." - }, - "skip_infer_table_types": { - "items": { "type": "string", "example": "pdf" }, - "type": "array", - "title": "Skip Infer Table Types", - "description": "The document types that you want to skip table extraction with. Default: ['pdf', 'jpg', 'png']" - }, - "xml_keep_tags": { - "type": "boolean", - "title": "Xml Keep Tags", - "description": "If True, will retain the XML tags in the output. Otherwise it will simply extract the text from within the tags. Only applies to partition_xml." - }, - "chunking_strategy": { - "type": "string", - "title": "Chunking Strategy", - "description": "Use one of the supported strategies to chunk the returned elements. Currently supports: by_title", - "example": "by_title" - }, - "combine_under_n_chars": { - "type": "integer", - "title": "Combine Under N Chars", - "description": "If chunking strategy is set, combine elements until a section reaches a length of n chars. Default: max_characters", - "example": 500 - }, - "include_orig_elements": { - "type": "boolean", - "title": "Original-elements flag", - "description": "When True (the default), the elements used to form a chunk appear in `.metadata.orig_elements` for that chunk. Only applies when chunking is specified using the `chunking_strategy` argument." - }, - "max_characters": { - "type": "integer", - "title": "Max Characters", - "description": "If chunking strategy is set, cut off new sections after reaching a length of n chars (hard max). Default: 500", - "example": 1500 - }, - "multipage_sections": { - "type": "boolean", - "title": "Multipage Sections", - "description": "If chunking strategy is set, determines if sections can span multiple pages. Only applies to by_title chunking strategy.Default: true" - }, - "new_after_n_chars": { - "type": "integer", - "title": "New after n chars", - "description": "If chunking strategy is set, cut off new sections after reaching a length of n chars (soft max). Default: max_characters (off)", - "example": 1500 - }, - "overlap": { - "type": "integer", - "title": "Intra-chunk overlap", - "description": "A prefix of this many trailing characters from the prior text-split chunk is applied to second and later chunks formed from oversized elements by text-splitting. Default: None", - "example": 25 - }, - "overlap_all": { - "type": "boolean", - "title": "Inter-chunk overlap", - "description": "When True, overlap is also applied to 'normal' chunks formed by combining whole elements. Use with caution as this can introduce noise into otherwise clean semantic units. Default: None" - }, - "extract_image_block_types": { - "items": { "type": "string", "example": "image" }, - "type": "array", - "title": "Image block types to extract", - "default": [], - "description": "The types of elements to extract, for use in extracting image blocks as base64 encoded data stored in metadata fields", - "example": ["image", "table"] - } }, - "type": "object", - "title": "Partition Parameters" - }, - "HTTPValidationError": { - "properties": { - "detail": { - "items": { "$ref": "#/components/schemas/ValidationError" }, - "type": "array", - "title": "Detail" - } + "securitySchemes": { + "ApiKeyAuth": { + "type": "apiKey", + "name": "unstructured-api-key", + "in": "header", + "x-speakeasy-example": "YOUR_API_KEY" + } + } + }, + "tags": [ + { + "name": "general" + } + ], + "security": [ + { + "ApiKeyAuth": [] }, - "type": "object", - "title": "HTTPValidationError" - }, - "ValidationError": { - "properties": { - "loc": { - "items": { "oneOf": [{ "type": "string" }, { "type": "integer" }] }, - "type": "array", - "title": "Location" - }, - "msg": { "type": "string", "title": "Message" }, - "type": { "type": "string", "title": "Error Type" } + {} + ], + "x-speakeasy-retries": { + "strategy": "backoff", + "backoff": { + "initialInterval": 3000, + "maxInterval": 720000, + "maxElapsedTime": 1800000, + "exponent": 1.88 }, - "type": "object", - "required": ["loc", "msg", "type"], - "title": "ValidationError" - } + "statusCodes": [ + "502", + "503", + "504" + ], + "retryConnectionErrors": true } - } }