From 7cd296109a3288f7907a343c52497f1c0410bbd9 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Mon, 13 Jan 2025 11:41:04 -0800 Subject: [PATCH 1/2] Ingest v2: Vectara destination connector --- .../ingest/destination-connector/vectara.mdx | 27 ++++++++- .../ingest/destination-connectors/vectara.mdx | 26 ++++++++- snippets/dc-shared-text/vectara-cli-api.mdx | 9 +++ snippets/dc-shared-text/vectara.mdx | 19 ------- .../destination_connectors/vectara.sh.mdx | 21 ++++--- .../{vectara.py.mdx => vectara.v1.py.mdx} | 4 +- .../destination_connectors/vectara.v2.py.mdx | 55 +++++++++++++++++++ .../general-shared-text/vectara-cli-api.mdx | 18 ++++++ snippets/general-shared-text/vectara.mdx | 4 ++ 9 files changed, 149 insertions(+), 34 deletions(-) create mode 100644 snippets/dc-shared-text/vectara-cli-api.mdx delete mode 100644 snippets/dc-shared-text/vectara.mdx rename snippets/destination_connectors/{vectara.py.mdx => vectara.v1.py.mdx} (93%) create mode 100644 snippets/destination_connectors/vectara.v2.py.mdx create mode 100644 snippets/general-shared-text/vectara-cli-api.mdx create mode 100644 snippets/general-shared-text/vectara.mdx diff --git a/api-reference/ingest/destination-connector/vectara.mdx b/api-reference/ingest/destination-connector/vectara.mdx index 5a60780f..c6509ac4 100644 --- a/api-reference/ingest/destination-connector/vectara.mdx +++ b/api-reference/ingest/destination-connector/vectara.mdx @@ -2,6 +2,29 @@ title: Vectara --- -import SharedVectara from '/snippets/dc-shared-text/vectara.mdx'; +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedContentVectara from '/snippets/dc-shared-text/vectara-cli-api.mdx'; +import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; + + + + +Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. 
+ +This example uses the local source connector: + +import VectaraAPISh from '/snippets/destination_connectors/vectara.sh.mdx'; +import VectaraAPIPyV2 from '/snippets/destination_connectors/vectara.v2.py.mdx'; +import VectaraAPIPyV1 from '/snippets/destination_connectors/vectara.v1.py.mdx'; + + + + + + + + - diff --git a/open-source/ingest/destination-connectors/vectara.mdx b/open-source/ingest/destination-connectors/vectara.mdx index 5a60780f..39ded018 100644 --- a/open-source/ingest/destination-connectors/vectara.mdx +++ b/open-source/ingest/destination-connectors/vectara.mdx @@ -2,6 +2,28 @@ title: Vectara --- -import SharedVectara from '/snippets/dc-shared-text/vectara.mdx'; + - +import SharedContentVectara from '/snippets/dc-shared-text/vectara-cli-api.mdx'; + + + +Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. + +This example uses the local source connector. + +This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. + +import VectaraAPISh from '/snippets/destination_connectors/vectara.sh.mdx'; +import VectaraAPIPyV2 from '/snippets/destination_connectors/vectara.v2.py.mdx'; +import VectaraAPIPyV1 from '/snippets/destination_connectors/vectara.v1.py.mdx'; + + + + + + + +import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; + + diff --git a/snippets/dc-shared-text/vectara-cli-api.mdx b/snippets/dc-shared-text/vectara-cli-api.mdx new file mode 100644 index 00000000..bc805a31 --- /dev/null +++ b/snippets/dc-shared-text/vectara-cli-api.mdx @@ -0,0 +1,9 @@ +Batch process all your records to store structured outputs in Vectara. + +The requirements are as follows. 
+ +import SharedVectara from '/snippets/general-shared-text/vectara.mdx'; +import SharedVectaraCLIAPI from '/snippets/general-shared-text/vectara-cli-api.mdx'; + + + \ No newline at end of file diff --git a/snippets/dc-shared-text/vectara.mdx b/snippets/dc-shared-text/vectara.mdx deleted file mode 100644 index 664de04b..00000000 --- a/snippets/dc-shared-text/vectara.mdx +++ /dev/null @@ -1,19 +0,0 @@ -Process all your records using `unstructured-ingest` to store structured outputs locally on your filesystem and upload those to a Vectara corpus. If you don’t yet have a Vectara account, [sign up](https://console.vectara.com/signup/) for one now. - -The upstream connector can be any of the ones supported, but for convenience here, showing a sample command using the upstream local connector. - -import VectaraSh from '/snippets/destination_connectors/vectara.sh.mdx'; -import VectaraPy from '/snippets/destination_connectors/vectara.py.mdx'; - - - - - - - - - - -For a full list of the options the Unstructured Ingest CLI accepts check `unstructured-ingest vectara --help`. - -NOTE: Keep in mind that you will need to have all the appropriate extras and dependencies for the file types of the documents contained in your data storage platform if you’re running this locally. You can find more information about this in the [installation guide](/open-source/installation/overview). diff --git a/snippets/destination_connectors/vectara.sh.mdx b/snippets/destination_connectors/vectara.sh.mdx index a82f8b7a..fe8379ab 100644 --- a/snippets/destination_connectors/vectara.sh.mdx +++ b/snippets/destination_connectors/vectara.sh.mdx @@ -1,19 +1,22 @@ -```bash Shell +```bash CLI #!/usr/bin/env bash -# Chunking is optional. +# Chunking and embedding are optional. 
unstructured-ingest \ local \ --input-path $LOCAL_FILE_INPUT_DIR \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --strategy hi_res \ - --chunk-elements \ - --num-processes 2 \ - --verbose \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ vectara \ --customer-id $VECTARA_CUSTOMER_ID \ + --corpus-name $VECTARA_CORPUS_NAME \ + --corpus-key $VECTARA_CORPUS_KEY \ --oauth-client-id $VECTARA_OAUTH_CLIENT_ID \ - --oauth-secret $VECTARA_OAUTH_SECRET \ - --corpus-name test-corpus-vectara + --oauth-secret $VECTARA_OAUTH_CLIENT_SECRET \ + --token-url $VECTARA_OAUTH_TOKEN_URL ``` diff --git a/snippets/destination_connectors/vectara.py.mdx b/snippets/destination_connectors/vectara.v1.py.mdx similarity index 93% rename from snippets/destination_connectors/vectara.py.mdx rename to snippets/destination_connectors/vectara.v1.py.mdx index 6d3da492..522fd0e5 100644 --- a/snippets/destination_connectors/vectara.py.mdx +++ b/snippets/destination_connectors/vectara.v1.py.mdx @@ -1,4 +1,4 @@ -```python Python +```python Python Ingest v1 import os from unstructured_ingest.connector.local import SimpleLocalConfig @@ -24,7 +24,7 @@ def get_writer() -> Writer: connector_config=SimpleVectaraConfig( access_config=VectaraAccessConfig( oauth_client_id=os.getenv("VECTARA_OAUTH_CLIENT_ID"), - oauth_secret=os.getenv("VECTARA_OAUTH_SECRET"), + oauth_secret=os.getenv("VECTARA_OAUTH_CLIENT_SECRET"), ), customer_id=os.getenv("VECTARA_CUSTOMER_ID"), corpus_name="test-corpus-vectara", diff --git a/snippets/destination_connectors/vectara.v2.py.mdx b/snippets/destination_connectors/vectara.v2.py.mdx new file mode 100644 index 00000000..2333578a --- /dev/null +++ b/snippets/destination_connectors/vectara.v2.py.mdx @@ -0,0 +1,55 @@ +```python Python 
Ingest v2 +import os + +from unstructured_ingest.v2.pipeline.pipeline import Pipeline +from unstructured_ingest.v2.interfaces import ProcessorConfig + +from unstructured_ingest.v2.processes.connectors.vectara import ( + VectaraAccessConfig, + VectaraConnectionConfig, + VectaraUploadStagerConfig, + VectaraUploaderConfig +) +from unstructured_ingest.v2.processes.connectors.local import ( + LocalIndexerConfig, + LocalConnectionConfig, + LocalDownloaderConfig +) +from unstructured_ingest.v2.processes.partitioner import PartitionerConfig +from unstructured_ingest.v2.processes.chunker import ChunkerConfig +from unstructured_ingest.v2.processes.embedder import EmbedderConfig + +# Chunking and embedding are optional. + +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), + downloader_config=LocalDownloaderConfig(), + source_connection_config=LocalConnectionConfig(), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + embedder_config=EmbedderConfig(embedding_provider="huggingface"), + destination_connection_config=VectaraConnectionConfig( + access_config=VectaraAccessConfig( + oauth_client_id=os.getenv("VECTARA_OAUTH_CLIENT_ID"), + oauth_secret=os.getenv("VECTARA_OAUTH_CLIENT_SECRET") + ), + customer_id=os.getenv("VECTARA_CUSTOMER_ID"), + corpus_name=os.getenv("VECTARA_CORPUS_NAME"), + corpus_key=os.getenv("VECTARA_CORPUS_KEY"), + token_url=os.getenv("VECTARA_OAUTH_TOKEN_URL") + ), + stager_config=VectaraUploadStagerConfig(), + uploader_config=VectaraUploaderConfig() + ).run() +``` \ No newline at end of file diff --git 
a/snippets/general-shared-text/vectara-cli-api.mdx b/snippets/general-shared-text/vectara-cli-api.mdx new file mode 100644 index 00000000..c60ee2b8 --- /dev/null +++ b/snippets/general-shared-text/vectara-cli-api.mdx @@ -0,0 +1,18 @@ +The Vectara connector dependencies. + +```bash +pip install "unstructured-ingest[vectara]" +``` + +import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx'; + + + +The following environment variables: + +- `VECTARA_CUSTOMER_ID` - The customer ID for the target Vectara account, represented by `--customer-id` (CLI) or `customer_id` (Python). +- `VECTARA_CORPUS_NAME` - The name of the target corpus in the account, represented by `--corpus-name` (CLI) or `corpus_name` (Python). +- `VECTARA_CORPUS_KEY` - The key of the target corpus in the account, represented by `--corpus-key` (CLI) or `corpus_key` (Python). +- `VECTARA_OAUTH_TOKEN_URL` - The OAuth token URL for getting and refreshing OAuth access tokens in the account, represented by `--token-url` (CLI) or `token_url` (Python). +- `VECTARA_OAUTH_CLIENT_ID` - A valid OAuth client ID in the account, represented by `--oauth-client-id` (CLI) or `oauth_client_id` (Python). +- `VECTARA_OAUTH_CLIENT_SECRET` - The OAuth client secret for the client ID, represented by `--oauth-secret` (CLI) or `oauth_secret` (Python). \ No newline at end of file diff --git a/snippets/general-shared-text/vectara.mdx b/snippets/general-shared-text/vectara.mdx new file mode 100644 index 00000000..33181cdd --- /dev/null +++ b/snippets/general-shared-text/vectara.mdx @@ -0,0 +1,4 @@ +- A [Vectara account](https://console.vectara.com/signup). +- The [customer ID](https://docs.vectara.com/docs/console-ui/vectara-console-overview#view-the-customer-id) for the account. +- The name and key for the target [corpus](https://docs.vectara.com/docs/console-ui/creating-a-corpus) in the account. 
+- The [OAuth authentication URL, client ID, and client secret](https://docs.vectara.com/docs/console-ui/app-clients) for accessing the target corpus. From 3e386ddca53269c94bf9f59a4d843b05b3ba99d0 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Mon, 13 Jan 2025 11:46:21 -0800 Subject: [PATCH 2/2] Add Vectara ingest dependency --- api-reference/ingest/ingest-dependencies.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/api-reference/ingest/ingest-dependencies.mdx b/api-reference/ingest/ingest-dependencies.mdx index fd4841c9..baa75ac0 100644 --- a/api-reference/ingest/ingest-dependencies.mdx +++ b/api-reference/ingest/ingest-dependencies.mdx @@ -86,6 +86,7 @@ To add support for additional connectors, run the following: | `pip install "unstructured-ingest[snowflake]"` | Snowflake | | `pip install "unstructured-ingest[sftp]"` | SFTP | | `pip install "unstructured-ingest[slack]"` | Slack | +| `pip install "unstructured-ingest[vectara]"` | Vectara | | `pip install "unstructured-ingest[wikipedia]"` | Wikipedia | | `pip install "unstructured-ingest[weaviate]"` | Weaviate |