From 3ab1e4bba2530e1472971ee0bd79a93fb44f9529 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Tue, 26 Nov 2024 09:22:30 -0800 Subject: [PATCH] API: Weaviate v2 updates for local-, embedded-, and cloud-focused classes --- snippets/dc-shared-text/weaviate-cli-api.mdx | 10 -- .../destination_connectors/weaviate.sh.mdx | 42 +++++- .../destination_connectors/weaviate.v1.py.mdx | 3 + .../destination_connectors/weaviate.v2.py.mdx | 72 +++++++-- .../general-shared-text/weaviate-cli-api.mdx | 15 +- snippets/general-shared-text/weaviate.mdx | 140 ++++++++++-------- 6 files changed, 181 insertions(+), 101 deletions(-) diff --git a/snippets/dc-shared-text/weaviate-cli-api.mdx b/snippets/dc-shared-text/weaviate-cli-api.mdx index 5752baea..741ab823 100644 --- a/snippets/dc-shared-text/weaviate-cli-api.mdx +++ b/snippets/dc-shared-text/weaviate-cli-api.mdx @@ -1,15 +1,5 @@ Batch process all your records to store structured outputs in a Weaviate database. - - You will need: import SharedWeaviate from '/snippets/general-shared-text/weaviate.mdx'; diff --git a/snippets/destination_connectors/weaviate.sh.mdx b/snippets/destination_connectors/weaviate.sh.mdx index 8f5d3e8f..d2803d97 100644 --- a/snippets/destination_connectors/weaviate.sh.mdx +++ b/snippets/destination_connectors/weaviate.sh.mdx @@ -3,22 +3,48 @@ # Chunking and embedding is optional. 
+# For Weaviate installed locally: unstructured-ingest \ local \ --input-path $LOCAL_FILE_INPUT_DIR \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ --strategy hi_res \ - --chunk-elements \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ + weaviate-local \ + --collection $WEAVIATE_COLLECTION + +# For Embedded Weaviate: +unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --chunking-strategy by_title \ --embedding-provider huggingface \ - --num-processes 2 \ - --verbose \ - --strategy fast \ + --strategy hi_res \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ + weaviate-embedded \ + --hostname $WEAVIATE_HOST \ + --collection $WEAVIATE_COLLECTION + +# For Weaviate Cloud: +unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ --partition-by-api \ --api-key $UNSTRUCTURED_API_KEY \ --partition-endpoint $UNSTRUCTURED_API_URL \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --strategy hi_res \ --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ - weaviate \ - --host-url $WEAVIATE_URL \ + weaviate-cloud \ + --cluster-url $WEAVIATE_CLUSTER_URL \ --api-key $WEAVIATE_API_KEY \ - --class-name $WEAVIATE_COLLECTION_CLASS_NAME + --collection $WEAVIATE_COLLECTION ``` diff --git a/snippets/destination_connectors/weaviate.v1.py.mdx b/snippets/destination_connectors/weaviate.v1.py.mdx index 276a14e4..9d7a6706 100644 --- a/snippets/destination_connectors/weaviate.v1.py.mdx +++ 
b/snippets/destination_connectors/weaviate.v1.py.mdx @@ -1,4 +1,7 @@ ```python Python Ingest v1 +# NOTE: Python Ingest v1 does not provide separate classes for +# Weaviate installed locally, Embedded Weaviate, or Weaviate Cloud. + from unstructured_ingest.connector.local import SimpleLocalConfig from unstructured_ingest.connector.weaviate import ( SimpleWeaviateConfig, diff --git a/snippets/destination_connectors/weaviate.v2.py.mdx b/snippets/destination_connectors/weaviate.v2.py.mdx index 7c4cb5d7..356954e5 100644 --- a/snippets/destination_connectors/weaviate.v2.py.mdx +++ b/snippets/destination_connectors/weaviate.v2.py.mdx @@ -4,22 +4,39 @@ import os from unstructured_ingest.v2.pipeline.pipeline import Pipeline from unstructured_ingest.v2.interfaces import ProcessorConfig -from unstructured_ingest.v2.processes.connectors.weaviate import ( - WeaviateConnectionConfig, - WeaviateAccessConfig, - WeaviateUploaderConfig, - WeaviateUploadStagerConfig -) from unstructured_ingest.v2.processes.connectors.local import ( LocalIndexerConfig, LocalDownloaderConfig, LocalConnectionConfig ) + from unstructured_ingest.v2.processes.partitioner import PartitionerConfig from unstructured_ingest.v2.processes.chunker import ChunkerConfig from unstructured_ingest.v2.processes.embedder import EmbedderConfig -# Chunking and embedding are optional. 
+# For Weaviate installed locally: +# from unstructured_ingest.v2.processes.connectors.weaviate.local import ( +# LocalWeaviateConnectionConfig, +# LocalWeaviateAccessConfig, +# LocalWeaviateUploadStagerConfig, +# LocalWeaviateUploaderConfig +# ) + +# For Embedded Weaviate: +# from unstructured_ingest.v2.processes.connectors.weaviate.embedded import ( +# EmbeddedWeaviateConnectionConfig, +# EmbeddedWeaviateAccessConfig, +# EmbeddedWeaviateUploadStagerConfig, +# EmbeddedWeaviateUploaderConfig +# ) + +# For Weaviate Cloud: +from unstructured_ingest.v2.processes.connectors.weaviate.cloud import ( + CloudWeaviateConnectionConfig, + CloudWeaviateAccessConfig, + CloudWeaviateUploaderConfig, + CloudWeaviateUploadStagerConfig +) if __name__ == "__main__": Pipeline.from_configs( @@ -31,7 +48,6 @@ if __name__ == "__main__": partition_by_api=True, api_key=os.getenv("UNSTRUCTURED_API_KEY"), partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), - strategy="hi_res", additional_partition_args={ "split_pdf_page": True, "split_pdf_allow_failed": True, @@ -39,15 +55,41 @@ if __name__ == "__main__": } ), chunker_config=ChunkerConfig(chunking_strategy="by_title"), - embedder_config=EmbedderConfig(embedding_provider="huggingface"), - destination_connection_config=WeaviateConnectionConfig( - access_config=WeaviateAccessConfig( + embedder_config=EmbedderConfig( + embedding_provider="openai", + embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"), + embedding_api_key=os.getenv("OPENAI_APIKEY") + ), + + # For Weaviate installed locally: + # destination_connection_config=LocalWeaviateConnectionConfig( + # access_config=LocalWeaviateAccessConfig() + # ), + # stager_config=LocalWeaviateUploadStagerConfig(), + # uploader_config=LocalWeaviateUploaderConfig( + # collection=os.getenv("WEAVIATE_COLLECTION") + # ) + + # For Embedded Weaviate: + # destination_connection_config=EmbeddedWeaviateConnectionConfig( + # access_config=EmbeddedWeaviateAccessConfig(), + # 
hostname=os.getenv("WEAVIATE_HOST") + # ), + # stager_config=EmbeddedWeaviateUploadStagerConfig(), + # uploader_config=EmbeddedWeaviateUploaderConfig( + # collection=os.getenv("WEAVIATE_COLLECTION") + # ) + + # For Weaviate Cloud: + destination_connection_config=CloudWeaviateConnectionConfig( + access_config=CloudWeaviateAccessConfig( api_key=os.getenv("WEAVIATE_API_KEY") ), - host_url=os.getenv("WEAVIATE_URL"), - class_name=os.getenv("WEAVIATE_COLLECTION_CLASS_NAME") + cluster_url=os.getenv("WEAVIATE_CLUSTER_URL") ), - stager_config=WeaviateUploadStagerConfig(), - uploader_config=WeaviateUploaderConfig() + stager_config=CloudWeaviateUploadStagerConfig(), + uploader_config=CloudWeaviateUploaderConfig( + collection=os.getenv("WEAVIATE_COLLECTION") + ) ).run() ``` \ No newline at end of file diff --git a/snippets/general-shared-text/weaviate-cli-api.mdx b/snippets/general-shared-text/weaviate-cli-api.mdx index f0e9954b..19121429 100644 --- a/snippets/general-shared-text/weaviate-cli-api.mdx +++ b/snippets/general-shared-text/weaviate-cli-api.mdx @@ -10,10 +10,17 @@ import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-d The following environment variables: -- `WEAVIATE_URL` - THE REST endpoint for the Weaviate database cluster, represented by `--host-url` (CLI) or `host_url` (Python). +- For Weaviate installed locally, `WEAVIATE_COLLECTION` - The name of the target collection in the instance, represented by `--collection` (CLI) or `collection` (Python). +- For Embedded Weaviate: -- `WEAVIATE_API_KEY` - The API key for the database cluster, represented by `--api-key` (CLI) or `api_key` (Python). + - `WEAVIATE_HOST` - The connection URL to the instance, represented by `--hostname` (CLI) or `hostname` (Python). + - `WEAVIATE_COLLECTION` - The name of the target collection in the instance, represented by `--collection` (CLI) or `collection` (Python). - For the CLI, the `--api-key` option here is part of the `weaviate` command. 
For Python, the `api_key` parameter here is part of the `WeaviateAccessConfig` object. +- For Weaviate Cloud: -- `WEAVIATE_COLLECTION_CLASS_NAME` - The name of the collection in the database, represented by `--class-name` (CLI) or `class_name` (Python). \ No newline at end of file + - `WEAVIATE_CLUSTER_URL` - The REST endpoint for the Weaviate database cluster, represented by `--cluster-url` (CLI) or `cluster_url` (Python). + - `WEAVIATE_API_KEY` - The API key for the database cluster, represented by `--api-key` (CLI) or `api_key` (Python). + + For the CLI, the `--api-key` option here is part of the `weaviate-cloud` command. For Python, the `api_key` parameter here is part of the `CloudWeaviateAccessConfig` object. + + - `WEAVIATE_COLLECTION` - The name of the target collection in the database, represented by `--collection` (CLI) or `collection` (Python). \ No newline at end of file diff --git a/snippets/general-shared-text/weaviate.mdx b/snippets/general-shared-text/weaviate.mdx index 990ec6b4..5fd08572 100644 --- a/snippets/general-shared-text/weaviate.mdx +++ b/snippets/general-shared-text/weaviate.mdx @@ -1,72 +1,84 @@ -The Weaviate prerequisites: +The Weaviate prerequisites. - +- For the [Unstructured Platform](/platform/overview): only [Weaviate Cloud](https://weaviate.io/developers/wcs) clusters are supported. +- For [Unstructured Ingest](/ingestion/overview): Weaviate Cloud clusters, + [Weaviate installed locally](https://weaviate.io/developers/weaviate/quickstart/local), + and [Embedded Weaviate](https://weaviate.io/developers/weaviate/installation/embedded) are supported. +- For Weaviate installed locally, you will need the name of the target collection on the local instance. +- For Embedded Weaviate, you will need the instance's connection URL and the name of the target collection on the instance. +- For Weaviate Cloud, you will need: -1. A Weaviate database instance. 
The following information assumes that you have a Weaviate Cloud (WCD) account with a Weaviate database cluster in that account. - [Create a WCD account](https://weaviate.io/developers/wcs/quickstart#create-a-wcd-account). [Create a database cluster](https://weaviate.io/developers/wcs/quickstart#create-a-weaviate-cluster). For other database options, [learn more](https://weaviate.io/developers/weaviate/installation). + - A Weaviate database instance. The following information assumes that you have a Weaviate Cloud (WCD) account with a Weaviate database cluster in that account. + [Create a WCD account](https://weaviate.io/developers/wcs/quickstart#create-a-wcd-account). [Create a database cluster](https://weaviate.io/developers/wcs/quickstart#create-a-weaviate-cluster). For other database options, [learn more](https://weaviate.io/developers/weaviate/installation). + - The name of the target collection in the database. [Create a collection](https://weaviate.io/developers/wcs/tools/collections-tool). + - The URL and API key for the database cluster. [Get the URL and API key](https://weaviate.io/developers/wcs/quickstart#explore-the-details-panel). -2. The URL and API key for the database cluster. [Get the URL and API key](https://weaviate.io/developers/wcs/quickstart#explore-the-details-panel). + The following video describes how to set up Weaviate Cloud for Unstructured. -3. A collection in the database cluster. Note the name of the collection, also known as the collection's _class name_. [Create a collection](https://weaviate.io/developers/wcs/tools/collections-tool). + - The schema of the collection that you use must match the data that Unstructured writes to it. Otherwise, you might get unexpected results or errors. - Unstructured cannot provide a schema that is guaranteed to work for everyone in all circumstances. 
This is because these schemas will vary based on - your source files' types; how you want Unstructured to partition, chunk, and generate embeddings; - any custom post-processing code that you run; and other factors. +Weaviate requires the collection to have a data schema before you add data. However, you don't have to create a data schema manually. +If you don't provide one, Weaviate generates a schema based on the incoming data. - You can adapt the following collection schema example for your own needs: +However, if you have specific schema requirements, you can create the schema manually. +Unstructured cannot provide a schema that is guaranteed to work for everyone in all circumstances. +This is because these schemas will vary based on +your source files' types; how you want Unstructured to partition, chunk, and generate embeddings; +any custom post-processing code that you run; and other factors. - ```json - { - "class": "Elements", - "properties": [ - { - "name": "element_id", - "dataType": ["text"] - }, - { - "name": "text", - "dataType": ["text"] - }, - { - "name": "embeddings", - "dataType": ["number[]"] - }, - { - "name": "metadata", - "dataType": ["object"], - "nestedProperties": [ - { - "name": "parent_id", - "dataType": ["text"] - }, - { - "name": "page_number", - "dataType": ["text"] - }, - { - "name": "is_continuation", - "dataType": ["boolean"] - }, - { - "name": "orig_elements", - "dataType": ["text"] - } - ] - } - ] - } - ``` - - See also : +You can adapt the following collection schema example for your own specific schema requirements: - - [Collection schema](https://weaviate.io/developers/weaviate/config-refs/schema) - - [Unstructured document elements and metadata](/api-reference/api-services/document-elements) \ No newline at end of file +```json +{ + "class": "Elements", + "properties": [ + { + "name": "element_id", + "dataType": ["text"] + }, + { + "name": "text", + "dataType": ["text"] + }, + { + "name": "embeddings", + "dataType": 
["number[]"] + }, + { + "name": "metadata", + "dataType": ["object"], + "nestedProperties": [ + { + "name": "parent_id", + "dataType": ["text"] + }, + { + "name": "page_number", + "dataType": ["text"] + }, + { + "name": "is_continuation", + "dataType": ["boolean"] + }, + { + "name": "orig_elements", + "dataType": ["text"] + } + ] + } + ] +} +``` + +See also: + +- [Collection schema](https://weaviate.io/developers/weaviate/config-refs/schema) +- [Unstructured document elements and metadata](/api-reference/api-services/document-elements) \ No newline at end of file