diff --git a/snippets/dc-shared-text/weaviate-cli-api.mdx b/snippets/dc-shared-text/weaviate-cli-api.mdx
index 5752baea..741ab823 100644
--- a/snippets/dc-shared-text/weaviate-cli-api.mdx
+++ b/snippets/dc-shared-text/weaviate-cli-api.mdx
@@ -1,15 +1,5 @@
Batch process all your records to store structured outputs in a Weaviate database.
-
-
You will need:
import SharedWeaviate from '/snippets/general-shared-text/weaviate.mdx';
diff --git a/snippets/destination_connectors/weaviate.sh.mdx b/snippets/destination_connectors/weaviate.sh.mdx
index 8f5d3e8f..d2803d97 100644
--- a/snippets/destination_connectors/weaviate.sh.mdx
+++ b/snippets/destination_connectors/weaviate.sh.mdx
@@ -3,22 +3,48 @@
# Chunking and embedding is optional.
+# For Weaviate installed locally:
unstructured-ingest \
local \
--input-path $LOCAL_FILE_INPUT_DIR \
- --output-dir $LOCAL_FILE_OUTPUT_DIR \
+ --partition-by-api \
+ --api-key $UNSTRUCTURED_API_KEY \
+ --partition-endpoint $UNSTRUCTURED_API_URL \
+ --chunking-strategy by_title \
+ --embedding-provider huggingface \
--strategy hi_res \
- --chunk-elements \
+ --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
+ weaviate-local \
+ --collection $WEAVIATE_COLLECTION
+
+# For Embedded Weaviate:
+unstructured-ingest \
+ local \
+ --input-path $LOCAL_FILE_INPUT_DIR \
+ --partition-by-api \
+ --api-key $UNSTRUCTURED_API_KEY \
+ --partition-endpoint $UNSTRUCTURED_API_URL \
+ --chunking-strategy by_title \
--embedding-provider huggingface \
- --num-processes 2 \
- --verbose \
- --strategy fast \
+ --strategy hi_res \
+ --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
+ weaviate-embedded \
+ --hostname $WEAVIATE_HOST \
+ --collection $WEAVIATE_COLLECTION
+
+# For Weaviate Cloud:
+unstructured-ingest \
+ local \
+ --input-path $LOCAL_FILE_INPUT_DIR \
--partition-by-api \
--api-key $UNSTRUCTURED_API_KEY \
--partition-endpoint $UNSTRUCTURED_API_URL \
+ --chunking-strategy by_title \
+ --embedding-provider huggingface \
+ --strategy hi_res \
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
- weaviate \
- --host-url $WEAVIATE_URL \
+ weaviate-cloud \
+ --cluster-url $WEAVIATE_CLUSTER_URL \
--api-key $WEAVIATE_API_KEY \
- --class-name $WEAVIATE_COLLECTION_CLASS_NAME
+ --collection $WEAVIATE_COLLECTION
```
diff --git a/snippets/destination_connectors/weaviate.v1.py.mdx b/snippets/destination_connectors/weaviate.v1.py.mdx
index 276a14e4..9d7a6706 100644
--- a/snippets/destination_connectors/weaviate.v1.py.mdx
+++ b/snippets/destination_connectors/weaviate.v1.py.mdx
@@ -1,4 +1,7 @@
```python Python Ingest v1
+# NOTE: Python Ingest v1 does not provide separate classes for
+# Weaviate installed locally, Embedded Weaviate, or Weaviate Cloud.
+
from unstructured_ingest.connector.local import SimpleLocalConfig
from unstructured_ingest.connector.weaviate import (
SimpleWeaviateConfig,
diff --git a/snippets/destination_connectors/weaviate.v2.py.mdx b/snippets/destination_connectors/weaviate.v2.py.mdx
index 7c4cb5d7..356954e5 100644
--- a/snippets/destination_connectors/weaviate.v2.py.mdx
+++ b/snippets/destination_connectors/weaviate.v2.py.mdx
@@ -4,22 +4,39 @@ import os
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
-from unstructured_ingest.v2.processes.connectors.weaviate import (
- WeaviateConnectionConfig,
- WeaviateAccessConfig,
- WeaviateUploaderConfig,
- WeaviateUploadStagerConfig
-)
from unstructured_ingest.v2.processes.connectors.local import (
LocalIndexerConfig,
LocalDownloaderConfig,
LocalConnectionConfig
)
+
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig
-# Chunking and embedding are optional.
+# For Weaviate installed locally:
+# from unstructured_ingest.v2.processes.connectors.weaviate.local import (
+# LocalWeaviateConnectionConfig,
+# LocalWeaviateAccessConfig,
+# LocalWeaviateUploadStagerConfig,
+# LocalWeaviateUploaderConfig
+# )
+
+# For Embedded Weaviate:
+# from unstructured_ingest.v2.processes.connectors.weaviate.embedded import (
+# EmbeddedWeaviateConnectionConfig,
+# EmbeddedWeaviateAccessConfig,
+# EmbeddedWeaviateUploadStagerConfig,
+# EmbeddedWeaviateUploaderConfig
+# )
+
+# For Weaviate Cloud:
+from unstructured_ingest.v2.processes.connectors.weaviate.cloud import (
+ CloudWeaviateConnectionConfig,
+ CloudWeaviateAccessConfig,
+ CloudWeaviateUploaderConfig,
+ CloudWeaviateUploadStagerConfig
+)
if __name__ == "__main__":
Pipeline.from_configs(
@@ -31,7 +48,6 @@ if __name__ == "__main__":
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
- strategy="hi_res",
additional_partition_args={
"split_pdf_page": True,
"split_pdf_allow_failed": True,
@@ -39,15 +55,41 @@ if __name__ == "__main__":
}
),
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
- embedder_config=EmbedderConfig(embedding_provider="huggingface"),
- destination_connection_config=WeaviateConnectionConfig(
- access_config=WeaviateAccessConfig(
+ embedder_config=EmbedderConfig(
+ embedding_provider="openai",
+ embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
+ embedding_api_key=os.getenv("OPENAI_APIKEY")
+ ),
+
+ # For Weaviate installed locally:
+ # destination_connection_config=LocalWeaviateConnectionConfig(
+ # access_config=LocalWeaviateAccessConfig()
+ # ),
+ # stager_config=LocalWeaviateUploadStagerConfig(),
+ # uploader_config=LocalWeaviateUploaderConfig(
+ # collection=os.getenv("WEAVIATE_COLLECTION")
+ # )
+
+ # For Embedded Weaviate:
+ # destination_connection_config=EmbeddedWeaviateConnectionConfig(
+ # access_config=EmbeddedWeaviateAccessConfig(),
+ # hostname=os.getenv("WEAVIATE_HOST")
+ # ),
+ # stager_config=EmbeddedWeaviateUploadStagerConfig(),
+ # uploader_config=EmbeddedWeaviateUploaderConfig(
+ # collection=os.getenv("WEAVIATE_COLLECTION")
+ # )
+
+ # For Weaviate Cloud:
+ destination_connection_config=CloudWeaviateConnectionConfig(
+ access_config=CloudWeaviateAccessConfig(
api_key=os.getenv("WEAVIATE_API_KEY")
),
- host_url=os.getenv("WEAVIATE_URL"),
- class_name=os.getenv("WEAVIATE_COLLECTION_CLASS_NAME")
+ cluster_url=os.getenv("WEAVIATE_CLUSTER_URL")
),
- stager_config=WeaviateUploadStagerConfig(),
- uploader_config=WeaviateUploaderConfig()
+ stager_config=CloudWeaviateUploadStagerConfig(),
+ uploader_config=CloudWeaviateUploaderConfig(
+ collection=os.getenv("WEAVIATE_COLLECTION")
+ )
).run()
```
\ No newline at end of file
diff --git a/snippets/general-shared-text/weaviate-cli-api.mdx b/snippets/general-shared-text/weaviate-cli-api.mdx
index f0e9954b..19121429 100644
--- a/snippets/general-shared-text/weaviate-cli-api.mdx
+++ b/snippets/general-shared-text/weaviate-cli-api.mdx
@@ -10,10 +10,17 @@ import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-d
The following environment variables:
-- `WEAVIATE_URL` - THE REST endpoint for the Weaviate database cluster, represented by `--host-url` (CLI) or `host_url` (Python).
+- For Weaviate installed locally, `WEAVIATE_COLLECTION` - The name of the target collection in the instance, represented by `--collection` (CLI) or `collection` (Python).
+- For Embedded Weaviate:
-- `WEAVIATE_API_KEY` - The API key for the database cluster, represented by `--api-key` (CLI) or `api_key` (Python).
+ - `WEAVIATE_HOST` - The connection URL to the instance, represented by `--hostname` (CLI) or `hostname` (Python).
+ - `WEAVIATE_COLLECTION` - The name of the target collection in the instance, represented by `--collection` (CLI) or `collection` (Python).
- For the CLI, the `--api-key` option here is part of the `weaviate` command. For Python, the `api_key` parameter here is part of the `WeaviateAccessConfig` object.
+- For Weaviate Cloud:
-- `WEAVIATE_COLLECTION_CLASS_NAME` - The name of the collection in the database, represented by `--class-name` (CLI) or `class_name` (Python).
\ No newline at end of file
+ - `WEAVIATE_CLUSTER_URL` - The REST endpoint for the Weaviate database cluster, represented by `--cluster-url` (CLI) or `cluster_url` (Python).
+ - `WEAVIATE_API_KEY` - The API key for the database cluster, represented by `--api-key` (CLI) or `api_key` (Python).
+
+ For the CLI, the `--api-key` option here is part of the `weaviate-cloud` command. For Python, the `api_key` parameter here is part of the `CloudWeaviateAccessConfig` object.
+
+ - `WEAVIATE_COLLECTION` - The name of the target collection in the database, represented by `--collection` (CLI) or `collection` (Python).
\ No newline at end of file
diff --git a/snippets/general-shared-text/weaviate.mdx b/snippets/general-shared-text/weaviate.mdx
index 990ec6b4..5fd08572 100644
--- a/snippets/general-shared-text/weaviate.mdx
+++ b/snippets/general-shared-text/weaviate.mdx
@@ -1,72 +1,84 @@
-The Weaviate prerequisites:
+The Weaviate prerequisites.
-
+- For the [Unstructured Platform](/platform/overview): only [Weaviate Cloud](https://weaviate.io/developers/wcs) clusters are supported.
+- For [Unstructured Ingest](/ingestion/overview): Weaviate Cloud clusters,
+ [Weaviate installed locally](https://weaviate.io/developers/weaviate/quickstart/local),
+ and [Embedded Weaviate](https://weaviate.io/developers/weaviate/installation/embedded) are supported.
+- For Weaviate installed locally, you will need the name of the target collection on the local instance.
+- For Embedded Weaviate, you will need the instance's connection URL and the name of the target collection on the instance.
+- For Weaviate Cloud, you will need:
-1. A Weaviate database instance. The following information assumes that you have a Weaviate Cloud (WCD) account with a Weaviate database cluster in that account.
- [Create a WCD account](https://weaviate.io/developers/wcs/quickstart#create-a-wcd-account). [Create a database cluster](https://weaviate.io/developers/wcs/quickstart#create-a-weaviate-cluster). For other database options, [learn more](https://weaviate.io/developers/weaviate/installation).
+ - A Weaviate database instance. The following information assumes that you have a Weaviate Cloud (WCD) account with a Weaviate database cluster in that account.
+ [Create a WCD account](https://weaviate.io/developers/wcs/quickstart#create-a-wcd-account). [Create a database cluster](https://weaviate.io/developers/wcs/quickstart#create-a-weaviate-cluster). For other database options, [learn more](https://weaviate.io/developers/weaviate/installation).
+ - The name of the target collection in the database. [Create a collection](https://weaviate.io/developers/wcs/tools/collections-tool).
+ - The URL and API key for the database cluster. [Get the URL and API key](https://weaviate.io/developers/wcs/quickstart#explore-the-details-panel).
-2. The URL and API key for the database cluster. [Get the URL and API key](https://weaviate.io/developers/wcs/quickstart#explore-the-details-panel).
+ The following video describes how to set up Weaviate Cloud for Unstructured.
-3. A collection in the database cluster. Note the name of the collection, also known as the collection's _class name_. [Create a collection](https://weaviate.io/developers/wcs/tools/collections-tool).
+
- The schema of the collection that you use must match the data that Unstructured writes to it. Otherwise, you might get unexpected results or errors.
- Unstructured cannot provide a schema that is guaranteed to work for everyone in all circumstances. This is because these schemas will vary based on
- your source files' types; how you want Unstructured to partition, chunk, and generate embeddings;
- any custom post-processing code that you run; and other factors.
+Weaviate requires the collection to have a data schema before you add data. However, you don't have to create a data schema manually.
+If you don't provide one, Weaviate generates a schema based on the incoming data.
- You can adapt the following collection schema example for your own needs:
+If you have specific schema requirements, you can instead create the schema manually.
+Unstructured cannot provide a schema that is guaranteed to work for everyone in all circumstances.
+This is because these schemas will vary based on
+your source files' types; how you want Unstructured to partition, chunk, and generate embeddings;
+any custom post-processing code that you run; and other factors.
- ```json
- {
- "class": "Elements",
- "properties": [
- {
- "name": "element_id",
- "dataType": ["text"]
- },
- {
- "name": "text",
- "dataType": ["text"]
- },
- {
- "name": "embeddings",
- "dataType": ["number[]"]
- },
- {
- "name": "metadata",
- "dataType": ["object"],
- "nestedProperties": [
- {
- "name": "parent_id",
- "dataType": ["text"]
- },
- {
- "name": "page_number",
- "dataType": ["text"]
- },
- {
- "name": "is_continuation",
- "dataType": ["boolean"]
- },
- {
- "name": "orig_elements",
- "dataType": ["text"]
- }
- ]
- }
- ]
- }
- ```
-
- See also :
+You can adapt the following collection schema example for your own specific schema requirements:
- - [Collection schema](https://weaviate.io/developers/weaviate/config-refs/schema)
- - [Unstructured document elements and metadata](/api-reference/api-services/document-elements)
\ No newline at end of file
+```json
+{
+ "class": "Elements",
+ "properties": [
+ {
+ "name": "element_id",
+ "dataType": ["text"]
+ },
+ {
+ "name": "text",
+ "dataType": ["text"]
+ },
+ {
+ "name": "embeddings",
+ "dataType": ["number[]"]
+ },
+ {
+ "name": "metadata",
+ "dataType": ["object"],
+ "nestedProperties": [
+ {
+ "name": "parent_id",
+ "dataType": ["text"]
+ },
+ {
+ "name": "page_number",
+ "dataType": ["text"]
+ },
+ {
+ "name": "is_continuation",
+ "dataType": ["boolean"]
+ },
+ {
+ "name": "orig_elements",
+ "dataType": ["text"]
+ }
+ ]
+ }
+ ]
+}
+```
+
+See also:
+
+- [Collection schema](https://weaviate.io/developers/weaviate/config-refs/schema)
+- [Unstructured document elements and metadata](/api-reference/api-services/document-elements)
\ No newline at end of file