Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions snippets/dc-shared-text/weaviate-cli-api.mdx
Original file line number Diff line number Diff line change
@@ -1,15 +1,5 @@
Batch process all your records to store structured outputs in a Weaviate database.

<iframe
width="560"
height="315"
src="https://www.youtube.com/embed/uqUrH8ksI0I"
title="YouTube video player"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
allowfullscreen
></iframe>

You will need:

import SharedWeaviate from '/snippets/general-shared-text/weaviate.mdx';
Expand Down
42 changes: 34 additions & 8 deletions snippets/destination_connectors/weaviate.sh.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,48 @@

# Chunking and embedding are optional.

# For Weaviate installed locally:
unstructured-ingest \
local \
--input-path $LOCAL_FILE_INPUT_DIR \
--output-dir $LOCAL_FILE_OUTPUT_DIR \
--partition-by-api \
--api-key $UNSTRUCTURED_API_KEY \
--partition-endpoint $UNSTRUCTURED_API_URL \
--chunking-strategy by_title \
--embedding-provider huggingface \
--strategy hi_res \
--chunk-elements \
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
weaviate-local \
--collection $WEAVIATE_COLLECTION

# For Embedded Weaviate:
unstructured-ingest \
local \
--input-path $LOCAL_FILE_INPUT_DIR \
--partition-by-api \
--api-key $UNSTRUCTURED_API_KEY \
--partition-endpoint $UNSTRUCTURED_API_URL \
--chunking-strategy by_title \
--embedding-provider huggingface \
--num-processes 2 \
--verbose \
--strategy fast \
--strategy hi_res \
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
weaviate-embedded \
--hostname $WEAVIATE_HOST \
--collection $WEAVIATE_COLLECTION

# For Weaviate Cloud:
unstructured-ingest \
local \
--input-path $LOCAL_FILE_INPUT_DIR \
--partition-by-api \
--api-key $UNSTRUCTURED_API_KEY \
--partition-endpoint $UNSTRUCTURED_API_URL \
--chunking-strategy by_title \
--embedding-provider huggingface \
--strategy hi_res \
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
weaviate \
--host-url $WEAVIATE_URL \
weaviate-cloud \
--cluster-url $WEAVIATE_URL \
--api-key $WEAVIATE_API_KEY \
--class-name $WEAVIATE_COLLECTION_CLASS_NAME
--collection $WEAVIATE_COLLECTION
```
3 changes: 3 additions & 0 deletions snippets/destination_connectors/weaviate.v1.py.mdx
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
```python Python Ingest v1
# NOTE: Python Ingest v1 does not provide separate classes for
# Weaviate installed locally, Embedded Weaviate, or Weaviate Cloud.

from unstructured_ingest.connector.local import SimpleLocalConfig
from unstructured_ingest.connector.weaviate import (
SimpleWeaviateConfig,
Expand Down
72 changes: 57 additions & 15 deletions snippets/destination_connectors/weaviate.v2.py.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,39 @@ import os
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig

from unstructured_ingest.v2.processes.connectors.weaviate import (
WeaviateConnectionConfig,
WeaviateAccessConfig,
WeaviateUploaderConfig,
WeaviateUploadStagerConfig
)
from unstructured_ingest.v2.processes.connectors.local import (
LocalIndexerConfig,
LocalDownloaderConfig,
LocalConnectionConfig
)

from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

# Chunking and embedding are optional.
# For Weaviate installed locally:
# from unstructured_ingest.v2.processes.connectors.weaviate.local import (
# LocalWeaviateConnectionConfig,
# LocalWeaviateAccessConfig,
# LocalWeaviateUploadStagerConfig,
# LocalWeaviateUploaderConfig
# )

# For Embedded Weaviate:
# from unstructured_ingest.v2.processes.connectors.weaviate.embedded import (
# EmbeddedWeaviateConnectionConfig,
# EmbeddedWeaviateAccessConfig,
# EmbeddedWeaviateUploadStagerConfig,
# EmbeddedWeaviateUploaderConfig
# )

# For Weaviate Cloud:
from unstructured_ingest.v2.processes.connectors.weaviate.cloud import (
CloudWeaviateConnectionConfig,
CloudWeaviateAccessConfig,
CloudWeaviateUploaderConfig,
CloudWeaviateUploadStagerConfig
)

if __name__ == "__main__":
Pipeline.from_configs(
Expand All @@ -31,23 +48,48 @@ if __name__ == "__main__":
partition_by_api=True,
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
strategy="hi_res",
additional_partition_args={
"split_pdf_page": True,
"split_pdf_allow_failed": True,
"split_pdf_concurrency_level": 15
}
),
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
destination_connection_config=WeaviateConnectionConfig(
access_config=WeaviateAccessConfig(
embedder_config=EmbedderConfig(
embedding_provider="openai",
embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
embedding_api_key=os.getenv("OPENAI_APIKEY")
),

# For Weaviate installed locally:
# destination_connection_config=LocalWeaviateConnectionConfig(
# access_config=LocalWeaviateAccessConfig()
# ),
# stager_config=LocalWeaviateUploadStagerConfig(),
# uploader_config=LocalWeaviateUploaderConfig(
# collection=os.getenv("WEAVIATE_COLLECTION")
# )

# For Embedded Weaviate:
# destination_connection_config=EmbeddedWeaviateConnectionConfig(
# access_config=EmbeddedWeaviateAccessConfig(),
# hostname=os.getenv("WEAVIATE_HOST")
# ),
# stager_config=EmbeddedWeaviateUploadStagerConfig(),
# uploader_config=EmbeddedWeaviateUploaderConfig(
# collection=os.getenv("WEAVIATE_COLLECTION")
# )

# For Weaviate Cloud:
destination_connection_config=CloudWeaviateConnectionConfig(
access_config=CloudWeaviateAccessConfig(
api_key=os.getenv("WEAVIATE_API_KEY")
),
host_url=os.getenv("WEAVIATE_URL"),
class_name=os.getenv("WEAVIATE_COLLECTION_CLASS_NAME")
cluster_url=os.getenv("WEAVIATE_CLUSTER_URL")
),
stager_config=WeaviateUploadStagerConfig(),
uploader_config=WeaviateUploaderConfig()
stager_config=CloudWeaviateUploadStagerConfig(),
uploader_config=CloudWeaviateUploaderConfig(
collection=os.getenv("WEAVIATE_COLLECTION")
)
).run()
```
15 changes: 11 additions & 4 deletions snippets/general-shared-text/weaviate-cli-api.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,17 @@ import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-d

The following environment variables:

- `WEAVIATE_URL` - The REST endpoint for the Weaviate database cluster, represented by `--host-url` (CLI) or `host_url` (Python).
- For Weaviate installed locally, `WEAVIATE_COLLECTION` - The name of the target collection in the instance, represented by `--collection` (CLI) or `collection` (Python).
- For Embedded Weaviate:

- `WEAVIATE_API_KEY` - The API key for the database cluster, represented by `--api-key` (CLI) or `api_key` (Python).
- `WEAVIATE_HOST` - The connection URL to the instance, represented by `--hostname` (CLI) or `hostname` (Python).
- `WEAVIATE_COLLECTION` - The name of the target collection in the instance, represented by `--collection` (CLI) or `collection` (Python).

<Note>For the CLI, the `--api-key` option here is part of the `weaviate` command. For Python, the `api_key` parameter here is part of the `WeaviateAccessConfig` object.</Note>
- For Weaviate Cloud:

- `WEAVIATE_COLLECTION_CLASS_NAME` - The name of the collection in the database, represented by `--class-name` (CLI) or `class_name` (Python).
- `WEAVIATE_CLUSTER_URL` - The REST endpoint for the Weaviate database cluster, represented by `--cluster-url` (CLI) or `cluster_url` (Python).
- `WEAVIATE_API_KEY` - The API key for the database cluster, represented by `--api-key` (CLI) or `api_key` (Python).

<Note>For the CLI, the `--api-key` option here is part of the `weaviate-cloud` command. For Python, the `api_key` parameter here is part of the `CloudWeaviateAccessConfig` object.</Note>

- `WEAVIATE_COLLECTION` - The name of the target collection in the database, represented by `--collection` (CLI) or `collection` (Python).
140 changes: 76 additions & 64 deletions snippets/general-shared-text/weaviate.mdx
Original file line number Diff line number Diff line change
@@ -1,72 +1,84 @@
The Weaviate prerequisites:
The Weaviate prerequisites.

<iframe
width="560"
height="315"
src="https://www.youtube.com/embed/Ldb7PZU-pR4"
title="YouTube video player"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
allowfullscreen
></iframe>
- For the [Unstructured Platform](/platform/overview): only [Weaviate Cloud](https://weaviate.io/developers/wcs) clusters are supported.
- For [Unstructured Ingest](/ingestion/overview): Weaviate Cloud clusters,
[Weaviate installed locally](https://weaviate.io/developers/weaviate/quickstart/local),
and [Embedded Weaviate](https://weaviate.io/developers/weaviate/installation/embedded) are supported.
- For Weaviate installed locally, you will need the name of the target collection on the local instance.
- For Embedded Weaviate, you will need the instance's connection URL and the name of the target collection on the instance.
- For Weaviate Cloud, you will need:

1. A Weaviate database instance. The following information assumes that you have a Weaviate Cloud (WCD) account with a Weaviate database cluster in that account.
[Create a WCD account](https://weaviate.io/developers/wcs/quickstart#create-a-wcd-account). [Create a database cluster](https://weaviate.io/developers/wcs/quickstart#create-a-weaviate-cluster). For other database options, [learn more](https://weaviate.io/developers/weaviate/installation).
- A Weaviate database instance. The following information assumes that you have a Weaviate Cloud (WCD) account with a Weaviate database cluster in that account.
[Create a WCD account](https://weaviate.io/developers/wcs/quickstart#create-a-wcd-account). [Create a database cluster](https://weaviate.io/developers/wcs/quickstart#create-a-weaviate-cluster). For other database options, [learn more](https://weaviate.io/developers/weaviate/installation).
- The name of the target collection in the database. [Create a collection](https://weaviate.io/developers/wcs/tools/collections-tool).
- The URL and API key for the database cluster. [Get the URL and API key](https://weaviate.io/developers/wcs/quickstart#explore-the-details-panel).

2. The URL and API key for the database cluster. [Get the URL and API key](https://weaviate.io/developers/wcs/quickstart#explore-the-details-panel).
The following video describes how to set up Weaviate Cloud for Unstructured.

3. A collection in the database cluster. Note the name of the collection, also known as the collection's _class name_. [Create a collection](https://weaviate.io/developers/wcs/tools/collections-tool).
<iframe
width="560"
height="315"
src="https://www.youtube.com/embed/Ldb7PZU-pR4"
title="YouTube video player"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
allowfullscreen
></iframe>

The schema of the collection that you use must match the data that Unstructured writes to it. Otherwise, you might get unexpected results or errors.
Unstructured cannot provide a schema that is guaranteed to work for everyone in all circumstances. This is because these schemas will vary based on
your source files' types; how you want Unstructured to partition, chunk, and generate embeddings;
any custom post-processing code that you run; and other factors.
Weaviate requires the collection to have a data schema before you add data. However, you don't have to create a data schema manually.
If you don't provide one, Weaviate generates a schema based on the incoming data.

You can adapt the following collection schema example for your own needs:
However, if you have specific schema requirements, you can create the schema manually.
Unstructured cannot provide a schema that is guaranteed to work for everyone in all circumstances.
This is because these schemas will vary based on
your source files' types; how you want Unstructured to partition, chunk, and generate embeddings;
any custom post-processing code that you run; and other factors.

```json
{
"class": "Elements",
"properties": [
{
"name": "element_id",
"dataType": ["text"]
},
{
"name": "text",
"dataType": ["text"]
},
{
"name": "embeddings",
"dataType": ["number[]"]
},
{
"name": "metadata",
"dataType": ["object"],
"nestedProperties": [
{
"name": "parent_id",
"dataType": ["text"]
},
{
"name": "page_number",
"dataType": ["text"]
},
{
"name": "is_continuation",
"dataType": ["boolean"]
},
{
"name": "orig_elements",
"dataType": ["text"]
}
]
}
]
}
```

See also:
You can adapt the following collection schema example for your own specific schema requirements:

- [Collection schema](https://weaviate.io/developers/weaviate/config-refs/schema)
- [Unstructured document elements and metadata](/api-reference/api-services/document-elements)
```json
{
"class": "Elements",
"properties": [
{
"name": "element_id",
"dataType": ["text"]
},
{
"name": "text",
"dataType": ["text"]
},
{
"name": "embeddings",
"dataType": ["number[]"]
},
{
"name": "metadata",
"dataType": ["object"],
"nestedProperties": [
{
"name": "parent_id",
"dataType": ["text"]
},
{
"name": "page_number",
"dataType": ["text"]
},
{
"name": "is_continuation",
"dataType": ["boolean"]
},
{
"name": "orig_elements",
"dataType": ["text"]
}
]
}
]
}
```

See also:

- [Collection schema](https://weaviate.io/developers/weaviate/config-refs/schema)
- [Unstructured document elements and metadata](/api-reference/api-services/document-elements)