From a5d9345644229f30372ce32ce27f7431fd14a4f0 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Thu, 21 Nov 2024 08:57:05 -0800 Subject: [PATCH 1/2] API: Qdrant v2 destination connector --- .../ingest/destination-connector/qdrant.mdx | 26 ++++++++- .../ingest/destination-connectors/qdrant.mdx | 26 ++++++++- snippets/dc-shared-text/qdrant-cli-api.mdx | 9 +++ snippets/dc-shared-text/qdrant.mdx | 27 --------- snippets/destination_connectors/qdrant.sh.mdx | 17 +++--- .../{qdrant.py.mdx => qdrant.v1.py.mdx} | 19 +++++-- .../destination_connectors/qdrant.v2.py.mdx | 55 +++++++++++++++++++ .../general-shared-text/qdrant-cli-api.mdx | 15 +++++ snippets/general-shared-text/qdrant.mdx | 13 +++++ 9 files changed, 165 insertions(+), 42 deletions(-) create mode 100644 snippets/dc-shared-text/qdrant-cli-api.mdx delete mode 100644 snippets/dc-shared-text/qdrant.mdx rename snippets/destination_connectors/{qdrant.py.mdx => qdrant.v1.py.mdx} (70%) create mode 100644 snippets/destination_connectors/qdrant.v2.py.mdx create mode 100644 snippets/general-shared-text/qdrant-cli-api.mdx create mode 100644 snippets/general-shared-text/qdrant.mdx diff --git a/api-reference/ingest/destination-connector/qdrant.mdx b/api-reference/ingest/destination-connector/qdrant.mdx index 9c276098..8cd1d7e6 100644 --- a/api-reference/ingest/destination-connector/qdrant.mdx +++ b/api-reference/ingest/destination-connector/qdrant.mdx @@ -2,6 +2,28 @@ title: Qdrant --- -import SharedQdrant from '/snippets/dc-shared-text/qdrant.mdx'; +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedContentQdrant from '/snippets/dc-shared-text/qdrant-cli-api.mdx'; +import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; + + + + +Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. 
+ +This example uses the local source connector: + +import QdrantAPISh from '/snippets/destination_connectors/qdrant.sh.mdx'; +import QdrantAPIPyV2 from '/snippets/destination_connectors/qdrant.v2.py.mdx'; +import QdrantAPIPyV1 from '/snippets/destination_connectors/qdrant.v1.py.mdx'; + + + + + + + - diff --git a/open-source/ingest/destination-connectors/qdrant.mdx b/open-source/ingest/destination-connectors/qdrant.mdx index 9c276098..e14e90b6 100644 --- a/open-source/ingest/destination-connectors/qdrant.mdx +++ b/open-source/ingest/destination-connectors/qdrant.mdx @@ -2,6 +2,28 @@ title: Qdrant --- -import SharedQdrant from '/snippets/dc-shared-text/qdrant.mdx'; + - +import SharedContentQdrant from '/snippets/dc-shared-text/qdrant-cli-api.mdx'; + + + +Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. + +This example uses the local source connector. + +This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. + +import QdrantAPISh from '/snippets/destination_connectors/qdrant.sh.mdx'; +import QdrantAPIPyV2 from '/snippets/destination_connectors/qdrant.v2.py.mdx'; +import QdrantAPIPyV1 from '/snippets/destination_connectors/qdrant.v1.py.mdx'; + + + + + + + +import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; + + diff --git a/snippets/dc-shared-text/qdrant-cli-api.mdx b/snippets/dc-shared-text/qdrant-cli-api.mdx new file mode 100644 index 00000000..3a7c39f7 --- /dev/null +++ b/snippets/dc-shared-text/qdrant-cli-api.mdx @@ -0,0 +1,9 @@ +Batch process all your records to store structured outputs in Qdrant. 
+ +You will need: + +import SharedQdrant from '/snippets/general-shared-text/qdrant.mdx'; +import SharedQdrantCLIAPI from '/snippets/general-shared-text/qdrant-cli-api.mdx'; + + + \ No newline at end of file diff --git a/snippets/dc-shared-text/qdrant.mdx b/snippets/dc-shared-text/qdrant.mdx deleted file mode 100644 index d615d052..00000000 --- a/snippets/dc-shared-text/qdrant.mdx +++ /dev/null @@ -1,27 +0,0 @@ -Batch process all your records using `unstructured-ingest` to store structured outputs and embeddings locally on your filesystem and upload those to a Qdrant collection. - -First you’ll need to install the Qdrant dependencies as shown here. - -```bash -pip install "unstructured-ingest[qdrant]" -``` - -Create a Qdrant collection with the appropriate configurations. Find more information in the [Qdrant collections guide](https://qdrant.tech/documentation/concepts/collections/). - -The upstream connector can be any of the ones supported, but for convenience here, showing a sample command using the upstream local connector. - -import QdrantSh from '/snippets/destination_connectors/qdrant.sh.mdx'; -import QdrantPy from '/snippets/destination_connectors/qdrant.py.mdx'; - - - - - - - - - - -For a full list of the options the Unstructured Ingest CLI accepts check `unstructured-ingest qdrant --help`. - -NOTE: Keep in mind that you will need to have all the appropriate extras and dependencies for the file types of the documents contained in your data storage platform if you’re running this locally. You can find more information about this in the [installation guide](/open-source/installation/overview). 
\ No newline at end of file diff --git a/snippets/destination_connectors/qdrant.sh.mdx b/snippets/destination_connectors/qdrant.sh.mdx index 2e8c24d5..a0a8f00c 100644 --- a/snippets/destination_connectors/qdrant.sh.mdx +++ b/snippets/destination_connectors/qdrant.sh.mdx @@ -1,4 +1,4 @@ -```bash Shell +```bash CLI #!/usr/bin/env bash # Chunking and embedding are optional. @@ -7,13 +7,16 @@ unstructured-ingest \ local \ --input-path $LOCAL_FILE_INPUT_DIR \ --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --strategy hi_res \ --chunk-elements \ --embedding-provider huggingface \ - --num-processes 2 \ - --verbose \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --strategy hi_res \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ qdrant \ - --collection-name $QDRANT_COLLECTION_NAME \ - --location http://localhost:6333 \ - --batch-size 80 + --url $QDRANT_URL \ + --collection-name $QDRANT_COLLECTION \ + --api-key $QDRANT_API_KEY \ + --batch-size 50 ``` diff --git a/snippets/destination_connectors/qdrant.py.mdx b/snippets/destination_connectors/qdrant.v1.py.mdx similarity index 70% rename from snippets/destination_connectors/qdrant.py.mdx rename to snippets/destination_connectors/qdrant.v1.py.mdx index 56a1e5ec..9da32ba4 100644 --- a/snippets/destination_connectors/qdrant.py.mdx +++ b/snippets/destination_connectors/qdrant.v1.py.mdx @@ -1,8 +1,11 @@ -```python Python +```python Python Ingest v1 +import os + from unstructured_ingest.connector.local import SimpleLocalConfig from unstructured_ingest.connector.qdrant import ( QdrantWriteConfig, SimpleQdrantConfig, + QdrantAccessConfig, ) from unstructured_ingest.interfaces import ( ChunkingConfig, @@ -15,12 +18,12 @@ from unstructured_ingest.runner import LocalRunner from unstructured_ingest.runner.writers.base_writer import Writer from unstructured_ingest.runner.writers.qdrant import 
QdrantWriter - def get_writer() -> Writer: return QdrantWriter( connector_config=SimpleQdrantConfig( - location="http://localhost:6333", - collection_name="test", + url=os.getenv("QDRANT_URL"), + access_config=QdrantAccessConfig(api_key=os.getenv("QDRANT_API_KEY")), + collection_name=os.getenv("QDRANT_COLLECTION"), ), write_config=QdrantWriteConfig(batch_size=80), ) @@ -40,7 +43,15 @@ if __name__ == "__main__": ), read_config=ReadConfig(), partition_config=PartitionConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), strategy="hi_res", + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } ), chunking_config=ChunkingConfig(chunk_elements=True), embedding_config=EmbeddingConfig( diff --git a/snippets/destination_connectors/qdrant.v2.py.mdx b/snippets/destination_connectors/qdrant.v2.py.mdx new file mode 100644 index 00000000..4588b5e7 --- /dev/null +++ b/snippets/destination_connectors/qdrant.v2.py.mdx @@ -0,0 +1,55 @@ +```python Python Ingest v2 +import os + +from unstructured_ingest.v2.pipeline.pipeline import Pipeline +from unstructured_ingest.v2.interfaces import ProcessorConfig + +from unstructured_ingest.v2.processes.connectors.local import ( + LocalIndexerConfig, + LocalDownloaderConfig, + LocalConnectionConfig +) +from unstructured_ingest.v2.processes.partitioner import PartitionerConfig +from unstructured_ingest.v2.processes.chunker import ChunkerConfig +from unstructured_ingest.v2.processes.embedder import EmbedderConfig + +from unstructured_ingest.v2.processes.connectors.qdrant.cloud import ( + CloudQdrantConnectionConfig, + CloudQdrantAccessConfig, + CloudQdrantUploaderConfig, + CloudQdrantUploadStagerConfig +) + +# Chunking and embedding are optional. 
+ +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), + downloader_config=LocalDownloaderConfig(), + source_connection_config=LocalConnectionConfig(), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + embedder_config=EmbedderConfig(embedding_provider="huggingface"), + destination_connection_config=CloudQdrantConnectionConfig( + access_config=CloudQdrantAccessConfig( + api_key=os.getenv("QDRANT_API_KEY") + ), + url=os.getenv("QDRANT_URL") + ), + stager_config=CloudQdrantUploadStagerConfig(), + uploader_config=CloudQdrantUploaderConfig( + collection_name=os.gentenv("QDRANT_COLLECTION"), + batch_size=50 + ) + ).run() +``` \ No newline at end of file diff --git a/snippets/general-shared-text/qdrant-cli-api.mdx b/snippets/general-shared-text/qdrant-cli-api.mdx new file mode 100644 index 00000000..547a792e --- /dev/null +++ b/snippets/general-shared-text/qdrant-cli-api.mdx @@ -0,0 +1,15 @@ +The Qdrant connector dependencies. + +```bash +pip install "unstructured-ingest[qdrant]" +``` + +import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx'; + + + +The following environment variables: + +- `QDRANT_URL` - The Qdrant cluster's URL, represented by `--url` (CLI) or `url` (Python). +- `QDRANT_API_KEY` - The Qdrant API key, represented by `--api-key` (CLI) or `api_key` (Python). +- `QDRANT_COLLECTION` - The name of the target collection on the cluster, represented by `--collection-name` (CLI) or `collection_name` (Python). 
\ No newline at end of file diff --git a/snippets/general-shared-text/qdrant.mdx b/snippets/general-shared-text/qdrant.mdx new file mode 100644 index 00000000..86cde6ea --- /dev/null +++ b/snippets/general-shared-text/qdrant.mdx @@ -0,0 +1,13 @@ +The Qdrant prerequisites: + +- A [Qdrant account](https://cloud.qdrant.io/login). +- A [Qdrant cluster](https://qdrant.tech/documentation/cloud/create-cluster/). +- The cluster's URL. To get this URL, do the following: + + 1. Sign in to your Qdrant Cloud account. + 2. On the sidebar, under **Dashboard**, click **Clusters**. + 3. Click the cluster's name. + 4. Note the value of the **Endpoint** field, for example `https://...cloud.qdrant.io`. + +- A [Qdrant API key](https://qdrant.tech/documentation/cloud/authentication/#create-api-keys). +- The name of the target [collection](https://qdrant.tech/documentation/concepts/collections) on the cluster. \ No newline at end of file From 123a118fedb1059145e5ee05c74172a0aa2aaba7 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Thu, 21 Nov 2024 12:02:21 -0800 Subject: [PATCH 2/2] Added Qdrant local and client-server options --- snippets/destination_connectors/qdrant.sh.mdx | 42 ++++++++++++++-- .../destination_connectors/qdrant.v1.py.mdx | 2 + .../destination_connectors/qdrant.v2.py.mdx | 50 +++++++++++++++++-- .../general-shared-text/qdrant-cli-api.mdx | 11 ++-- snippets/general-shared-text/qdrant.mdx | 25 ++++++---- 5 files changed, 109 insertions(+), 21 deletions(-) diff --git a/snippets/destination_connectors/qdrant.sh.mdx b/snippets/destination_connectors/qdrant.sh.mdx index a0a8f00c..37ca2d22 100644 --- a/snippets/destination_connectors/qdrant.sh.mdx +++ b/snippets/destination_connectors/qdrant.sh.mdx @@ -3,20 +3,52 @@ # Chunking and embedding are optional. 
+# For Qdrant local: unstructured-ingest \ local \ --input-path $LOCAL_FILE_INPUT_DIR \ - --output-dir $LOCAL_FILE_OUTPUT_DIR \ - --chunk-elements \ + --chunking-strategy by_title \ --embedding-provider huggingface \ --partition-by-api \ --api-key $UNSTRUCTURED_API_KEY \ --partition-endpoint $UNSTRUCTURED_API_URL \ - --strategy hi_res \ --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ - qdrant \ + qdrant-local \ + --path $QDRANT_PATH \ + --collection-name $QDRANT_COLLECTION \ + --batch-size 50 \ + --num-processes 1 + +# For Qdrant client-server: +unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ + qdrant-server \ --url $QDRANT_URL \ --collection-name $QDRANT_COLLECTION \ + --batch-size 50 \ + --num-processes 1 + +# For Qdrant cloud: +unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ + qdrant-cloud \ + --url $QDRANT_URL \ --api-key $QDRANT_API_KEY \ - --batch-size 50 + --collection-name $QDRANT_COLLECTION \ + --batch-size 50 \ + --num-processes 1 ``` diff --git a/snippets/destination_connectors/qdrant.v1.py.mdx b/snippets/destination_connectors/qdrant.v1.py.mdx index 9da32ba4..4a2d97f1 100644 --- a/snippets/destination_connectors/qdrant.v1.py.mdx +++ b/snippets/destination_connectors/qdrant.v1.py.mdx @@ -18,6 +18,8 @@ from 
unstructured_ingest.runner import LocalRunner from unstructured_ingest.runner.writers.base_writer import Writer from unstructured_ingest.runner.writers.qdrant import QdrantWriter +# This example uses Qdrant Cloud. + def get_writer() -> Writer: return QdrantWriter( connector_config=SimpleQdrantConfig( diff --git a/snippets/destination_connectors/qdrant.v2.py.mdx b/snippets/destination_connectors/qdrant.v2.py.mdx index 4588b5e7..f20f20d2 100644 --- a/snippets/destination_connectors/qdrant.v2.py.mdx +++ b/snippets/destination_connectors/qdrant.v2.py.mdx @@ -13,11 +13,28 @@ from unstructured_ingest.v2.processes.partitioner import PartitionerConfig from unstructured_ingest.v2.processes.chunker import ChunkerConfig from unstructured_ingest.v2.processes.embedder import EmbedderConfig +# For Qdrant local: +# from unstructured_ingest.v2.processes.connectors.qdrant.local import ( +# LocalQdrantConnectionConfig, +# LocalQdrantAccessConfig, +# LocalQdrantUploadStagerConfig, +# LocalQdrantUploaderConfig +# ) + +# For Qdrant client-server: +# from unstructured_ingest.v2.processes.connectors.qdrant.server import ( +# ServerQdrantConnectionConfig, +# ServerQdrantAccessConfig, +# ServerQdrantUploadStagerConfig, +# ServerQdrantUploaderConfig +# ) + +# For Qdrant Cloud: from unstructured_ingest.v2.processes.connectors.qdrant.cloud import ( CloudQdrantConnectionConfig, CloudQdrantAccessConfig, - CloudQdrantUploaderConfig, - CloudQdrantUploadStagerConfig + CloudQdrantUploadStagerConfig, + CloudQdrantUploaderConfig ) # Chunking and embedding are optional. 
@@ -40,6 +57,32 @@ if __name__ == "__main__": ), chunker_config=ChunkerConfig(chunking_strategy="by_title"), embedder_config=EmbedderConfig(embedding_provider="huggingface"), + + # For Qdrant local: + # destination_connection_config=LocalQdrantConnectionConfig( + # access_config=LocalQdrantAccessConfig(), + # path=os.getenv("QDRANT_PATH") + # ), + # stager_config=LocalQdrantUploadStagerConfig(), + # uploader_config=LocalQdrantUploaderConfig( + # collection_name=os.getenv("QDRANT_COLLECTION"), + # batch_size=50, + # num_processes=1 + # ) + + # For Qdrant client-server: + # destination_connection_config=ServerQdrantConnectionConfig( + # access_config=ServerQdrantAccessConfig(), + # url=os.getenv("QDRANT_URL") + # ), + # stager_config=ServerQdrantUploadStagerConfig(), + # uploader_config=ServerQdrantUploaderConfig( + # collection_name=os.getenv("QDRANT_COLLECTION"), + # batch_size=50, + # num_processes=1 + # ) + + # For Qdrant cloud: destination_connection_config=CloudQdrantConnectionConfig( access_config=CloudQdrantAccessConfig( api_key=os.getenv("QDRANT_API_KEY") @@ -49,7 +92,8 @@ if __name__ == "__main__": stager_config=CloudQdrantUploadStagerConfig(), uploader_config=CloudQdrantUploaderConfig( - collection_name=os.gentenv("QDRANT_COLLECTION"), - batch_size=50 + collection_name=os.getenv("QDRANT_COLLECTION"), + batch_size=50, + num_processes=1 ) ).run() ``` \ No newline at end of file diff --git a/snippets/general-shared-text/qdrant-cli-api.mdx b/snippets/general-shared-text/qdrant-cli-api.mdx index 547a792e..67374d40 100644 --- a/snippets/general-shared-text/qdrant-cli-api.mdx +++ b/snippets/general-shared-text/qdrant-cli-api.mdx @@ -10,6 +10,11 @@ import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-d The following environment variables: -- `QDRANT_URL` - The Qdrant cluster's URL, represented by `--url` (CLI) or `url` (Python). -- `QDRANT_API_KEY` - The Qdrant API key, represented by `--api-key` (CLI) or `api_key` (Python).
-- `QDRANT_COLLECTION` - The name of the target collection on the cluster, represented by `--collection-name` (CLI) or `collection_name` (Python). \ No newline at end of file +- `QDRANT_COLLECTION` - The name of the target collection on the Qdrant local installation, + Qdrant server, or Qdrant Cloud cluster, represented by `--collection-name` (CLI) or `collection_name` (Python). +- For Qdrant local, `QDRANT_PATH` - The path to the local Qdrant installation, represented by `--path` (CLI) or `path` (Python). +- For Qdrant client-server, `QDRANT_URL` - The Qdrant server's URL, represented by `--url` (CLI) or `url` (Python). +- For Qdrant Cloud: + + - `QDRANT_URL` - The Qdrant cluster's URL, represented by `--url` (CLI) or `url` (Python). + - `QDRANT_API_KEY` - The Qdrant API key, represented by `--api-key` (CLI) or `api_key` (Python). diff --git a/snippets/general-shared-text/qdrant.mdx b/snippets/general-shared-text/qdrant.mdx index 86cde6ea..772ca94e 100644 --- a/snippets/general-shared-text/qdrant.mdx +++ b/snippets/general-shared-text/qdrant.mdx @@ -1,13 +1,18 @@ -The Qdrant prerequisites: +The Qdrant prerequisites are as follows. -- A [Qdrant account](https://cloud.qdrant.io/login). -- A [Qdrant cluster](https://qdrant.tech/documentation/cloud/create-cluster/). -- The cluster's URL. To get this URL, do the following: +- The name of the target [collection](https://qdrant.tech/documentation/concepts/collections) on the Qdrant local installation, + Qdrant server, or Qdrant Cloud cluster. +- For [Qdrant local](https://github.com/qdrant/qdrant), the path to the local Qdrant installation, for example: `/qdrant/local` +- For [Qdrant client-server](https://qdrant.tech/documentation/quickstart/), the Qdrant server URL, for example: `http://localhost:6333` +- For [Qdrant Cloud](https://qdrant.tech/documentation/cloud-intro/): - 1. Sign in to your Qdrant Cloud account. - 2. On the sidebar, under **Dashboard**, click **Clusters**. - 3. Click the cluster's name. - 4. 
Note the value of the **Endpoint** field, for example `https://...cloud.qdrant.io`. + - A [Qdrant account](https://cloud.qdrant.io/login). + - A [Qdrant cluster](https://qdrant.tech/documentation/cloud/create-cluster/). + - The cluster's URL. To get this URL, do the following: -- A [Qdrant API key](https://qdrant.tech/documentation/cloud/authentication/#create-api-keys). -- The name of the target [collection](https://qdrant.tech/documentation/concepts/collections) on the cluster. \ No newline at end of file + 1. Sign in to your Qdrant Cloud account. + 2. On the sidebar, under **Dashboard**, click **Clusters**. + 3. Click the cluster's name. + 4. Note the value of the **Endpoint** field, for example: `https://...cloud.qdrant.io`. + + - A [Qdrant API key](https://qdrant.tech/documentation/cloud/authentication/#create-api-keys).