From ed42e06e4fc9aefd34591e55dc3f0c25f157cfb7 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Thu, 21 Nov 2024 16:45:10 -0800 Subject: [PATCH] API: LanceDB v2 destination connector --- .../ingest/destination-connector/lancedb.mdx | 23 ++++ mint.json | 2 + .../ingest/destination-connectors/lancedb.mdx | 27 +++++ snippets/dc-shared-text/lancedb-cli-api.mdx | 9 ++ .../destination_connectors/lancedb.sh.mdx | 69 +++++++++++ .../destination_connectors/lancedb.v2.py.mdx | 107 ++++++++++++++++++ .../general-shared-text/lancedb-cli-api.mdx | 37 ++++++ snippets/general-shared-text/lancedb.mdx | 65 +++++++++++ 8 files changed, 339 insertions(+) create mode 100644 api-reference/ingest/destination-connector/lancedb.mdx create mode 100644 open-source/ingest/destination-connectors/lancedb.mdx create mode 100644 snippets/dc-shared-text/lancedb-cli-api.mdx create mode 100644 snippets/destination_connectors/lancedb.sh.mdx create mode 100644 snippets/destination_connectors/lancedb.v2.py.mdx create mode 100644 snippets/general-shared-text/lancedb-cli-api.mdx create mode 100644 snippets/general-shared-text/lancedb.mdx diff --git a/api-reference/ingest/destination-connector/lancedb.mdx b/api-reference/ingest/destination-connector/lancedb.mdx new file mode 100644 index 00000000..a6e350fe --- /dev/null +++ b/api-reference/ingest/destination-connector/lancedb.mdx @@ -0,0 +1,23 @@ +--- +title: LanceDB +--- + +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedContentLanceDB from '/snippets/dc-shared-text/lancedb-cli-api.mdx'; +import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; + + + + +Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. 
This example uses the local source connector: + +import LanceDBAPISh from '/snippets/destination_connectors/lancedb.sh.mdx'; +import LanceDBAPIPyV2 from '/snippets/destination_connectors/lancedb.v2.py.mdx'; + + + + + \ No newline at end of file diff --git a/mint.json b/mint.json index 307125e4..281242ac 100644 --- a/mint.json +++ b/mint.json @@ -199,6 +199,7 @@ "open-source/ingest/destination-connectors/google-cloud-service", "open-source/ingest/destination-connectors/kafka", "open-source/ingest/destination-connectors/kdbai", + "open-source/ingest/destination-connectors/lancedb", "open-source/ingest/destination-connectors/local", "open-source/ingest/destination-connectors/milvus", "open-source/ingest/destination-connectors/mongodb", @@ -357,6 +358,7 @@ "api-reference/ingest/destination-connector/google-cloud-service", "api-reference/ingest/destination-connector/kafka", "api-reference/ingest/destination-connector/kdbai", + "api-reference/ingest/destination-connector/lancedb", "api-reference/ingest/destination-connector/local", "api-reference/ingest/destination-connector/milvus", "api-reference/ingest/destination-connector/mongodb", diff --git a/open-source/ingest/destination-connectors/lancedb.mdx b/open-source/ingest/destination-connectors/lancedb.mdx new file mode 100644 index 00000000..98056ed4 --- /dev/null +++ b/open-source/ingest/destination-connectors/lancedb.mdx @@ -0,0 +1,27 @@ +--- +title: LanceDB +--- + +import NewDocument from '/snippets/general-shared-text/new-document.mdx'; + + + +import SharedLanceDB from '/snippets/dc-shared-text/lancedb-cli-api.mdx'; + + + +Now call the Unstructured CLI or Python. The source connector can be any of the ones supported. This example uses the local source connector: + +This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page. 
+ +import LanceDBAPISh from '/snippets/destination_connectors/lancedb.sh.mdx'; +import LanceDBAPIPyV2 from '/snippets/destination_connectors/lancedb.v2.py.mdx'; + + + + + + +import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; + + \ No newline at end of file diff --git a/snippets/dc-shared-text/lancedb-cli-api.mdx b/snippets/dc-shared-text/lancedb-cli-api.mdx new file mode 100644 index 00000000..a1edce80 --- /dev/null +++ b/snippets/dc-shared-text/lancedb-cli-api.mdx @@ -0,0 +1,9 @@ +Batch process all your records to store structured outputs in LanceDB. + +You will need: + +import SharedLanceDB from '/snippets/general-shared-text/lancedb.mdx'; +import SharedLanceDBCLIAPI from '/snippets/general-shared-text/lancedb-cli-api.mdx'; + + + \ No newline at end of file diff --git a/snippets/destination_connectors/lancedb.sh.mdx b/snippets/destination_connectors/lancedb.sh.mdx new file mode 100644 index 00000000..dc8de378 --- /dev/null +++ b/snippets/destination_connectors/lancedb.sh.mdx @@ -0,0 +1,69 @@ +```bash CLI +#!/usr/bin/env bash + +# Chunking and embedding are optional. 
+ +# For LanceDB OSS with local data storage: +unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ + lancedb-local \ + --uri $LANCEDB_URI \ + --table-name $LANCEDB_TABLE + +# For LanceDB OSS with data storage in an Amazon S3 bucket: +unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ + lancedb-aws \ + --aws-access-key-id $AWS_ACCESS_KEY_ID \ + --aws-secret-access-key $AWS_SECRET_ACCESS_KEY \ + --uri $LANCEDB_URI \ + --table-name $LANCEDB_TABLE \ + --timeout 30s + +# For LanceDB OSS with data storage in an Azure Blob Storage account: +unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --chunking-strategy by_title \ + --embedding-provider huggingface \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ + lancedb-azure \ + --azure-storage-account-name $AZURE_STORAGE_ACCOUNT_NAME \ + --azure-storage-account-key $AZURE_STORAGE_ACCOUNT_KEY \ + --uri $LANCEDB_URI \ + --table-name $LANCEDB_TABLE \ + --timeout 30s + +# For LanceDB OSS with data storage in a Google Cloud Storage bucket: +unstructured-ingest \ + local \ + --input-path $LOCAL_FILE_INPUT_DIR \ + --chunking-strategy by_title \ + 
--embedding-provider huggingface \ + --partition-by-api \ + --api-key $UNSTRUCTURED_API_KEY \ + --partition-endpoint $UNSTRUCTURED_API_URL \ + --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \ + lancedb-gcs \ + --google-service-account-key $GCS_SERVICE_ACCOUNT_KEY \ + --uri $LANCEDB_URI \ + --table-name $LANCEDB_TABLE \ + --timeout 30s +``` \ No newline at end of file diff --git a/snippets/destination_connectors/lancedb.v2.py.mdx b/snippets/destination_connectors/lancedb.v2.py.mdx new file mode 100644 index 00000000..8db05131 --- /dev/null +++ b/snippets/destination_connectors/lancedb.v2.py.mdx @@ -0,0 +1,107 @@ +```python Python Ingest v2 +import os + +from unstructured_ingest.v2.pipeline.pipeline import Pipeline +from unstructured_ingest.v2.interfaces import ProcessorConfig + +from unstructured_ingest.v2.processes.connectors.local import ( + LocalIndexerConfig, + LocalDownloaderConfig, + LocalConnectionConfig +) +from unstructured_ingest.v2.processes.partitioner import PartitionerConfig +from unstructured_ingest.v2.processes.chunker import ChunkerConfig +from unstructured_ingest.v2.processes.embedder import EmbedderConfig + +# For LanceDB OSS with local data storage: +# from unstructured_ingest.v2.processes.connectors.lancedb.local import ( +# LanceDBLocalConnectionConfig, +# LanceDBLocalAccessConfig, +# LanceDBUploadStagerConfig, +# LanceDBUploaderConfig +# ) + +# For LanceDB OSS with data storage in an Amazon S3 bucket: +from unstructured_ingest.v2.processes.connectors.lancedb.aws import ( + LanceDBS3ConnectionConfig, + LanceDBS3AccessConfig, + LanceDBUploadStagerConfig, + LanceDBUploaderConfig +) + +# For LanceDB OSS with data storage in an Azure Blob Storage account: +# from unstructured_ingest.v2.processes.connectors.lancedb.azure import ( +# LanceDBAzureConnectionConfig, +# LanceDBAzureAccessConfig, +# LanceDBUploadStagerConfig, +# LanceDBUploaderConfig +# ) + +# For 
LanceDB OSS with data storage in a Google Cloud Storage bucket: +# from unstructured_ingest.v2.processes.connectors.lancedb.gcp import ( +# LanceDBGCSConnectionConfig, +# LanceDBGCSAccessConfig, +# LanceDBUploadStagerConfig, +# LanceDBUploaderConfig +# ) + +# Chunking and embedding are optional. + +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), + downloader_config=LocalDownloaderConfig(), + source_connection_config=LocalConnectionConfig(), + partitioner_config=PartitionerConfig( + partition_by_api=True, + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } + ), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + embedder_config=EmbedderConfig(embedding_provider="huggingface"), + + # For LanceDB OSS with local data storage: + # destination_connection_config=LanceDBLocalConnectionConfig( + # access_config=LanceDBLocalAccessConfig(), + # uri=os.getenv("LANCEDB_URI") + # ), + + # For LanceDB OSS with data storage in an Amazon S3 bucket: + destination_connection_config=LanceDBS3ConnectionConfig( + access_config=LanceDBS3AccessConfig( + aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"), + aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY") + ), + uri=os.getenv("LANCEDB_URI"), + timeout="30s" + ), + + # For LanceDB OSS with data storage in an Azure Blob Storage account: + # destination_connection_config=LanceDBAzureConnectionConfig( + # access_config=LanceDBAzureAccessConfig( + # azure_storage_account_name=os.getenv("AZURE_STORAGE_ACCOUNT_NAME"), + # azure_storage_account_key=os.getenv("AZURE_STORAGE_ACCOUNT_KEY") + # ), + # uri=os.getenv("LANCEDB_URI"), + # timeout="30s" + # ), + + # For LanceDB OSS with data storage in a Google Cloud Storage bucket: + # 
destination_connection_config=LanceDBGCSConnectionConfig( + # access_config=LanceDBGCSAccessConfig( + # google_service_account_key=os.getenv("GCS_SERVICE_ACCOUNT_KEY") + # ), + # uri=os.getenv("LANCEDB_URI"), + # timeout="30s" + # ), + + stager_config=LanceDBUploadStagerConfig(), + uploader_config=LanceDBUploaderConfig(table_name=os.getenv("LANCEDB_TABLE")) + ).run() +``` \ No newline at end of file diff --git a/snippets/general-shared-text/lancedb-cli-api.mdx b/snippets/general-shared-text/lancedb-cli-api.mdx new file mode 100644 index 00000000..25499b84 --- /dev/null +++ b/snippets/general-shared-text/lancedb-cli-api.mdx @@ -0,0 +1,37 @@ +The LanceDB connector dependencies: + +```bash CLI, Python +pip install "unstructured-ingest[lancedb]" +``` + +import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx'; + + + +The following environment variables: + +- For LanceDB OSS with local data storage: + + - `LANCEDB_URI` - The local path to the folder where the LanceDB data is stored, represented by `--uri` (CLI) or `uri` (Python). + - `LANCEDB_TABLE` - The name of the target LanceDB table within the local data folder, represented by `--table-name` (CLI) or `table_name` (Python). + +- For LanceDB OSS with data storage in an Amazon S3 bucket: + + - `LANCEDB_URI` - The URI for the target Amazon S3 bucket and any target folder path within that bucket. Use the format `s3://<bucket-name>[/<folder-name>]`. This is represented by `--uri` (CLI) or `uri` (Python). + - `LANCEDB_TABLE` - The name of the target LanceDB table within the Amazon S3 bucket, represented by `--table-name` (CLI) or `table_name` (Python). + - `AWS_ACCESS_KEY_ID` - The AWS access key ID for the AWS IAM entity that has access to the Amazon S3 bucket, represented by `--aws-access-key-id` (CLI) or `aws_access_key_id` (Python). 
+ - `AWS_SECRET_ACCESS_KEY` - The AWS secret access key for the AWS IAM entity that has access to the Amazon S3 bucket, represented by `--aws-secret-access-key` (CLI) or `aws_secret_access_key` (Python). + +- For LanceDB OSS with data storage in an Azure Blob Storage account: + + - `LANCEDB_URI` - The URI for the target container within that Azure Blob Storage account and any target folder path within that container. Use the format `az://<container-name>[/<folder-name>]`. This is represented by `--uri` (CLI) or `uri` (Python). + - `LANCEDB_TABLE` - The name of the target LanceDB table within the Azure Blob Storage account, represented by `--table-name` (CLI) or `table_name` (Python). + - `AZURE_STORAGE_ACCOUNT_NAME` - The name of the target Azure Blob Storage account, represented by `--azure-storage-account-name` (CLI) or `azure_storage_account_name` (Python). + - `AZURE_STORAGE_ACCOUNT_KEY` - The access key for the Azure Blob Storage account, represented by `--azure-storage-account-key` (CLI) or `azure_storage_account_key` (Python). + +- For LanceDB OSS with data storage in a Google Cloud Storage bucket: + + - `LANCEDB_URI` - The URI for the target Google Cloud Storage bucket and any target folder path within that bucket. Use the format `gs://<bucket-name>[/<folder-name>]`. This is represented by `--uri` (CLI) or `uri` (Python). + - `LANCEDB_TABLE` - The name of the target LanceDB table within the Google Cloud Storage bucket, represented by `--table-name` (CLI) or `table_name` (Python). + - `GCS_SERVICE_ACCOUNT_KEY` - A single-line string that contains the contents of the downloaded service account key file for the Google Cloud service account + that has access to the Google Cloud Storage bucket, represented by `--google-service-account-key` (CLI) or `google_service_account_key` (Python). 
diff --git a/snippets/general-shared-text/lancedb.mdx b/snippets/general-shared-text/lancedb.mdx new file mode 100644 index 00000000..395f30be --- /dev/null +++ b/snippets/general-shared-text/lancedb.mdx @@ -0,0 +1,65 @@ +The LanceDB prerequisites: + +- A [LanceDB open source software (OSS) installation](https://lancedb.github.io/lancedb/basic/#installation) on a local machine, a server, or a virtual machine. + (LanceDB Cloud is not supported.) +- For LanceDB OSS with local data storage: + + - The local path to the folder where the LanceDB data is (or will be) stored. + See [Connect to a database](https://lancedb.github.io/lancedb/basic/#connect-to-a-database) in the LanceDB documentation. + - The name of the target [LanceDB table](https://lancedb.github.io/lancedb/basic/#create-an-empty-table) within the local data folder. + +- For LanceDB OSS with data storage in an Amazon S3 bucket: + + - The URI for the target Amazon S3 bucket and any target folder path within that bucket. Use the format `s3://<bucket-name>[/<folder-name>]`. + - The name of the target [LanceDB table](https://lancedb.github.io/lancedb/guides/storage/#object-stores) within the Amazon S3 bucket. + - The AWS access key ID and AWS secret access key for the AWS IAM entity that has access to the Amazon S3 bucket. + + For more information, see [AWS S3](https://lancedb.github.io/lancedb/guides/storage/#aws-s3) in the LanceDB documentation, along with the following video: + + + +- For LanceDB OSS with data storage in an Azure Blob Storage account: + + - The name of the target Azure Blob Storage account. + - The URI for the target container within that Azure Blob Storage account and any target folder path within that container. Use the format `az://<container-name>[/<folder-name>]`. + - The name of the target [LanceDB table](https://lancedb.github.io/lancedb/guides/storage/#object-stores) within the Azure Blob Storage account. + - The access key for the Azure Blob Storage account. 
+ + For more information, see [Azure Blob Storage](https://lancedb.github.io/lancedb/guides/storage/#azure-blob-storage) in the LanceDB documentation, along with the following video: + + + +- For LanceDB OSS with data storage in a Google Cloud Storage bucket: + + - The URI for the target Google Cloud Storage bucket and any target folder path within that bucket. Use the format `gs://<bucket-name>[/<folder-name>]`. + - The name of the target [LanceDB table](https://lancedb.github.io/lancedb/guides/storage/#object-stores) within the Google Cloud Storage bucket. + - A single-line string that contains the contents of the downloaded service account key file for the Google Cloud service account that has access to the + Google Cloud Storage bucket. + + For more information, see [Google Cloud Storage](https://lancedb.github.io/lancedb/guides/storage/#google-cloud-storage) in the LanceDB documentation, along with the following video: + + \ No newline at end of file