From a7393634116910597ee81435382266e9d45f89b3 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Thu, 6 Mar 2025 16:33:40 -0800 Subject: [PATCH] Ingest v2: Astra DB connectors - add missing parameters/options --- .../destination_connectors/astradb.sh.mdx | 4 ++- .../destination_connectors/astradb.v2.py.mdx | 12 ++++--- .../general-shared-text/astradb-cli-api.mdx | 10 +++++- snippets/source_connectors/astradb.sh.mdx | 4 +-- snippets/source_connectors/astradb.v2.py.mdx | 34 +++++++++++++------ 5 files changed, 45 insertions(+), 19 deletions(-) diff --git a/snippets/destination_connectors/astradb.sh.mdx b/snippets/destination_connectors/astradb.sh.mdx index df564bc5..fd1c44d3 100644 --- a/snippets/destination_connectors/astradb.sh.mdx +++ b/snippets/destination_connectors/astradb.sh.mdx @@ -17,5 +17,7 @@ unstructured-ingest \ astradb \ --api-endpoint $ASTRA_DB_API_ENDPOINT \ --token $ASTRA_DB_APPLICATION_TOKEN \ + --collection-name $ASTRA_DB_COLLECTION \ --keyspace $ASTRA_DB_KEYSPACE \ - --collection-name $ASTRA_DB_COLLECTION + --flatten-metadata + \ No newline at end of file diff --git a/snippets/destination_connectors/astradb.v2.py.mdx b/snippets/destination_connectors/astradb.v2.py.mdx index e8198577..949d8267 100644 --- a/snippets/destination_connectors/astradb.v2.py.mdx +++ b/snippets/destination_connectors/astradb.v2.py.mdx @@ -23,7 +23,8 @@ from unstructured_ingest.v2.processes.embedder import EmbedderConfig if __name__ == "__main__": Pipeline.from_configs( - context=ProcessorConfig(), + context=ProcessorConfig( + ), indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")), downloader_config=LocalDownloaderConfig(), source_connection_config=LocalConnectionConfig(), @@ -31,7 +32,6 @@ if __name__ == "__main__": partition_by_api=True, api_key=os.getenv("UNSTRUCTURED_API_KEY"), partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), - strategy="hi_res", additional_partition_args={ "split_pdf_page": True, "split_pdf_allow_failed": True, @@ -46,10 
+46,14 @@ if __name__ == "__main__": token=os.getenv("ASTRA_DB_APPLICATION_TOKEN") ) ), - stager_config=AstraDBUploadStagerConfig(), + stager_config=AstraDBUploadStagerConfig( + flatten_metadata=True + ), uploader_config=AstraDBUploaderConfig( + collection_name=os.getenv("ASTRA_DB_COLLECTION"), keyspace=os.getenv("ASTRA_DB_KEYSPACE"), - collection_name=os.getenv("ASTRA_DB_COLLECTION") + batch_size=20, + record_id_key="record_id" ) ).run() ``` \ No newline at end of file diff --git a/snippets/general-shared-text/astradb-cli-api.mdx b/snippets/general-shared-text/astradb-cli-api.mdx index 48aa4dc5..cf29ee31 100644 --- a/snippets/general-shared-text/astradb-cli-api.mdx +++ b/snippets/general-shared-text/astradb-cli-api.mdx @@ -13,4 +13,12 @@ These environment variables: - `ASTRA_DB_API_ENDPOINT` - The API endpoint for the Astra DB database, represented by `--api-endpoint` (CLI) or `api_endpoint` (Python). To get the endpoint, see the **Database Details > API Endpoint** value on your database's **Overview** tab. - `ASTRA_DB_APPLICATION_TOKEN` - The database application token value for the database, represented by `--token` (CLI) or `token` (Python). To get the token, see the **Database Details > Application Tokens** box on your database's **Overview** tab. - `ASTRA_DB_KEYSPACE` - The name of the keyspace for the database, represented by `--keyspace` (CLI) or `keyspace` (Python). -- `ASTRA_DB_COLLECTION` - The name of the collection for the keyspace, represented by `--collection-name` (CLI) or `collection_name` (Python). \ No newline at end of file +- `ASTRA_DB_COLLECTION` - The name of the collection for the keyspace, represented by `--collection-name` (CLI) or `collection_name` (Python). + +Additional settings include: + +- For the source connector only, `--fields` (CLI) or `fields` (Python): Optionally, a comma-separated list (CLI) or an array of strings (Python) of fields + to include in the output. The default is to include all fields, if not otherwise specified. 
+- For the destination connector only, `--flatten-metadata` (CLI) or `flatten_metadata=True` (Python): Optionally, whether to "flatten" the metadata. Specifically, the metadata key values are + brought to the top level of the element, and the `metadata` key itself is removed. To not flatten the metadata (the default), specify `--no-flatten-metadata` (CLI) or + `flatten_metadata=False` (Python). The default is to not flatten the metadata if not otherwise specified. diff --git a/snippets/source_connectors/astradb.sh.mdx b/snippets/source_connectors/astradb.sh.mdx index b184cc8e..6f10859c 100644 --- a/snippets/source_connectors/astradb.sh.mdx +++ b/snippets/source_connectors/astradb.sh.mdx @@ -5,12 +5,12 @@ unstructured-ingest \ astradb \ --api-endpoint $ASTRA_DB_API_ENDPOINT \ --token $ASTRA_DB_APPLICATION_TOKEN \ - --keyspace $ASTRA_DB_KEYSPACE \ --collection-name $ASTRA_DB_COLLECTION \ + --keyspace $ASTRA_DB_KEYSPACE \ + --fields record_id,content \ --download-dir $LOCAL_FILE_DOWNLOAD_DIR \ --partition-by-api \ --api-key $UNSTRUCTURED_API_KEY \ --partition-endpoint $UNSTRUCTURED_API_URL \ - --strategy hi_res \ --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" ``` \ No newline at end of file diff --git a/snippets/source_connectors/astradb.v2.py.mdx b/snippets/source_connectors/astradb.v2.py.mdx index 4103ef02..1c4c31ff 100644 --- a/snippets/source_connectors/astradb.v2.py.mdx +++ b/snippets/source_connectors/astradb.v2.py.mdx @@ -3,14 +3,18 @@ import os from unstructured_ingest.v2.pipeline.pipeline import Pipeline from unstructured_ingest.v2.interfaces import ProcessorConfig + from unstructured_ingest.v2.processes.connectors.astradb import ( - AstraDBAccessConfig, - AstraDBConnectionConfig, - AstraDBDownloaderConfig, AstraDBIndexerConfig, + AstraDBDownloaderConfig, + AstraDBConnectionConfig, + AstraDBAccessConfig ) + +from unstructured_ingest.v2.processes.connectors.local import 
LocalConnectionConfig from unstructured_ingest.v2.processes.partitioner import PartitionerConfig -from unstructured_ingest.v2.processes.connectors.local import LocalUploaderConfig +from unstructured_ingest.v2.processes.chunker import ChunkerConfig +from unstructured_ingest.v2.processes.embedder import EmbedderConfig # Chunking and embedding are optional. @@ -19,23 +23,31 @@ if __name__ == "__main__": context=ProcessorConfig(), indexer_config=AstraDBIndexerConfig( collection_name=os.getenv("ASTRA_DB_COLLECTION"), - keyspace=os.getenv("ASTRA_DB_KEYSPACE"), + keyspace=os.getenv("ASTRA_DB_KEYSPACE"), + batch_size=20 ), downloader_config=AstraDBDownloaderConfig( - collection_name=os.getenv("ASTRA_DB_COLLECTION"), - keyspace=os.getenv("ASTRA_DB_KEYSPACE"), + download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR"), + fields=["record_id", "content"] ), source_connection_config=AstraDBConnectionConfig( access_config=AstraDBAccessConfig( - token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"), api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"), - ), + token=os.getenv("ASTRA_DB_APPLICATION_TOKEN") + ) ), partitioner_config=PartitionerConfig( partition_by_api=True, - partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), api_key=os.getenv("UNSTRUCTURED_API_KEY"), + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + additional_partition_args={ + "split_pdf_page": True, + "split_pdf_allow_failed": True, + "split_pdf_concurrency_level": 15 + } ), - uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")), + chunker_config=ChunkerConfig(chunking_strategy="by_title"), + embedder_config=EmbedderConfig(embedding_provider="huggingface"), + destination_connection_config=LocalConnectionConfig() ).run() ``` \ No newline at end of file