From a7393634116910597ee81435382266e9d45f89b3 Mon Sep 17 00:00:00 2001
From: Paul Cornell <paul@unstructured.io>
Date: Thu, 6 Mar 2025 16:33:40 -0800
Subject: [PATCH] Ingest v2: Astra DB connectors - add missing
 parameters/options

---
 .../destination_connectors/astradb.sh.mdx     |  4 ++-
 .../destination_connectors/astradb.v2.py.mdx  | 12 ++++---
 .../general-shared-text/astradb-cli-api.mdx   | 10 +++++-
 snippets/source_connectors/astradb.sh.mdx     |  4 +--
 snippets/source_connectors/astradb.v2.py.mdx  | 34 +++++++++++++------
 5 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/snippets/destination_connectors/astradb.sh.mdx b/snippets/destination_connectors/astradb.sh.mdx
index df564bc5..fd1c44d3 100644
--- a/snippets/destination_connectors/astradb.sh.mdx
+++ b/snippets/destination_connectors/astradb.sh.mdx
@@ -17,5 +17,7 @@ unstructured-ingest \
   astradb \
     --api-endpoint $ASTRA_DB_API_ENDPOINT \
     --token $ASTRA_DB_APPLICATION_TOKEN \
+    --collection-name $ASTRA_DB_COLLECTION \
     --keyspace $ASTRA_DB_KEYSPACE \
-    --collection-name $ASTRA_DB_COLLECTION
+    --flatten-metadata
+    
\ No newline at end of file
diff --git a/snippets/destination_connectors/astradb.v2.py.mdx b/snippets/destination_connectors/astradb.v2.py.mdx
index e8198577..949d8267 100644
--- a/snippets/destination_connectors/astradb.v2.py.mdx
+++ b/snippets/destination_connectors/astradb.v2.py.mdx
@@ -23,7 +23,8 @@ from unstructured_ingest.v2.processes.embedder import EmbedderConfig
 
 if __name__ == "__main__":
     Pipeline.from_configs(
-        context=ProcessorConfig(),
+        context=ProcessorConfig(
+        ),
         indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")),
         downloader_config=LocalDownloaderConfig(),
         source_connection_config=LocalConnectionConfig(),
@@ -31,7 +32,6 @@ if __name__ == "__main__":
             partition_by_api=True,
             api_key=os.getenv("UNSTRUCTURED_API_KEY"),
             partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
-            strategy="hi_res",
             additional_partition_args={
                 "split_pdf_page": True,
                 "split_pdf_allow_failed": True,
@@ -46,10 +46,14 @@ if __name__ == "__main__":
                 token=os.getenv("ASTRA_DB_APPLICATION_TOKEN")
             )
         ),
-        stager_config=AstraDBUploadStagerConfig(),
+        stager_config=AstraDBUploadStagerConfig(
+            flatten_metadata=True
+        ),
         uploader_config=AstraDBUploaderConfig(
+            collection_name=os.getenv("ASTRA_DB_COLLECTION"),
             keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
-            collection_name=os.getenv("ASTRA_DB_COLLECTION")
+            batch_size=20,
+            record_id_key="record_id"
         )
     ).run()
 ```
\ No newline at end of file
diff --git a/snippets/general-shared-text/astradb-cli-api.mdx b/snippets/general-shared-text/astradb-cli-api.mdx
index 48aa4dc5..cf29ee31 100644
--- a/snippets/general-shared-text/astradb-cli-api.mdx
+++ b/snippets/general-shared-text/astradb-cli-api.mdx
@@ -13,4 +13,12 @@ These environment variables:
 - `ASTRA_DB_API_ENDPOINT` - The API endpoint for the Astra DB database, represented by `--api-endpoint` (CLI) or `api_endpoint` (Python). To get the endpoint, see the **Database Details > API Endpoint** value on your database's **Overview** tab.
 - `ASTRA_DB_APPLICATION_TOKEN` - The database application token value for the database, represented by `--token` (CLI) or `token` (Python). To get the token, see the **Database Details > Application Tokens** box on your database's **Overview** tab.
 - `ASTRA_DB_KEYSPACE` - The name of the keyspace for the database, represented by `--keyspace` (CLI) or `keyspace` (Python).
-- `ASTRA_DB_COLLECTION` - The name of the collection for the keyspace, represented by `--collection-name` (CLI) or `collection_name` (Python).
\ No newline at end of file
+- `ASTRA_DB_COLLECTION` - The name of the collection for the keyspace, represented by `--collection-name` (CLI) or `collection_name` (Python).
+
+Additional settings include:
+
+- For the source connector only, `--fields` (CLI) or `fields` (Python): Optionally, a comma-separated list (CLI) or an array of strings (Python) of fields 
+  to include in the output. The default is to include all fields, if not otherwise specified.
+- For the destination connector only, `--flatten-metadata` (CLI) or `flatten_metadata=True` (Python): Optionally, whether to "flatten" the metadata. Specifically, the metadata key values are 
+  brought to the top level of the element, and the `metadata` key itself is removed. To not flatten the metadata (the default), specify `--no-flatten-metadata` (CLI) or 
+  `flatten_metadata=False` (Python). The default is to not flatten the metadata if not otherwise specified.
diff --git a/snippets/source_connectors/astradb.sh.mdx b/snippets/source_connectors/astradb.sh.mdx
index b184cc8e..6f10859c 100644
--- a/snippets/source_connectors/astradb.sh.mdx
+++ b/snippets/source_connectors/astradb.sh.mdx
@@ -5,12 +5,12 @@ unstructured-ingest \
   astradb \
     --api-endpoint $ASTRA_DB_API_ENDPOINT \
     --token $ASTRA_DB_APPLICATION_TOKEN \
-    --keyspace $ASTRA_DB_KEYSPACE \
     --collection-name $ASTRA_DB_COLLECTION \
+    --keyspace $ASTRA_DB_KEYSPACE \
+    --fields record_id,content \
     --download-dir $LOCAL_FILE_DOWNLOAD_DIR \
     --partition-by-api \
     --api-key $UNSTRUCTURED_API_KEY \
     --partition-endpoint $UNSTRUCTURED_API_URL \
-    --strategy hi_res \
     --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}"
 ```
\ No newline at end of file
diff --git a/snippets/source_connectors/astradb.v2.py.mdx b/snippets/source_connectors/astradb.v2.py.mdx
index 4103ef02..1c4c31ff 100644
--- a/snippets/source_connectors/astradb.v2.py.mdx
+++ b/snippets/source_connectors/astradb.v2.py.mdx
@@ -3,14 +3,18 @@ import os
 
 from unstructured_ingest.v2.pipeline.pipeline import Pipeline
 from unstructured_ingest.v2.interfaces import ProcessorConfig
+
 from unstructured_ingest.v2.processes.connectors.astradb import (
-    AstraDBAccessConfig,
-    AstraDBConnectionConfig,
-    AstraDBDownloaderConfig,
     AstraDBIndexerConfig,
+    AstraDBDownloaderConfig,
+    AstraDBConnectionConfig,
+    AstraDBAccessConfig
 )
+
+from unstructured_ingest.v2.processes.connectors.local import LocalConnectionConfig
 from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
-from unstructured_ingest.v2.processes.connectors.local import LocalUploaderConfig
+from unstructured_ingest.v2.processes.chunker import ChunkerConfig
+from unstructured_ingest.v2.processes.embedder import EmbedderConfig
 
 # Chunking and embedding are optional.
 
@@ -19,23 +23,31 @@ if __name__ == "__main__":
         context=ProcessorConfig(),
         indexer_config=AstraDBIndexerConfig(
             collection_name=os.getenv("ASTRA_DB_COLLECTION"),
-            keyspace=os.getenv("ASTRA_DB_KEYSPACE"),           
+            keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
+            batch_size=20
         ),
         downloader_config=AstraDBDownloaderConfig(
-            collection_name=os.getenv("ASTRA_DB_COLLECTION"),
-            keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
+            download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR"),
+            fields=["record_id", "content"]
         ),
         source_connection_config=AstraDBConnectionConfig(
             access_config=AstraDBAccessConfig(
-                token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
                 api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
-            ),
+                token=os.getenv("ASTRA_DB_APPLICATION_TOKEN")
+            )
         ),
         partitioner_config=PartitionerConfig(
             partition_by_api=True,
-            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
             api_key=os.getenv("UNSTRUCTURED_API_KEY"),
+            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
+            additional_partition_args={
+                "split_pdf_page": True,
+                "split_pdf_allow_failed": True,
+                "split_pdf_concurrency_level": 15
+            }
         ),
-        uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")),
+        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
+        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
+        destination_connection_config=LocalConnectionConfig()
     ).run()
 ```
\ No newline at end of file