From c100ceb437c324fd64ab7d52019d730e58dd4246 Mon Sep 17 00:00:00 2001 From: Eric Hare Date: Fri, 27 Sep 2024 11:32:40 -0700 Subject: [PATCH 1/3] feat: Update docs for Astra DB v2 connector updates --- .../destination_connectors/astradb.v2.py.mdx | 2 +- snippets/source_connectors/astradb.v1.py.mdx | 2 +- snippets/source_connectors/astradb.v2.py.mdx | 41 +++++++++++++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 snippets/source_connectors/astradb.v2.py.mdx diff --git a/snippets/destination_connectors/astradb.v2.py.mdx b/snippets/destination_connectors/astradb.v2.py.mdx index 1942f332..750c9d23 100644 --- a/snippets/destination_connectors/astradb.v2.py.mdx +++ b/snippets/destination_connectors/astradb.v2.py.mdx @@ -48,7 +48,7 @@ if __name__ == "__main__": ), stager_config=AstraDBUploadStagerConfig(), uploader_config=AstraDBUploaderConfig( - namespace=os.getenv("ASTRA_DB_NAMESPACE"), + keyspace=os.getenv("ASTRA_DB_KEYSPACE"), collection_name=os.getenv("ASTRA_DB_COLLECTION"), embedding_dimension=os.getenv("ASTRA_DB_EMBEDDING_DIMENSIONS") ) diff --git a/snippets/source_connectors/astradb.v1.py.mdx b/snippets/source_connectors/astradb.v1.py.mdx index 7a77959b..081986ef 100644 --- a/snippets/source_connectors/astradb.v1.py.mdx +++ b/snippets/source_connectors/astradb.v1.py.mdx @@ -29,7 +29,7 @@ if __name__ == "__main__": token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"), api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT") ), - namespace=os.getenv("ASTRA_DB_NAMESPACE"), + keyspace=os.getenv("ASTRA_DB_KEYSPACE"), collection_name=os.getenv("ASTRA_DB_COLLECTION") ) ).run() diff --git a/snippets/source_connectors/astradb.v2.py.mdx b/snippets/source_connectors/astradb.v2.py.mdx new file mode 100644 index 00000000..4103ef02 --- /dev/null +++ b/snippets/source_connectors/astradb.v2.py.mdx @@ -0,0 +1,41 @@ +```python Python Ingest v2 +import os + +from unstructured_ingest.v2.pipeline.pipeline import Pipeline +from unstructured_ingest.v2.interfaces import ProcessorConfig +from unstructured_ingest.v2.processes.connectors.astradb import ( + AstraDBAccessConfig, + AstraDBConnectionConfig, + AstraDBDownloaderConfig, + AstraDBIndexerConfig, +) +from unstructured_ingest.v2.processes.partitioner import PartitionerConfig +from unstructured_ingest.v2.processes.connectors.local import LocalUploaderConfig + +# Chunking and embedding are optional. + +if __name__ == "__main__": + Pipeline.from_configs( + context=ProcessorConfig(), + indexer_config=AstraDBIndexerConfig( + collection_name=os.getenv("ASTRA_DB_COLLECTION"), + keyspace=os.getenv("ASTRA_DB_KEYSPACE"), + ), + downloader_config=AstraDBDownloaderConfig( + collection_name=os.getenv("ASTRA_DB_COLLECTION"), + keyspace=os.getenv("ASTRA_DB_KEYSPACE"), + ), + source_connection_config=AstraDBConnectionConfig( + access_config=AstraDBAccessConfig( + token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"), + api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"), + ), + ), + partitioner_config=PartitionerConfig( + partition_by_api=True, + partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"), + api_key=os.getenv("UNSTRUCTURED_API_KEY"), + ), + uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")), + ).run() +``` \ No newline at end of file From 6b498268e9566ab99483b737197e0111e9189720 Mon Sep 17 00:00:00 2001 From: Eric Hare Date: Mon, 30 Sep 2024 08:21:19 -0700 Subject: [PATCH 2/3] Update astradb-cli-api.mdx --- snippets/general-shared-text/astradb-cli-api.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snippets/general-shared-text/astradb-cli-api.mdx b/snippets/general-shared-text/astradb-cli-api.mdx index 688f5f6a..b19b9d59 100644 --- a/snippets/general-shared-text/astradb-cli-api.mdx +++ b/snippets/general-shared-text/astradb-cli-api.mdx @@ -12,6 +12,6 @@ These environment variables: - `ASTRA_DB_API_ENDPOINT` - The API endpoint for the Astra DB database, represented by `--api-endpoint` (CLI) or `api_endpoint` (Python). To get the endpoint, see the **Database Details > API Endpoint** value on your database's **Overview** tab. - `ASTRA_DB_APPLICATION_TOKEN` - The database application token value for the database, represented by `--token` (CLI) or `token` (Python). To get the token, see the **Database Details > Application Tokens** box on your database's **Overview** tab. -- `ASTRA_DB_NAMESPACE` - The name of the namespace for the database, represented by `--namespace` (CLI) or `namespace` (Python). -- `ASTRA_DB_COLLECTION` - The name of the collection for the namespace, represented by `--collection-name` (CLI) or `collection_name` (Python). +- `ASTRA_DB_KEYSPACE` - The name of the keyspace for the database, represented by `--keyspace` (CLI) or `keyspace` (Python). +- `ASTRA_DB_COLLECTION` - The name of the collection for the keyspace, represented by `--collection-name` (CLI) or `collection_name` (Python). - `ASTRA_DB_EMBEDDING_DIMENSIONS` - The number of dimensions in the collection, represented by `--embedding-dimension` (CLI) or `embedding_dimension` (Python). \ No newline at end of file From f13d9590230149023d0786375fe81e6ef5bfb775 Mon Sep 17 00:00:00 2001 From: Paul Cornell Date: Fri, 18 Oct 2024 15:37:23 -0700 Subject: [PATCH 3/3] Minor updates for Astra DB --- api-reference/ingest/source-connectors/astradb.mdx | 5 ++--- open-source/ingest/source-connectors/astradb.mdx | 5 ++--- snippets/destination_connectors/astradb.sh.mdx | 2 +- snippets/destination_connectors/astradb.v1.py.mdx | 2 +- snippets/source_connectors/astradb.sh.mdx | 2 +- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/api-reference/ingest/source-connectors/astradb.mdx b/api-reference/ingest/source-connectors/astradb.mdx index 895f29e8..4f983793 100644 --- a/api-reference/ingest/source-connectors/astradb.mdx +++ b/api-reference/ingest/source-connectors/astradb.mdx @@ -15,12 +15,11 @@ import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx'; Now call the Unstructured CLI or Python SDK. The destination connector can be any of the ones supported. This example uses the local destination connector: import AstraDBAPISh from '/snippets/source_connectors/astradb.sh.mdx'; +import AstraDBAPIPyV2 from '/snippets/source_connectors/astradb.v2.py.mdx'; import AstraDBAPIPyV1 from '/snippets/source_connectors/astradb.v1.py.mdx'; - - + - \ No newline at end of file diff --git a/open-source/ingest/source-connectors/astradb.mdx b/open-source/ingest/source-connectors/astradb.mdx index 4e234123..5748d181 100644 --- a/open-source/ingest/source-connectors/astradb.mdx +++ b/open-source/ingest/source-connectors/astradb.mdx @@ -15,14 +15,13 @@ Now call the Unstructured CLI or Python. The destination connector can be any of This example sends data to Unstructured API services for processing by default. To process data locally instead, see the instructions at the end of this page. import AstraDBSh from '/snippets/source_connectors/astradb.sh.mdx'; +import AstraDBPyV2 from '/snippets/source_connectors/astradb.v2.py.mdx'; import AstraDBPyV1 from '/snippets/source_connectors/astradb.v1.py.mdx'; - - + - import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx'; diff --git a/snippets/destination_connectors/astradb.sh.mdx b/snippets/destination_connectors/astradb.sh.mdx index 12329475..77f50633 100644 --- a/snippets/destination_connectors/astradb.sh.mdx +++ b/snippets/destination_connectors/astradb.sh.mdx @@ -17,7 +17,7 @@ unstructured-ingest \ astradb \ --api-endpoint $ASTRA_DB_API_ENDPOINT \ --token $ASTRA_DB_APPLICATION_TOKEN \ - --namespace $ASTRA_DB_NAMESPACE \ + --keyspace $ASTRA_DB_KEYSPACE \ --collection-name $ASTRA_DB_COLLECTION \ --embedding-dimension $ASTRA_DB_EMBEDDING_DIMENSIONS ``` diff --git a/snippets/destination_connectors/astradb.v1.py.mdx b/snippets/destination_connectors/astradb.v1.py.mdx index f8681553..d534607f 100644 --- a/snippets/destination_connectors/astradb.v1.py.mdx +++ b/snippets/destination_connectors/astradb.v1.py.mdx @@ -28,7 +28,7 @@ def get_writer() -> Writer: api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"), token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"), ), - namespace=os.getenv("ASTRA_DB_NAMESPACE"), + keyspace=os.getenv("ASTRA_DB_KEYSPACE"), collection_name=os.getenv("ASTRA_DB_COLLECTION"), embedding_dimension=os.getenv("ASTRA_DB_EMBEDDING_DIMENSIONS"), ), diff --git a/snippets/source_connectors/astradb.sh.mdx b/snippets/source_connectors/astradb.sh.mdx index 64a8cad0..b184cc8e 100644 --- a/snippets/source_connectors/astradb.sh.mdx +++ b/snippets/source_connectors/astradb.sh.mdx @@ -5,7 +5,7 @@ unstructured-ingest \ astradb \ --api-endpoint $ASTRA_DB_API_ENDPOINT \ --token $ASTRA_DB_APPLICATION_TOKEN \ - --namespace $ASTRA_DB_NAMESPACE \ + --keyspace $ASTRA_DB_KEYSPACE \ --collection-name $ASTRA_DB_COLLECTION \ --download-dir $LOCAL_FILE_DOWNLOAD_DIR \ --partition-by-api \