From afd55017c857aabd803ba053cce798053058f5f9 Mon Sep 17 00:00:00 2001
From: Paul Cornell
Date: Mon, 23 Sep 2024 15:43:01 -0700
Subject: [PATCH] Databricks Volumes v2 destination connector: update authentication details

---
 .../databricks_volumes.sh.mdx           | 28 ++++-----
 .../databricks_volumes.v1.py.mdx        | 13 +++--
 .../databricks_volumes.v2.py.mdx        | 20 +++++--
 .../databricks-volumes-cli-api.mdx      | 58 +++++++++++++++++--
 .../databricks-volumes-platform.mdx     | 31 ++++++++--
 .../databricks-volumes.mdx              | 39 ++++++++++++-
 6 files changed, 155 insertions(+), 34 deletions(-)

diff --git a/snippets/destination_connectors/databricks_volumes.sh.mdx b/snippets/destination_connectors/databricks_volumes.sh.mdx
index a270376d..d411fa66 100644
--- a/snippets/destination_connectors/databricks_volumes.sh.mdx
+++ b/snippets/destination_connectors/databricks_volumes.sh.mdx
@@ -6,21 +6,23 @@
 unstructured-ingest \
   local \
     --input-path $LOCAL_FILE_INPUT_DIR \
-    --output-dir $LOCAL_FILE_OUTPUT_DIR \
-    --strategy hi_res \
-    --chunk-elements \
-    --embedding-provider langchain-huggingface \
-    --num-processes 2 \
-    --verbose \
-    --work-dir local-input \
     --partition-by-api \
-    --api-key $UNSTRUCTURED_API_KEY\
+    --api-key $UNSTRUCTURED_API_KEY \
     --partition-endpoint $UNSTRUCTURED_API_URL \
+    --strategy hi_res \
     --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
+    --chunk-by-api \
+    --chunking-strategy by_title \
+    --chunk-api-key $UNSTRUCTURED_API_KEY \
+    --chunking-endpoint $UNSTRUCTURED_API_URL \
+    --embedding-provider langchain-huggingface \
+    --embedding-model-name sentence-transformers/all-mpnet-base-v2 \
   databricks-volumes \
-    --host "$DATABRICKS_HOST" \
-    --username "$DATABRICKS_USERNAME" \
-    --password "$DATABRICKS_PASSWORD" \
-    --volume "$DATABRICKS_VOLUME" \
-    --catalog "$DATABRICKS_CATALOG"
+    --host $DATABRICKS_HOST \
+    --token $DATABRICKS_TOKEN \
+    --cluster-id $DATABRICKS_CLUSTER_ID \
+    --catalog $DATABRICKS_CATALOG \
+    --schema $DATABRICKS_SCHEMA \
+    --volume $DATABRICKS_VOLUME \
+    --volume-path $DATABRICKS_VOLUME_PATH
 ```
diff --git a/snippets/destination_connectors/databricks_volumes.v1.py.mdx b/snippets/destination_connectors/databricks_volumes.v1.py.mdx
index cd2e5658..e169ec48 100644
--- a/snippets/destination_connectors/databricks_volumes.v1.py.mdx
+++ b/snippets/destination_connectors/databricks_volumes.v1.py.mdx
@@ -26,13 +26,15 @@ def get_writer() -> Writer:
         connector_config=SimpleDatabricksVolumesConfig(
             host=os.getenv("DATABRICKS_HOST"),
             access_config=DatabricksVolumesAccessConfig(
-                username=os.getenv("DATABRICKS_USERNAME"),
-                password=os.getenv("DATABRICKS_PASSWORD")
+                token=os.getenv("DATABRICKS_TOKEN"),
+                cluster_id=os.getenv("DATABRICKS_CLUSTER_ID")
             ),
         ),
         write_config=DatabricksVolumesWriteConfig(
             catalog=os.getenv("DATABRICKS_CATALOG"),
+            schema=os.getenv("DATABRICKS_SCHEMA"),
             volume=os.getenv("DATABRICKS_VOLUME"),
+            volume_path=os.getenv("DATABRICKS_VOLUME_PATH")
         ),
     )
 
@@ -56,10 +58,13 @@ if __name__ == "__main__":
             partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
             strategy="hi_res",
         ),
-        chunking_config=ChunkingConfig(chunk_elements=True),
+        chunking_config=ChunkingConfig(
+            chunk_elements=True,
+            chunking_strategy="by_title",
+        ),
         embedding_config=EmbeddingConfig(
             provider="langchain-huggingface",
-            api_key=None,
+            model_name="sentence-transformers/all-mpnet-base-v2",
         ),
         writer=writer,
         writer_kwargs={},
diff --git a/snippets/destination_connectors/databricks_volumes.v2.py.mdx b/snippets/destination_connectors/databricks_volumes.v2.py.mdx
index fd258819..216a4504 100644
--- a/snippets/destination_connectors/databricks_volumes.v2.py.mdx
+++ b/snippets/destination_connectors/databricks_volumes.v2.py.mdx
@@ -37,18 +37,28 @@ if __name__ == "__main__":
                 "split_pdf_concurrency_level": 15
             }
         ),
-        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
-        embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"),
+        chunker_config=ChunkerConfig(
+            chunk_by_api=True,
+            chunk_api_key=os.getenv("UNSTRUCTURED_API_KEY"),
+            chunking_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
+            chunking_strategy="by_title"
+        ),
+        embedder_config=EmbedderConfig(
+            embedding_provider="langchain-huggingface",
+            embedding_model_name="sentence-transformers/all-mpnet-base-v2"
+        ),
         destination_connection_config=DatabricksVolumesConnectionConfig(
             access_config=DatabricksVolumesAccessConfig(
-                username=os.getenv("DATABRICKS_USERNAME"),
-                password=os.getenv("DATABRICKS_PASSWORD")
+                token=os.getenv("DATABRICKS_TOKEN"),
+                cluster_id=os.getenv("DATABRICKS_CLUSTER_ID")
             ),
             host=os.getenv("DATABRICKS_HOST")
         ),
         uploader_config=DatabricksVolumesUploaderConfig(
             catalog=os.getenv("DATABRICKS_CATALOG"),
-            volume=os.getenv("DATABRICKS_VOLUME")
+            schema=os.getenv("DATABRICKS_SCHEMA"),
+            volume=os.getenv("DATABRICKS_VOLUME"),
+            volume_path=os.getenv("DATABRICKS_VOLUME_PATH")
         )
     ).run()
 ```
\ No newline at end of file
diff --git a/snippets/general-shared-text/databricks-volumes-cli-api.mdx b/snippets/general-shared-text/databricks-volumes-cli-api.mdx
index b4e42243..a7c1d169 100644
--- a/snippets/general-shared-text/databricks-volumes-cli-api.mdx
+++ b/snippets/general-shared-text/databricks-volumes-cli-api.mdx
@@ -10,11 +10,61 @@ import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-d
 
 The following environment variables:
 
-- `DATABRICKS_HOST` - The Databricks compute resource's host name, represented by `--host` (CLI) or `host` (Python).
+- `DATABRICKS_HOST` - The Databricks host URL, represented by `--host` (CLI) or `host` (Python).
+- `DATABRICKS_CLUSTER_ID` - The Databricks compute resource ID, represented by `--cluster-id` (CLI) or `cluster_id` (Python).
 - `DATABRICKS_CATALOG` - The Databricks catalog name for the Volume, represented by `--catalog` (CLI) or `catalog` (Python).
+- `DATABRICKS_SCHEMA` - The Databricks schema name for the Volume, represented by `--schema` (CLI) or `schema` (Python). If not specified, `default` is used.
 - `DATABRICKS_VOLUME` - The Databricks Volume name, represented by `--volume` (CLI) or `volume` (Python).
+- `DATABRICKS_VOLUME_PATH` - Any optional path to access within the volume, represented by `--volume-path` (CLI) or `volume_path` (Python).
 
-Environment variables based on your authentication type, depending which types are supported by your cloud provider. For example, for username and password authentication:
+Environment variables based on your authentication type, depending on your cloud provider:
 
-- `DATABRICKS_USERNAME` - The Databricks account user's name, represented by `--username` (CLI) or `username` (Python).
-- `DATABRICKS_PASSWORD` - The Databricks account user's password, represented by `--password` (CLI) or `password` (Python).
+- For Databricks personal access token authentication (AWS, Azure, and GCP):
+
+    - `DATABRICKS_TOKEN` - The personal access token, represented by `--token` (CLI) or `token` (Python).
+
+- For username and password (basic) authentication (AWS only):
+
+    - `DATABRICKS_USERNAME` - The user's name, represented by `--username` (CLI) or `username` (Python).
+    - `DATABRICKS_PASSWORD` - The user's password, represented by `--password` (CLI) or `password` (Python).
+
+- For OAuth machine-to-machine (M2M) authentication (AWS, Azure, and GCP):
+
+    - `DATABRICKS_CLIENT_ID` - The client ID value for the corresponding service principal, represented by `--client-id` (CLI) or `client_id` (Python).
+    - `DATABRICKS_CLIENT_SECRET` - The OAuth secret value for the corresponding service principal, represented by `--client-secret` (CLI) or `client_secret` (Python).
+
+- For OAuth user-to-machine (U2M) authentication (AWS, Azure, and GCP): No additional environment variables.
+
+- For Azure managed identities (MSI) authentication (Azure only):
+
+    - `ARM_CLIENT_ID` - The client ID value for the corresponding managed identity, represented by `--azure-client-id` (CLI) or `azure_client_id` (Python).
+    - If the target identity has not already been added to the workspace, then you must also specify the
+      `DATABRICKS_AZURE_RESOURCE_ID`, represented by `--azure-workspace-resource-id` (CLI) or `azure_workspace_resource_id` (Python).
+
+- For Microsoft Entra ID service principal authentication (Azure only):
+
+    - `ARM_TENANT_ID` - The tenant ID value for the corresponding service principal, represented by `--azure-tenant-id` (CLI) or `azure_tenant_id` (Python).
+    - `ARM_CLIENT_ID` - The client ID value for the corresponding service principal, represented by `--azure-client-id` (CLI) or `azure_client_id` (Python).
+    - `ARM_CLIENT_SECRET` - The client secret value for the corresponding service principal, represented by `--azure-client-secret` (CLI) or `azure_client_secret` (Python).
+    - If the service principal has not already been added to the workspace, then you must also specify the
+      `DATABRICKS_AZURE_RESOURCE_ID`, represented by `--azure-workspace-resource-id` (CLI) or `azure_workspace_resource_id` (Python).
+
+- For Azure CLI authentication (Azure only): No additional environment variables.
+
+- For Microsoft Entra ID user authentication (Azure only):
+
+    - `DATABRICKS_TOKEN` - The Entra ID token for the corresponding Entra ID user, represented by `--token` (CLI) or `token` (Python).
+
+- For Google Cloud Platform credentials authentication (GCP only):
+
+    - `GOOGLE_CREDENTIALS` - The local path to the corresponding Google Cloud service account's credentials file, represented by `--google-credentials` (CLI) or `google_credentials` (Python).
+
+- For Google Cloud Platform ID authentication (GCP only):
+
+    - `GOOGLE_SERVICE_ACCOUNT` - The Google Cloud service account's email address, represented by `--google-service-account` (CLI) or `google_service_account` (Python).
+
+- Alternatively, you can store the preceding settings in a local
+  [Databricks configuration profile](https://docs.databricks.com/en/dev-tools/auth/config-profiles.html) and then just
+  refer to the profile's name:
+
+    - `DATABRICKS_PROFILE` - The name of the Databricks configuration profile, represented by `--profile` (CLI) or `profile` (Python).
\ No newline at end of file
diff --git a/snippets/general-shared-text/databricks-volumes-platform.mdx b/snippets/general-shared-text/databricks-volumes-platform.mdx
index c5b76fd6..a1f45588 100644
--- a/snippets/general-shared-text/databricks-volumes-platform.mdx
+++ b/snippets/general-shared-text/databricks-volumes-platform.mdx
@@ -2,14 +2,35 @@ Fill in the following fields:
 
 - **Name** (_required_): A unique name for this connector.
 - **Host** (_required_): The Databricks workspace host URL.
-- **Account ID** : The Databricks account ID, if needed.
-- **Username** : The Databricks username, if basic authentication is used.
-- **Password** : The associated Databricks password, if basic authentication is used.
-- **Token** : The Databricks personal access token, if personal access token authentication is used.
 - **Cluster ID** : The Databricks cluster ID.
 - **Catalog** (_required_): The name of the catalog to use.
 - **Schema** : The name of the associated schema. If not specified, **default** is used.
 - **Volume** (_required_): The name of the associated volume.
 - **Volume Path** : Any optional path to access within the volume.
 - **Overwrite** Check this box if existing data should be overwritten.
-- **Encoding** : Any encoding to be applied to the data in the volume. If not specified, **utf-8**, is used.
\ No newline at end of file
+- **Encoding** : Any encoding to be applied to the data in the volume. If not specified, **utf-8** is used.
+
+Also fill in the following fields based on your authentication type, depending on your cloud provider:
+
+- For Databricks personal access token authentication (AWS, Azure, and GCP):
+
+    - **Token** : The Databricks personal access token value.
+
+- For username and password (basic) authentication (AWS only):
+
+    - **Username** : The Databricks username value.
+    - **Password** : The associated Databricks password value.
+
+The following authentication types are currently not supported:
+
+- OAuth machine-to-machine (M2M) authentication (AWS, Azure, and GCP).
+- OAuth user-to-machine (U2M) authentication (AWS, Azure, and GCP).
+- Azure managed identities (MSI) authentication (Azure only).
+- Microsoft Entra ID service principal authentication (Azure only).
+- Azure CLI authentication (Azure only).
+- Microsoft Entra ID user authentication (Azure only).
+- Google Cloud Platform credentials authentication (GCP only).
+- Google Cloud Platform ID authentication (GCP only).
+
+
+
diff --git a/snippets/general-shared-text/databricks-volumes.mdx b/snippets/general-shared-text/databricks-volumes.mdx
index fdef34c9..8bed44d3 100644
--- a/snippets/general-shared-text/databricks-volumes.mdx
+++ b/snippets/general-shared-text/databricks-volumes.mdx
@@ -1,6 +1,39 @@
 The Databricks Volumes prerequisites:
 
-- The Databricks compute resource's host name. Get the host name for [AWS](https://docs.databricks.com/integrations/compute-details.html), [Azure](https://learn.microsoft.com/azure/databricks/integrations/compute-details), or [GCP](https://docs.gcp.databricks.com/integrations/compute-details.html).
-- The Databricks authentication details. For more information, see the documentation for [AWS](https://docs.databricks.com/dev-tools/auth/index.html), [Azure](https://learn.microsoft.com/azure/databricks/dev-tools/auth/), or [GCP](https://docs.gcp.databricks.com/dev-tools/auth/index.html).
+- The Databricks workspace URL. Get the workspace URL for
+  [AWS](https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids),
+  [Azure](https://learn.microsoft.com/azure/databricks/workspace/workspace-details#workspace-instance-names-urls-and-ids),
+  or [GCP](https://docs.gcp.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids).
+
+  Examples:
+
+  - AWS: `https://<workspace-instance-name>.cloud.databricks.com`
+  - Azure: `https://adb-<workspace-id>.<random-number>.azuredatabricks.net`
+  - GCP: `https://<workspace-id>.<random-number>.gcp.databricks.com`
+
+- The Databricks compute resource's ID. Get the compute resource ID for
+  [AWS](https://docs.databricks.com/integrations/compute-details.html),
+  [Azure](https://learn.microsoft.com/azure/databricks/integrations/compute-details),
+  or [GCP](https://docs.gcp.databricks.com/integrations/compute-details.html).
+
+- The Databricks authentication details. For more information, see the documentation for
+  [AWS](https://docs.databricks.com/dev-tools/auth/index.html),
+  [Azure](https://learn.microsoft.com/azure/databricks/dev-tools/auth/),
+  or [GCP](https://docs.gcp.databricks.com/dev-tools/auth/index.html).
+
+  More specifically, you will need:
+
+  - For Databricks personal access token authentication (AWS, Azure, and GCP): The personal access token's value.
+  - For username and password (basic) authentication (AWS only): The user's name and password values.
+  - For OAuth machine-to-machine (M2M) authentication (AWS, Azure, and GCP): The client ID and OAuth secret values for the corresponding service principal.
+  - For OAuth user-to-machine (U2M) authentication (AWS, Azure, and GCP): No additional values.
+  - For Azure managed identities (MSI) authentication (Azure only): The client ID value for the corresponding managed identity.
+  - For Microsoft Entra ID service principal authentication (Azure only): The tenant ID, client ID, and client secret values for the corresponding service principal.
+  - For Azure CLI authentication (Azure only): No additional values.
+  - For Microsoft Entra ID user authentication (Azure only): The Entra ID token for the corresponding Entra ID user.
+  - For Google Cloud Platform credentials authentication (GCP only): The local path to the corresponding Google Cloud service account's credentials file.
+  - For Google Cloud Platform ID authentication (GCP only): The Google Cloud service account's email address.
+
 - The Databricks catalog name for the Volume. Get the catalog name for [AWS](https://docs.databricks.com/catalogs/manage-catalog.html), [Azure](https://learn.microsoft.com/azure/databricks/catalogs/manage-catalog), or [GCP](https://docs.gcp.databricks.com/catalogs/manage-catalog.html).
-- The Databricks Volume name. Get the volume name for [AWS](https://docs.databricks.com/files/volumes.html), [Azure](https://learn.microsoft.com/azure/databricks/files/volumes), or [GCP](https://docs.gcp.databricks.com/files/volumes.html).
\ No newline at end of file
+- The Databricks schema name for the Volume. Get the schema name for [AWS](https://docs.databricks.com/schemas/manage-schema.html), [Azure](https://learn.microsoft.com/azure/databricks/schemas/manage-schema), or [GCP](https://docs.gcp.databricks.com/schemas/manage-schema.html).
+- The Databricks Volume name, and optionally any path in that Volume that you want to access directly. Get the Volume information for [AWS](https://docs.databricks.com/files/volumes.html), [Azure](https://learn.microsoft.com/azure/databricks/files/volumes), or [GCP](https://docs.gcp.databricks.com/files/volumes.html).
\ No newline at end of file
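
The updated snippets authenticate with a Databricks personal access token and write to a Unity Catalog Volume identified by catalog, schema, Volume name, and an optional Volume path. A quick way to confirm those settings before running either pipeline is a short pre-flight check. The sketch below is illustrative only: it assumes the separately installed Databricks SDK for Python (`pip install databricks-sdk`), which the connector documentation does not require, and it reads the same environment variables that the snippets use.

```python
# Illustrative pre-flight check for the settings used by the snippets above.
# Assumes the Databricks SDK for Python is installed: pip install databricks-sdk
import os

from databricks.sdk import WorkspaceClient

# Personal access token authentication, matching the updated snippets.
workspace = WorkspaceClient(
    host=os.environ["DATABRICKS_HOST"],
    token=os.environ["DATABRICKS_TOKEN"],
)

# Confirm that the host URL and token are valid for this workspace.
print(f"Authenticated as: {workspace.current_user.me().user_name}")

# Confirm that the target Volume exists and is visible to this identity.
# DATABRICKS_SCHEMA falls back to "default", as the documentation above states.
full_name = ".".join(
    [
        os.environ["DATABRICKS_CATALOG"],
        os.getenv("DATABRICKS_SCHEMA", "default"),
        os.environ["DATABRICKS_VOLUME"],
    ]
)
volume = workspace.volumes.read(full_name)
print(f"Found Volume: {volume.full_name}")
```

If both prints succeed, the same host, token, catalog, schema, and Volume values should be usable by the CLI and Python snippets in this patch; the cluster ID and Volume path are not exercised by this check.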