From afd55017c857aabd803ba053cce798053058f5f9 Mon Sep 17 00:00:00 2001
From: Paul Cornell
Date: Mon, 23 Sep 2024 15:43:01 -0700
Subject: [PATCH] Databricks Volumes v2 destination connector: update authentication details

---
 .../databricks_volumes.sh.mdx           | 28 ++++-----
 .../databricks_volumes.v1.py.mdx        | 13 +++--
 .../databricks_volumes.v2.py.mdx        | 20 +++++--
 .../databricks-volumes-cli-api.mdx      | 58 +++++++++++++++++--
 .../databricks-volumes-platform.mdx     | 31 ++++++++--
 .../databricks-volumes.mdx              | 39 ++++++++++++-
 6 files changed, 155 insertions(+), 34 deletions(-)

diff --git a/snippets/destination_connectors/databricks_volumes.sh.mdx b/snippets/destination_connectors/databricks_volumes.sh.mdx
index a270376d..d411fa66 100644
--- a/snippets/destination_connectors/databricks_volumes.sh.mdx
+++ b/snippets/destination_connectors/databricks_volumes.sh.mdx
@@ -6,21 +6,23 @@
 unstructured-ingest \
   local \
     --input-path $LOCAL_FILE_INPUT_DIR \
-    --output-dir $LOCAL_FILE_OUTPUT_DIR \
-    --strategy hi_res \
-    --chunk-elements \
-    --embedding-provider langchain-huggingface \
-    --num-processes 2 \
-    --verbose \
-    --work-dir local-input \
     --partition-by-api \
-    --api-key $UNSTRUCTURED_API_KEY\
+    --api-key $UNSTRUCTURED_API_KEY \
     --partition-endpoint $UNSTRUCTURED_API_URL \
+    --strategy hi_res \
     --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
+    --chunk-by-api \
+    --chunking-strategy by_title \
+    --chunk-api-key $UNSTRUCTURED_API_KEY \
+    --chunking-endpoint $UNSTRUCTURED_API_URL \
+    --embedding-provider langchain-huggingface \
+    --embedding-model-name sentence-transformers/all-mpnet-base-v2 \
   databricks-volumes \
-    --host "$DATABRICKS_HOST" \
-    --username "$DATABRICKS_USERNAME" \
-    --password "$DATABRICKS_PASSWORD" \
-    --volume "$DATABRICKS_VOLUME" \
-    --catalog "$DATABRICKS_CATALOG"
+    --host $DATABRICKS_HOST \
+    --token $DATABRICKS_TOKEN \
+    --cluster-id $DATABRICKS_CLUSTER_ID \
+    --catalog $DATABRICKS_CATALOG \
+    --schema $DATABRICKS_SCHEMA \
+    --volume $DATABRICKS_VOLUME \
+    --volume-path $DATABRICKS_VOLUME_PATH
 ```
diff --git a/snippets/destination_connectors/databricks_volumes.v1.py.mdx b/snippets/destination_connectors/databricks_volumes.v1.py.mdx
index cd2e5658..e169ec48 100644
--- a/snippets/destination_connectors/databricks_volumes.v1.py.mdx
+++ b/snippets/destination_connectors/databricks_volumes.v1.py.mdx
@@ -26,13 +26,15 @@ def get_writer() -> Writer:
         connector_config=SimpleDatabricksVolumesConfig(
             host=os.getenv("DATABRICKS_HOST"),
             access_config=DatabricksVolumesAccessConfig(
-                username=os.getenv("DATABRICKS_USERNAME"),
-                password=os.getenv("DATABRICKS_PASSWORD")
+                token=os.getenv("DATABRICKS_TOKEN"),
+                cluster_id=os.getenv("DATABRICKS_CLUSTER_ID")
             ),
         ),
         write_config=DatabricksVolumesWriteConfig(
             catalog=os.getenv("DATABRICKS_CATALOG"),
+            schema=os.getenv("DATABRICKS_SCHEMA"),
             volume=os.getenv("DATABRICKS_VOLUME"),
+            volume_path=os.getenv("DATABRICKS_VOLUME_PATH")
         ),
     )
 
@@ -56,10 +58,13 @@ if __name__ == "__main__":
             partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
             strategy="hi_res",
         ),
-        chunking_config=ChunkingConfig(chunk_elements=True),
+        chunking_config=ChunkingConfig(
+            chunk_elements=True,
+            chunking_strategy="by_title",
+        ),
         embedding_config=EmbeddingConfig(
             provider="langchain-huggingface",
-            api_key=None,
+            model_name="sentence-transformers/all-mpnet-base-v2",
         ),
         writer=writer,
         writer_kwargs={},
diff --git a/snippets/destination_connectors/databricks_volumes.v2.py.mdx b/snippets/destination_connectors/databricks_volumes.v2.py.mdx
index fd258819..216a4504 100644
--- a/snippets/destination_connectors/databricks_volumes.v2.py.mdx
+++ b/snippets/destination_connectors/databricks_volumes.v2.py.mdx
@@ -37,18 +37,28 @@ if __name__ == "__main__":
                 "split_pdf_concurrency_level": 15
             }
         ),
-        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
-        embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"),
+        chunker_config=ChunkerConfig(
+            chunk_by_api=True,
+            chunk_api_key=os.getenv("UNSTRUCTURED_API_KEY"),
+            chunking_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
+            chunking_strategy="by_title"
+        ),
+        embedder_config=EmbedderConfig(
+            embedding_provider="langchain-huggingface",
+            embedding_model_name="sentence-transformers/all-mpnet-base-v2"
+        ),
         destination_connection_config=DatabricksVolumesConnectionConfig(
             access_config=DatabricksVolumesAccessConfig(
-                username=os.getenv("DATABRICKS_USERNAME"),
-                password=os.getenv("DATABRICKS_PASSWORD")
+                token=os.getenv("DATABRICKS_TOKEN"),
+                cluster_id=os.getenv("DATABRICKS_CLUSTER_ID")
             ),
             host=os.getenv("DATABRICKS_HOST")
         ),
         uploader_config=DatabricksVolumesUploaderConfig(
             catalog=os.getenv("DATABRICKS_CATALOG"),
-            volume=os.getenv("DATABRICKS_VOLUME")
+            schema=os.getenv("DATABRICKS_SCHEMA"),
+            volume=os.getenv("DATABRICKS_VOLUME"),
+            volume_path=os.getenv("DATABRICKS_VOLUME_PATH")
         )
     ).run()
 ```
\ No newline at end of file
diff --git a/snippets/general-shared-text/databricks-volumes-cli-api.mdx b/snippets/general-shared-text/databricks-volumes-cli-api.mdx
index b4e42243..a7c1d169 100644
--- a/snippets/general-shared-text/databricks-volumes-cli-api.mdx
+++ b/snippets/general-shared-text/databricks-volumes-cli-api.mdx
@@ -10,11 +10,61 @@ import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-d
 
 The following environment variables:
 
-- `DATABRICKS_HOST` - The Databricks compute resource's host name, represented by `--host` (CLI) or `host` (Python).
+- `DATABRICKS_HOST` - The Databricks host URL, represented by `--host` (CLI) or `host` (Python).
+- `DATABRICKS_CLUSTER_ID` - The Databricks compute resource ID, represented by `--cluster-id` (CLI) or `cluster_id` (Python).
 - `DATABRICKS_CATALOG` - The Databricks catalog name for the Volume, represented by `--catalog` (CLI) or `catalog` (Python).
+- `DATABRICKS_SCHEMA` - The Databricks schema name for the Volume, represented by `--schema` (CLI) or `schema` (Python). If not specified, `default` is used.
 - `DATABRICKS_VOLUME` - The Databricks Volume name, represented by `--volume` (CLI) or `volume` (Python).
+- `DATABRICKS_VOLUME_PATH` - Any optional path to access within the volume, represented by `--volume-path` (CLI) or `volume_path` (Python).
 
-Environment variables based on your authentication type, depending which types are supported by your cloud provider. For example, for username and password authentication:
+Environment variables based on your authentication type, depending on your cloud provider:
 
-- `DATABRICKS_USERNAME` - The Databricks account user's name, represented by `--username` (CLI) or `username` (Python).
-- `DATABRICKS_PASSWORD` - The Databricks account user's password, represented by `--password` (CLI) or `password` (Python).
+- For Databricks personal access token authentication (AWS, Azure, and GCP):
+
+    - `DATABRICKS_TOKEN` - The personal access token, represented by `--token` (CLI) or `token` (Python).
+
+- For username and password (basic) authentication (AWS only):
+
+    - `DATABRICKS_USERNAME` - The user's name, represented by `--username` (CLI) or `username` (Python).
+    - `DATABRICKS_PASSWORD` - The user's password, represented by `--password` (CLI) or `password` (Python).
+
+- For OAuth machine-to-machine (M2M) authentication (AWS, Azure, and GCP):
+
+    - `DATABRICKS_CLIENT_ID` - The client ID value for the corresponding service principal, represented by `--client-id` (CLI) or `client_id` (Python).
+    - `DATABRICKS_CLIENT_SECRET` - The OAuth secret value for the corresponding service principal, represented by `--client-secret` (CLI) or `client_secret` (Python).
+
+- For OAuth user-to-machine (U2M) authentication (AWS, Azure, and GCP): No additional environment variables.
+
+- For Azure managed identities (MSI) authentication (Azure only):
+
+    - `ARM_CLIENT_ID` - The client ID value for the corresponding managed identity, represented by `--azure-client-id` (CLI) or `azure_client_id` (Python).
+    - If the target identity has not already been added to the workspace, then you must also specify the
+      `DATABRICKS_AZURE_RESOURCE_ID`, represented by `--azure-workspace-resource-id` (CLI) or `azure_workspace_resource_id` (Python).
+
+- For Microsoft Entra ID service principal authentication (Azure only):
+
+    - `ARM_TENANT_ID` - The tenant ID value for the corresponding service principal, represented by `--azure-tenant-id` (CLI) or `azure_tenant_id` (Python).
+    - `ARM_CLIENT_ID` - The client ID value for the corresponding service principal, represented by `--azure-client-id` (CLI) or `azure_client_id` (Python).
+    - `ARM_CLIENT_SECRET` - The client secret value for the corresponding service principal, represented by `--azure-client-secret` (CLI) or `azure_client_secret` (Python).
+    - If the service principal has not already been added to the workspace, then you must also specify the
+      `DATABRICKS_AZURE_RESOURCE_ID`, represented by `--azure-workspace-resource-id` (CLI) or `azure_workspace_resource_id` (Python).
+
+- For Azure CLI authentication (Azure only): No additional environment variables.
+
+- For Microsoft Entra ID user authentication (Azure only):
+
+    - `DATABRICKS_TOKEN` - The Entra ID token for the corresponding Entra ID user, represented by `--token` (CLI) or `token` (Python).
+
+- For Google Cloud Platform credentials authentication (GCP only):
+
+    - `GOOGLE_CREDENTIALS` - The local path to the corresponding Google Cloud service account's credentials file, represented by `--google-credentials` (CLI) or `google_credentials` (Python).
+
+- For Google Cloud Platform ID authentication (GCP only):
+
+    - `GOOGLE_SERVICE_ACCOUNT` - The Google Cloud service account's email address, represented by `--google-service-account` (CLI) or `google_service_account` (Python).
+
+- Alternatively, you can store the preceding settings in a local
+  [Databricks configuration profile](https://docs.databricks.com/en/dev-tools/auth/config-profiles.html) and then just
+  refer to the profile's name:
+
+    - `DATABRICKS_PROFILE` - The name of the Databricks configuration profile, represented by `--profile` (CLI) or `profile` (Python).
\ No newline at end of file
diff --git a/snippets/general-shared-text/databricks-volumes-platform.mdx b/snippets/general-shared-text/databricks-volumes-platform.mdx
index c5b76fd6..a1f45588 100644
--- a/snippets/general-shared-text/databricks-volumes-platform.mdx
+++ b/snippets/general-shared-text/databricks-volumes-platform.mdx
@@ -2,14 +2,35 @@ Fill in the following fields:
 
 - **Name** (_required_): A unique name for this connector.
 - **Host** (_required_): The Databricks workspace host URL.
-- **Account ID** : The Databricks account ID, if needed.
-- **Username** : The Databricks username, if basic authentication is used.
-- **Password** : The associated Databricks password, if basic authentication is used.
-- **Token** : The Databricks personal access token, if personal access token authentication is used.
 - **Cluster ID** : The Databricks cluster ID.
 - **Catalog** (_required_): The name of the catalog to use.
 - **Schema** : The name of the associated schema. If not specified, **default** is used.
 - **Volume** (_required_): The name of the associated volume.
 - **Volume Path** : Any optional path to access within the volume.
 - **Overwrite** Check this box if existing data should be overwritten.
-- **Encoding** : Any encoding to be applied to the data in the volume. If not specified, **utf-8**, is used.
\ No newline at end of file
+- **Encoding** : Any encoding to be applied to the data in the volume. If not specified, **utf-8** is used.
+
+Also fill in the following fields based on your authentication type, depending on your cloud provider:
+
+- For Databricks personal access token authentication (AWS, Azure, and GCP):
+
+    - **Token** : The Databricks personal access token value.
+
+- For username and password (basic) authentication (AWS only):
+
+    - **Username** : The Databricks username value.
+    - **Password** : The associated Databricks password value.
+
+The following authentication types are currently not supported:
+
+- OAuth machine-to-machine (M2M) authentication (AWS, Azure, and GCP).
+- OAuth user-to-machine (U2M) authentication (AWS, Azure, and GCP).
+- Azure managed identities (MSI) authentication (Azure only).
+- Microsoft Entra ID service principal authentication (Azure only).
+- Azure CLI authentication (Azure only).
+- Microsoft Entra ID user authentication (Azure only).
+- Google Cloud Platform credentials authentication (GCP only).
+- Google Cloud Platform ID authentication (GCP only).
+
+
+
diff --git a/snippets/general-shared-text/databricks-volumes.mdx b/snippets/general-shared-text/databricks-volumes.mdx
index fdef34c9..8bed44d3 100644
--- a/snippets/general-shared-text/databricks-volumes.mdx
+++ b/snippets/general-shared-text/databricks-volumes.mdx
@@ -1,6 +1,39 @@
 The Databricks Volumes prerequisites:
 
-- The Databricks compute resource's host name. Get the host name for [AWS](https://docs.databricks.com/integrations/compute-details.html), [Azure](https://learn.microsoft.com/azure/databricks/integrations/compute-details), or [GCP](https://docs.gcp.databricks.com/integrations/compute-details.html).
-- The Databricks authentication details. For more information, see the documentation for [AWS](https://docs.databricks.com/dev-tools/auth/index.html), [Azure](https://learn.microsoft.com/azure/databricks/dev-tools/auth/), or [GCP](https://docs.gcp.databricks.com/dev-tools/auth/index.html).
+- The Databricks workspace URL. Get the workspace URL for
+  [AWS](https://docs.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids),
+  [Azure](https://learn.microsoft.com/azure/databricks/workspace/workspace-details#workspace-instance-names-urls-and-ids),
+  or [GCP](https://docs.gcp.databricks.com/workspace/workspace-details.html#workspace-instance-names-urls-and-ids).
+
+  Examples:
+
+  - AWS: `https://<workspace-instance-name>.cloud.databricks.com`
+  - Azure: `https://adb-<workspace-id>.<random-number>.azuredatabricks.net`
+  - GCP: `https://<workspace-id>.<random-number>.gcp.databricks.com`
+
+- The Databricks compute resource's ID. Get the compute resource ID for
+  [AWS](https://docs.databricks.com/integrations/compute-details.html),
+  [Azure](https://learn.microsoft.com/azure/databricks/integrations/compute-details),
+  or [GCP](https://docs.gcp.databricks.com/integrations/compute-details.html).
+
+- The Databricks authentication details. For more information, see the documentation for
+  [AWS](https://docs.databricks.com/dev-tools/auth/index.html),
+  [Azure](https://learn.microsoft.com/azure/databricks/dev-tools/auth/),
+  or [GCP](https://docs.gcp.databricks.com/dev-tools/auth/index.html).
+
+  More specifically, you will need:
+
+  - For Databricks personal access token authentication (AWS, Azure, and GCP): The personal access token's value.
+  - For username and password (basic) authentication (AWS only): The user's name and password values.
+  - For OAuth machine-to-machine (M2M) authentication (AWS, Azure, and GCP): The client ID and OAuth secret values for the corresponding service principal.
+  - For OAuth user-to-machine (U2M) authentication (AWS, Azure, and GCP): No additional values.
+  - For Azure managed identities (MSI) authentication (Azure only): The client ID value for the corresponding managed identity.
+  - For Microsoft Entra ID service principal authentication (Azure only): The tenant ID, client ID, and client secret values for the corresponding service principal.
+  - For Azure CLI authentication (Azure only): No additional values.
+  - For Microsoft Entra ID user authentication (Azure only): The Entra ID token for the corresponding Entra ID user.
+  - For Google Cloud Platform credentials authentication (GCP only): The local path to the corresponding Google Cloud service account's credentials file.
+  - For Google Cloud Platform ID authentication (GCP only): The Google Cloud service account's email address.
+
 - The Databricks catalog name for the Volume. Get the catalog name for [AWS](https://docs.databricks.com/catalogs/manage-catalog.html), [Azure](https://learn.microsoft.com/azure/databricks/catalogs/manage-catalog), or [GCP](https://docs.gcp.databricks.com/catalogs/manage-catalog.html).
-- The Databricks Volume name. Get the volume name for [AWS](https://docs.databricks.com/files/volumes.html), [Azure](https://learn.microsoft.com/azure/databricks/files/volumes), or [GCP](https://docs.gcp.databricks.com/files/volumes.html).
\ No newline at end of file
+- The Databricks schema name for the Volume. Get the schema name for [AWS](https://docs.databricks.com/schemas/manage-schema.html), [Azure](https://learn.microsoft.com/azure/databricks/schemas/manage-schema), or [GCP](https://docs.gcp.databricks.com/schemas/manage-schema.html).
+- The Databricks Volume name, and optionally any path in that Volume that you want to access directly. Get the Volume information for [AWS](https://docs.databricks.com/files/volumes.html), [Azure](https://learn.microsoft.com/azure/databricks/files/volumes), or [GCP](https://docs.gcp.databricks.com/files/volumes.html).
\ No newline at end of file
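
The updated snippets authenticate with a Databricks personal access token and write to a Unity Catalog Volume identified by catalog, schema, Volume name, and an optional Volume path. A quick way to confirm those settings before running either pipeline is a short pre-flight check. The sketch below is illustrative only: it assumes the separately installed Databricks SDK for Python (`pip install databricks-sdk`), which the connector documentation does not require, and it reads the same environment variables that the snippets use.

```python
# Illustrative pre-flight check for the settings used by the snippets above.
# Assumes the Databricks SDK for Python is installed: pip install databricks-sdk
import os

from databricks.sdk import WorkspaceClient

# Personal access token authentication, matching the updated snippets.
workspace = WorkspaceClient(
    host=os.environ["DATABRICKS_HOST"],
    token=os.environ["DATABRICKS_TOKEN"],
)

# Confirm that the host URL and token are valid for this workspace.
print(f"Authenticated as: {workspace.current_user.me().user_name}")

# Confirm that the target Volume exists and is visible to this identity.
# DATABRICKS_SCHEMA falls back to "default", as the documentation above states.
full_name = ".".join(
    [
        os.environ["DATABRICKS_CATALOG"],
        os.getenv("DATABRICKS_SCHEMA", "default"),
        os.environ["DATABRICKS_VOLUME"],
    ]
)
volume = workspace.volumes.read(full_name)
print(f"Found Volume: {volume.full_name}")
```

If both prints succeed, the same host, token, catalog, schema, and Volume values should be usable by the CLI and Python snippets in this patch; the cluster ID and Volume path are not exercised by this check.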