### What this notebook does:
This notebook updates existing Lakeflow Connect pipelines. 

Currently supported options:
* Add tables to an existing pipeline
* Remove tables from existing pipeline

Cell List :
* Update databricks-sdk
* Widget Initialization
* Tables to be Replicated
* Update Managed Ingestion Pipeline

### Instructions:

Updating Managed Ingestion Pipeline:
1. Run the cell below labeled "Widget Initialization" to generate the widgets at the top of the screen
2. Fill out the widgets at the top of the screen (be sure to click out of the last widget)
3. Change the cell 'Tables to be Replicated' with the desired database, schema, and table content
4. Click the "Run all" button in the top right hand corner of the screen

### Widget Descriptions:

* p_connection_name: UC connection name that will connect to the data source 
* p_target_catalog_name: UC catalog where the data extracted from the source will ultimately be stored
* p_target_schema_name: UC schema where the extracted data from the source will be stored
* p_gateway_pipeline_id: Unique ID of the gateway pipeline
* p_ingestion_pipeline_id: Unique ID of the ingestion pipeline
* p_product_id: Product to be processed (usually 27 - Data Fabric)
* p_cluster_spec: Cluster specification (JSON text) that is sent as the `clusters` setting of the gateway pipeline
* p_instance_name
* p_instance_database_name

### Widget Initialization

In [0]:

# Create one free-text widget per notebook parameter, each starting out
# empty. The tuple order is the order in which the widgets are created.
for widget_name in (
    "p_connection_name",
    "p_target_catalog_name",
    "p_target_schema_name",
    "p_gateway_pipeline_id",
    "p_ingestion_pipeline_id",
    "p_product_id",
    "p_client_id",
    "p_cluster_spec",
    "p_instance_name",
    "p_environment_name",
    "p_instance_database_name",
):
    dbutils.widgets.text(widget_name, "")

In [0]:
import json

# Fetch all current widget bindings in one call; each value is then looked up
# by its widget name.
widget_values_dict        = dbutils.notebook.entry_point.getCurrentBindings()
v_connection_name         = widget_values_dict["p_connection_name"]
v_target_catalog_name     = widget_values_dict["p_target_catalog_name"]
v_target_schema_name      = widget_values_dict["p_target_schema_name"]
v_gateway_pipeline_id     = widget_values_dict["p_gateway_pipeline_id"]
v_ingestion_pipeline_id   = widget_values_dict["p_ingestion_pipeline_id"]
v_product_id              = widget_values_dict["p_product_id"]
v_client_id               = widget_values_dict["p_client_id"]
v_instance_name           = widget_values_dict["p_instance_name"]
v_environment             = widget_values_dict["p_environment_name"]
v_database_name           = widget_values_dict["p_instance_database_name"]

# p_cluster_spec arrives as text and is parsed into a Python object.
v_cluster_spec_text       = widget_values_dict["p_cluster_spec"]
if not v_cluster_spec_text or not v_cluster_spec_text.strip():
    # Fail with an actionable message instead of a bare JSON parsing error.
    raise ValueError(
        "Widget p_cluster_spec is empty; provide the cluster specification as JSON."
    )
try:
    # Strict JSON first, so a legitimate apostrophe inside a string value is
    # not rewritten into a double quote.
    v_ongoing_cluster_spec = json.loads(v_cluster_spec_text)
except json.JSONDecodeError:
    # Legacy input style: single-quoted text, converted to double quotes
    # before parsing. A parsing error here still propagates to the caller.
    v_ongoing_cluster_spec = json.loads(v_cluster_spec_text.replace("'", '"'))

In [0]:
%run "../common/DataFabricCommonFunctions"

In [0]:
# Look up the settings for the selected environment once, then pull out each
# entry by key (a missing key yields None). Note that v_environment is
# replaced by the 'environment' entry returned by the lookup.
locations_dict = get_locations_by_env(v_environment)
(
    v_file_location,
    v_archive_location,
    v_key_vault_location,
    v_environment,
    v_unity_catalog,
) = (
    locations_dict.get(location_key)
    for location_key in ("source", "archive", "keyvault", "environment", "unity_catalog")
)

### Library and Variable Declaration

Note: The `databricks_url` value was shared by the Databricks team; it is currently used for testing.

In [0]:
import requests
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import catalog, jobs, pipelines
from pyspark.sql.functions import col

# Take the API token and API URL from the context of the running notebook.
notebook_context         = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
v_api_token              = notebook_context.apiToken().get()
# NOTE(review): the "oregon" -> "e2-dogfood" substitution appears to be the
# testing setup mentioned in the note above this cell - confirm it is still
# wanted. It has no effect when the URL does not contain "oregon".
v_databricks_url         = notebook_context.apiUrl().get().replace("oregon", "e2-dogfood")
v_pipelines_api_url      = f"{v_databricks_url}/api/2.0/pipelines"
v_connections_id_api_url = f"{v_databricks_url}/api/2.0/unity-catalog/connections/{v_connection_name}"
# Headers sent with every REST call below. They contain the bearer token, so
# they must never be printed.
api_headers_dict         = {
    'Authorization': f'Bearer {v_api_token}',
    "Content-Type": "application/json"
}

workspace_client     = WorkspaceClient()
# Ask for the current user once; both variables hold the same user_name.
v_current_user       = workspace_client.current_user.me().user_name
v_my_email           = v_current_user
v_all_connections    = workspace_client.connections.list()


### Connection Testing and Extraction of Gateway and Ingestion Names

In [0]:
def _raise_for_unexpected_status(api_response, action):
    """Raise if a REST call did not answer with HTTP 200.

    Args:
        api_response: Response object returned by ``requests``.
        action: Short description of the attempted call, used in the message.

    Raises:
        Exception: If the status code is anything other than 200; the message
            carries the status code and the response body.
    """
    if api_response.status_code != 200:
        raise Exception(
            f"Error {action} (HTTP {api_response.status_code}): {api_response.text}"
        )


# Get connection ID
response = requests.get(v_connections_id_api_url, headers=api_headers_dict)
_raise_for_unexpected_status(response, "fetching connections")
v_connection_id = response.json().get('connection_id')
if v_connection_id is None:
    raise ValueError(f"No connection found with name {v_connection_name}")
print(f"Connection id: {v_connection_id}")

# Get gateway pipeline info. A failed lookup stops the notebook here: the
# update cells send the looked-up name back in their PUT payload, so carrying
# on with a missing name would submit an incomplete pipeline spec.
v_gateway_api_endpoint = f"{v_databricks_url}/api/2.0/pipelines/{v_gateway_pipeline_id}"
response = requests.get(v_gateway_api_endpoint, headers=api_headers_dict)
_raise_for_unexpected_status(response, "fetching gateway pipeline")
v_gateway_pipeline_data_dict = response.json()
v_gateway_pipeline_name = v_gateway_pipeline_data_dict.get("name")
print(f"Gateway Pipeline Name: {v_gateway_pipeline_name}")

# Get ingestion pipeline info (same handling as the gateway lookup)
v_ingestion_api_endpoint = f"{v_databricks_url}/api/2.0/pipelines/{v_ingestion_pipeline_id}"
response = requests.get(v_ingestion_api_endpoint, headers=api_headers_dict)
_raise_for_unexpected_status(response, "fetching ingestion pipeline")
v_ingestion_pipeline_data_dict = response.json()
v_ingestion_pipeline_name = v_ingestion_pipeline_data_dict.get("name")
print(f"Ingestion Pipeline Name: {v_ingestion_pipeline_name}")


In [0]:
# The schema name comes from a widget and is embedded in a SQL string literal,
# so single quotes are doubled to keep the value inside the literal.
v_escaped_schema_name = v_target_schema_name.replace("'", "''")
v_formatted_query = (
    f"(SELECT DISTINCT SourceDatabasename1 as DatabaseName, SourceTable as TableName "
    f"FROM cfg.vw_DataFabricTableList WHERE CDCDestinationSchema = '{v_escaped_schema_name}')"
)
print(v_formatted_query)

v_cdc_table_list = execute_dbconfig_sql_query(v_formatted_query,v_environment)
# Every table is read from this fixed source schema.
v_source_schema = 'dbo'
# One entry per configured table, in the shape used by the ingestion pipeline
# update cell below.
v_table_objects_list = [
    {
        "source_catalog": str(row["DatabaseName"]),
        "source_schema": str(v_source_schema),
        "source_table": str(row["TableName"])
    }
    for row in v_cdc_table_list.collect()
]

# An empty result would be sent on as an empty object list for the pipeline.
# Stop here so that a mistyped schema name cannot do that silently.
if not v_table_objects_list:
    raise ValueError(
        f"No tables are configured for destination schema '{v_target_schema_name}'; "
        "refusing to continue with an empty table list."
    )


The Table Level - Update Managed Ingestion Pipeline cell will update an existing managed ingestion pipeline with new configuration settings. 

For example, use it to add or remove a table from a pipeline, change a pipeline's name, or use a custom image to troubleshoot issues.

- gateway_pipeline_id: This is the unique ID that can be found on the gateway pipeline page in the Databricks UI
- ingestion_pipeline_id: This is the unique ID that can be found on the managed ingestion pipeline page in Databricks UI
- ingestion_pipeline_name: This is the displayed name of the Managed Ingestion Pipeline that will be seen in Databricks
- source_catalog: This is the name of our catalog or database from our SQL Server source database
- source_schema: This is the schema name of our catalog from our SQL Server source database
- source_table: This is the table name that will be replicated from our SQL Server source database

In [0]:
# The pipeline name is sent back as part of the spec. Without it (for example
# after a failed pipeline lookup) the update must not be attempted.
if not v_ingestion_pipeline_name:
    raise ValueError(
        f"Ingestion pipeline name for {v_ingestion_pipeline_id} is not set; "
        "refusing to update the pipeline without it."
    )

v_ingestion_pipeline_spec_dict = {
    "pipeline_type": "MANAGED_INGESTION",
    "name": v_ingestion_pipeline_name,
    "photon": True,
    "serverless": True,
    "continuous": False,

    # Ingestion-specific configuration: one "table" object per entry of
    # v_table_objects_list, all written to the same target catalog and schema.
    "ingestion_definition": {
        "ingestion_gateway_id": v_gateway_pipeline_id,
        "source_type": "SQLSERVER",
        "objects": [
            {
                "table": {
                    "source_catalog": table["source_catalog"],
                    "source_schema": table["source_schema"],
                    "source_table": table["source_table"],
                    "destination_catalog": v_target_catalog_name,
                    "destination_schema": v_target_schema_name,
                }
            }
            for table in v_table_objects_list
        ]
    }
}

v_ingestion_update_api_url = f"{v_pipelines_api_url}/{v_ingestion_pipeline_id}"

# Debug output of the request about to be sent. The Authorization header is
# masked so the bearer token never ends up in the notebook output.
v_redacted_headers_dict = {**api_headers_dict, "Authorization": "Bearer [REDACTED]"}
v_url_string = (
    f"{v_ingestion_update_api_url}, "
    f"json={v_ingestion_pipeline_spec_dict}, headers={v_redacted_headers_dict}"
)
print(v_url_string)

v_response = requests.put(
    v_ingestion_update_api_url,
    json=v_ingestion_pipeline_spec_dict,
    headers=api_headers_dict
)

if v_response.status_code == 200:
    print(f"Managed Ingestion pipeline updated: {v_ingestion_pipeline_id}")
else:
    raise Exception(
        f"Error updating Managed Ingestion pipeline {v_ingestion_pipeline_name} "
        f"(HTTP {v_response.status_code}): {v_response.text}"
    )

print(
    f"{v_ingestion_pipeline_name} Managed Ingestion Pipeline UI available at "
    f"{v_databricks_url}/pipelines/{v_ingestion_pipeline_id}"
)


In [0]:
# The pipeline name is sent back as part of the spec. Without it (for example
# after a failed pipeline lookup) the update must not be attempted.
if not v_gateway_pipeline_name:
    raise ValueError(
        f"Gateway pipeline name for {v_gateway_pipeline_id} is not set; "
        "refusing to update the pipeline without it."
    )

v_gateway_pipeline_spec_dict = {
    "pipeline_type": "INGESTION_GATEWAY",
    "name": v_gateway_pipeline_name,
    "catalog": v_target_catalog_name,
    "target": v_target_schema_name,
    "photon": False,
    "serverless": False,
    "continuous": True,
    # Parsed value of the p_cluster_spec widget.
    "clusters": v_ongoing_cluster_spec,
    "gateway_definition": {
        "connection_name": v_connection_name,
        "gateway_storage_catalog": v_target_catalog_name,
        "gateway_storage_schema": v_target_schema_name,
    }
}

v_update_pipeline_api_url = v_pipelines_api_url + f"/{v_gateway_pipeline_id}"
v_response = requests.put(
    v_update_pipeline_api_url,
    json=v_gateway_pipeline_spec_dict,
    headers=api_headers_dict
)

if v_response.status_code == 200:
    print(f"Gateway pipeline {v_gateway_pipeline_name} updated: {v_gateway_pipeline_id}")
else:
    # This cell updates an existing pipeline, so the message says "updating".
    raise Exception(
        f"Error updating Gateway pipeline {v_gateway_pipeline_name} "
        f"(HTTP {v_response.status_code}): {v_response.text}"
    )

print(
    f"{v_gateway_pipeline_name} Gateway UI available at "
    f"{v_databricks_url}/pipelines/{v_gateway_pipeline_id}"
)


In [0]:
v_notebook_context     = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
v_api_token            = v_notebook_context.apiToken().get()
v_databricks_url       = v_notebook_context.apiUrl().get()
# NOTE(review): this rebinds v_pipelines_api_url, which the pipeline update
# cells above use for the pipelines endpoint. Re-running those cells after
# this one would target the permissions endpoint; consider a dedicated name.
v_pipelines_api_url    = f"{v_databricks_url}/api/2.0/permissions/pipelines"
v_api_headers_dict     = {'Authorization': f'Bearer {v_api_token}', "Content-Type": "application/json"}

v_headers_dict = {
    "Authorization": f"Bearer {v_api_token}",
    "Content-Type": "application/json"
}

# Environment -> (owner user name, group granted CAN_MANAGE).
# NOTE(review): prod and stg name an individual user as owner while dev and qa
# use a service account - confirm this is intended.
v_pipeline_acl_by_environment_dict = {
    "dev":  ("svc_analytics@nthrive.com", "sec_nprod_dev_analytics_databricks_workspace_user"),
    "qa":   ("svc_analytics@nthrive.com", "sec_nprod_qae_analytics_databricks_workspace_user"),
    "prod": ("sekkati@nthrive.com", "sec_prod_prd_analytics_databricks_workspace_user"),
    "stg":  ("sekkati@nthrive.com", "sec_prod_stg_analytics_databricks_workspace_user"),
}

# Without this check an unknown environment would leave the payload undefined,
# or silently reuse the payload left over from an earlier run of this cell.
if v_environment not in v_pipeline_acl_by_environment_dict:
    raise ValueError(
        f"No pipeline permissions are defined for environment '{v_environment}'; "
        f"expected one of {sorted(v_pipeline_acl_by_environment_dict)}."
    )

v_owner_user_name, v_manager_group_name = v_pipeline_acl_by_environment_dict[v_environment]
v_access_control_payload_dict = {
    "access_control_list": [
        {
            "user_name": v_owner_user_name,
            "permission_level": "IS_OWNER"
        },
        {
            "group_name": v_manager_group_name,
            "permission_level": "CAN_MANAGE"
        }
    ]
}

# Apply the same access control list to the gateway and ingestion pipelines.
v_gateway_permissions_response = requests.put(
    f"{v_pipelines_api_url}/{v_gateway_pipeline_id}",
    headers=v_headers_dict,
    data=json.dumps(v_access_control_payload_dict)
)

v_ingestion_permissions_response = requests.put(
    f"{v_pipelines_api_url}/{v_ingestion_pipeline_id}",
    headers=v_headers_dict,
    data=json.dumps(v_access_control_payload_dict)
)

# Report both outcomes under distinct labels. A failed call is reported but
# does not raise, as before.
for v_permissions_label, v_permissions_response in (
    ("Gateway", v_gateway_permissions_response),
    ("Ingestion", v_ingestion_permissions_response),
):
    print(f"{v_permissions_label} permissions response:", v_permissions_response.status_code)
    try:
        print(v_permissions_response.json())
    except ValueError:
        # The body is not JSON (for example an HTML error page); show it raw.
        print(v_permissions_response.text)
