Initialize widgets and import modules

In [0]:
%run "../common/DataFabricCommonFunctions"

In [0]:
# Declare the notebook's input widgets from (name, label) pairs. Every widget
# is a free-text box whose default value is the empty string.
for v_widget_name, v_widget_label in (
    ("p_client_id", "Client ID"),
    ("p_product_id", "Product ID"),
    ("p_facility_id", "Facility ID"),
    ("p_historical_flag", "Historical Processing Flag"),
    ("p_instance_name", "Instance Name"),
    ("p_instance_database_name", "Instance Database Name"),
    ("p_environment_name", "Environment Name"),
):
    dbutils.widgets.text(v_widget_name, "", v_widget_label)

from datetime import datetime
from pyspark.sql import Row
from pyspark.sql.functions import col
import json
import re
import requests
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import catalog, jobs, pipelines

Identify the workspace context

Pull data from widgets

In [0]:
def get_widget_values():
    """Read the client and product filter widgets and validate them.

    Returns:
        dict: ``{"v_filter_client_id": str, "v_filter_product_id": str}``,
        each value stripped of surrounding whitespace.

    Raises:
        ValueError: If a required widget (currently only ``p_product_id``)
            is blank. The message names the widget itself so the user knows
            which input box to fill in.
    """
    # Result key -> name of the widget that supplies its value.
    v_widget_names_dict = {
        "v_filter_client_id" : "p_client_id",
        "v_filter_product_id": "p_product_id",
    }

    v_required_widgets_list = [
        "v_filter_product_id"
    ]

    v_widget_values_dict = {
        key: dbutils.widgets.get(widget_name).strip()
        for key, widget_name in v_widget_names_dict.items()
    }

    for key in v_required_widgets_list:
        if not v_widget_values_dict[key]:
            raise ValueError(f"The widget '{v_widget_names_dict[key]}' must be populated before continuing.")
    return v_widget_values_dict

# Read every widget into a module-level variable. Validation failures are
# printed for the notebook output and then re-raised so the run stops.
try:
    v_widget_values_dict            = get_widget_values()
    v_filter_client_id              = v_widget_values_dict["v_filter_client_id"]
    v_filter_product_id             = v_widget_values_dict["v_filter_product_id"]
    # A blank facility widget means "no facility filter" and is stored as None.
    v_filter_facility_id            = dbutils.widgets.get("p_facility_id").strip() or None
    v_is_historical                 = dbutils.widgets.get("p_historical_flag")
    v_instance_name                 = dbutils.widgets.get("p_instance_name").strip()
    v_filter_instance_database_name = dbutils.widgets.get("p_instance_database_name").strip()
    v_environment_name              = dbutils.widgets.get("p_environment_name").strip()

    # Without an instance name the later cells filter on InternalClientId, so
    # a blank client id would only surface there as an opaque SQL parse error.
    # Fail here with an actionable message instead.
    if not (v_instance_name or v_filter_client_id):
        raise ValueError("Either the widget 'p_instance_name' or the widget 'p_client_id' must be populated before continuing.")

    print("All required widgets are populated. Proceeding with the script.")
except ValueError as e:
    print(f"Error: {str(e)}")
    raise

In [0]:
# Resolve the environment-specific locations once, then fan the entries out
# into individual variables. Each entry is read with .get(), so a key that is
# absent from the lookup result yields None rather than raising.
v_locations_dict = get_locations_by_env(v_environment_name)
(
    v_file_location,
    v_archive_location,
    v_keyvault_location,
    v_environment,
    v_unity_catalog,
) = (
    v_locations_dict.get(v_location_key)
    for v_location_key in ("source", "archive", "keyvault", "environment", "unity_catalog")
)

In [0]:
# Timeout handed to dbutils.notebook.run for the child notebooks.
v_timeout = 3600

# The target catalog is the Unity Catalog resolved for this environment.
v_target_catalog_name = v_unity_catalog

# Placeholders that stay empty until the naming logic further down decides
# whether a schema / pipeline name has to be generated.
v_target_schema_name = v_gateway_pipeline_name = v_ingestion_pipeline_name = ""

In [0]:
# Load the configuration tables from the config database.
# The product id comes from a free-text widget and is spliced into SQL text,
# so it is forced through int() first: anything that is not a plain integer
# raises ValueError here instead of reaching the database as SQL.
v_server_process_list_df         = execute_dbconfig_sql_query(f"SELECT * FROM cfg.vw_ServerProcessList where InternalProductId = {int(v_filter_product_id)}", v_environment_name)
v_config_product                 = execute_dbconfig_sql_query("SELECT * FROM cfg.product", v_environment_name)
v_config_product_client          = execute_dbconfig_sql_query("SELECT * FROM cfg.productclient", v_environment_name)
v_config_product_client_facility = execute_dbconfig_sql_query("SELECT * FROM cfg.productclientfacility", v_environment_name)
v_config_product_instance        = execute_dbconfig_sql_query("SELECT * FROM cfg.productinstance", v_environment_name)
v_config_data_source             = execute_dbconfig_sql_query("SELECT * FROM cfg.datasource", v_environment_name)

In [0]:
# Narrow the server/process list to the requested scope. An instance name
# selects that server's "ProductInstance" records; otherwise the records of
# the requested client (plus those stored under client id 0) are kept,
# optionally restricted to one facility.
if v_instance_name:
    v_server_process_list_df = (
        v_server_process_list_df
            .filter(col("SourceServerName1") == f"{v_instance_name}")
            .filter(col("RecordSource") == "ProductInstance")
    )
else:
    v_server_process_list_df = (
        v_server_process_list_df
            .filter(col("InternalClientId").isin(v_filter_client_id, 0))
            .filter(col("RecordSource") != "ProductInstance")
    )
    if v_filter_facility_id:
        v_server_process_list_df = v_server_process_list_df.filter(col("InternalFacilityId") == f"{v_filter_facility_id}")

# Active products only.
v_df_product = v_config_product.select("InternalProductId", "ProductCode", "NotificationRecipientsEmail").filter("Active = 1")

# DataFrame.first() returns None for an empty result; fail with a readable
# message instead of "'NoneType' object is not subscriptable".
v_product_row = v_df_product.filter(f"InternalProductId = {v_filter_product_id}").select("ProductCode").first()
if v_product_row is None:
    raise ValueError(f"No active product found in cfg.product for InternalProductId = {v_filter_product_id}.")
v_product_code = v_product_row["ProductCode"]

# Active facility rows; the facility-level CDC JSON is aliased so it cannot
# collide with a CDC_ConfigJSON column from another table after a join.
v_df_product_client_facility = (
    v_config_product_client_facility
        .select("InternalProductId", "InternalClientId", "InternalFacilityId", "CDCgatewayPipelineId", "CDCingestionPipelineId", "SourceServerName1", col("CDC_ConfigJSON").alias("facility_CDC_ConfigJSON"))
        .filter("Active = 1")
)

v_df_product_client = v_config_product_client.select("InternalProductId", "InternalClientId").filter("Active = 1")

# Active data sources of the requested product.
v_df_data_source = (
    v_config_data_source
        .select("DataSourceId", "InternalProductId", "DataSourceShortName", "DataSourceType")
        .filter("Active = 1")
        .filter(f"InternalProductId = {v_filter_product_id}")
)

# Active instance rows of the requested product whose server name starts with
# the instance name and whose database name matches exactly.
v_df_instance_builder = (
    v_config_product_instance
        .select("InternalProductId", "DataSourceId", "SourceServerName1", "SourceDatabaseName1", "CDCgatewayPipelineId", "CDCingestionPipelineId", "CDC_ConfigJSON")
        .filter("Active = 1")
        .filter(f"InternalProductId = {v_filter_product_id}")
        .filter(col("SourceServerName1").startswith(v_instance_name))
        .filter(col("SourceDatabaseName1") == v_filter_instance_database_name)
)

v_df_instance = v_df_instance_builder.join(v_df_data_source, on="DataSourceId", how="inner")

# NOTE(review): neither joined frame below is filtered by product or client,
# so any consumer must filter before picking a row.
v_df_joined_product_and_client_and_facility = v_df_product_client.join(v_df_product, on="InternalProductId", how="outer")
v_df_joined_config_table = v_df_joined_product_and_client_and_facility.join(v_df_product_client_facility, on=["InternalProductId", "InternalClientId"], how="left")

Load configuration data from the config database

In [0]:
def _first_config_row(v_df, v_description):
    """Return the first row of ``v_df``; raise ValueError naming ``v_description`` when it is empty."""
    v_row = v_df.first()
    if v_row is None:
        raise ValueError(f"No configuration row found for {v_description}.")
    return v_row


# Pick the configuration row that drives this run. The instance name is
# checked first so the precedence matches the rest of the notebook, where an
# instance name always wins over client / facility filters. Each frame is
# fetched once, so the server and database name come from the same row.
if v_instance_name:
    v_df_filtered_config = v_df_instance
    v_config_row         = _first_config_row(
        v_df_filtered_config,
        f"product {v_filter_product_id}, instance '{v_instance_name}', database '{v_filter_instance_database_name}'",
    )
    # Defined in this branch too so later fallbacks never hit an unbound name.
    v_server_name        = v_config_row["SourceServerName1"]
    v_database_name      = v_config_row["SourceDatabaseName1"]

elif v_filter_facility_id:
    v_df_filtered_config = (
        v_config_product_client_facility
            .filter(f"InternalProductId  = {v_filter_product_id}")
            .filter(f"InternalClientId   = {v_filter_client_id}")
            .filter(f"InternalFacilityId = {v_filter_facility_id}")
    )
    v_config_row         = _first_config_row(
        v_df_filtered_config,
        f"product {v_filter_product_id}, client {v_filter_client_id}, facility {v_filter_facility_id}",
    )
    # For a facility the server / database come from the server process list.
    v_server_process_row = _first_config_row(
        v_server_process_list_df,
        f"the server process list of product {v_filter_product_id}, client {v_filter_client_id}, facility {v_filter_facility_id}",
    )
    v_server_name        = v_server_process_row["SourceServerName1"]
    v_database_name      = v_server_process_row["SourceDatabaseName1"]

else:
    v_df_filtered_config = (
        v_config_product_client
            .filter(f"InternalProductId = {v_filter_product_id}")
            .filter(f"InternalClientId  = {v_filter_client_id}")
    )
    v_config_row         = _first_config_row(
        v_df_filtered_config,
        f"product {v_filter_product_id}, client {v_filter_client_id}",
    )
    v_server_name        = v_config_row["SourceServerName1"]
    v_database_name      = v_config_row["SourceDatabaseName1"]

# The CDC settings are stored as a JSON document on the selected row.
v_config_dictionary = json.loads(v_config_row["CDC_ConfigJSON"])

Filter the config table when the widgets are populated

In [0]:
# Connection name read from the CDC config JSON; raises KeyError when the
# JSON has no "connection_name" entry. It is forwarded to the child notebooks
# as p_connection_name.
v_connection_name = v_config_dictionary["connection_name"]

def quote_emails(email_list):
    """Turn a comma/whitespace separated address string into a quoted list.

    Args:
        email_list (str): Addresses separated by commas and/or whitespace.

    Returns:
        str: Every non-empty address wrapped in double quotes, joined with
        ", ". An input without any address gives an empty string.
    """
    v_quoted_list = []
    for v_token in re.split(r'[,\s]+', email_list.strip()):
        v_address = v_token.strip()
        # A leading or trailing comma leaves an empty token behind; skip it.
        if v_address:
            v_quoted_list.append(f'"{v_address}"')
    return ', '.join(v_quoted_list)

# NotificationRecipientsEmail is a product-level column, so it is read from
# the row of the requested product in every mode. The earlier non-instance
# lookup took .first() from the unfiltered joined config table, which could
# return another product's recipients or a null produced by the outer join.
v_product_email_row = v_df_product.filter(f"InternalProductId = {v_filter_product_id}").first()
if v_product_email_row is None:
    raise ValueError(f"No active product found for InternalProductId = {v_filter_product_id}; cannot resolve notification recipients.")
v_my_email = quote_emails(v_product_email_row["NotificationRecipientsEmail"])

Loop through config table and execute notebook accordingly

In [0]:
# Fetch the three values needed from the selected config row in one Spark
# action, so they are guaranteed to come from the same row.
v_selected_config_row = v_df_filtered_config.select("SourceServerName1", "CDCgatewayPipelineId", "CDCingestionPipelineId").first()
if v_selected_config_row is None:
    raise ValueError("The selected configuration is empty; cannot resolve the source server or the pipeline ids.")

v_source_server_name = v_selected_config_row["SourceServerName1"]
# Host part only: everything before the first "." of the server name.
v_short_server_name = v_source_server_name.partition(".")[0]

# The flag arrives as free text from a widget, so plain truthiness would treat
# "false", "0" or "N" as historical. Only blank or explicitly negative values
# switch historical processing off; any other value keeps it on, as before.
v_is_historical_run = str(v_is_historical).strip().lower() not in ("", "0", "false", "f", "no", "n", "off", "none", "null")

if v_is_historical_run:
    v_cluster_spec_dict = v_config_dictionary["initial_cluster_spec"]["clusters"]
else:
    v_cluster_spec_dict = v_config_dictionary["ongoing_cluster_spec"]["clusters"]

# Ids of pipelines already recorded for this configuration (empty/None when
# none have been recorded yet).
v_gateway_pipeline_id = v_selected_config_row["CDCgatewayPipelineId"]
v_ingestion_pipeline_id = v_selected_config_row["CDCingestionPipelineId"]

# Decide the data-source type and the trailing part shared by every generated
# name: the instance database name for an instance run, otherwise the client
# id, followed by the facility id when one was supplied.
if v_instance_name:
    v_data_source_type = "Instance"
    v_name_suffix = f"{v_filter_instance_database_name}"
else:
    v_data_source_type = "Product"
    if v_filter_facility_id:
        v_name_suffix = f"{v_filter_client_id}_{v_filter_facility_id}"
    else:
        v_name_suffix = f"{v_filter_client_id}"

v_data_source_short_name = v_df_data_source.filter(col("DataSourceType") == f"{v_data_source_type}").first()["DataSourceShortName"]

v_target_schema_name = f"{v_data_source_short_name}_ods_{v_short_server_name}_{v_name_suffix}"

# Pipeline names are generated only while at least one pipeline id is still
# missing; otherwise the earlier placeholder values are left untouched.
if not (v_gateway_pipeline_id and v_ingestion_pipeline_id):
    v_pipeline_name_stem = f"{v_data_source_short_name}_{v_short_server_name}_{v_name_suffix}"
    v_gateway_pipeline_name = f"gw_sqlcdc_{v_pipeline_name_stem}"
    v_ingestion_pipeline_name = f"ingst_sqlcdc_{v_pipeline_name_stem}"

# Three outcomes:
#   1. names were generated and no pipeline id exists  -> create the schema,
#      run the pipeline-creation notebook, store the returned ids;
#   2. both pipeline ids exist and no name was generated -> run the
#      cluster-spec update notebook against the existing pipelines;
#   3. anything else (e.g. only one id present)         -> skip.
if v_gateway_pipeline_name and v_ingestion_pipeline_name and not (v_gateway_pipeline_id or v_ingestion_pipeline_id):
    v_child_path = "./CreateGatewayAndIngestionPipelines"
    v_params_dict = {
        "p_connection_name"        : v_connection_name,
        "p_target_catalog_name"    : v_target_catalog_name,
        "p_target_schema_name"     : v_target_schema_name.lower(),
        "p_gateway_pipeline_name"  : v_gateway_pipeline_name.lower(),
        "p_ingestion_pipeline_name": v_ingestion_pipeline_name.lower(),
        "p_product_id"             : v_filter_product_id,
        "p_client_id"              : v_filter_client_id,
        # Notebook arguments must be strings; a missing facility is sent as "".
        "p_facility_id"            : v_filter_facility_id or "",
        "p_notification_list"      : v_my_email,
        # New pipelines always start from the initial cluster spec.
        "p_cluster_spec"           : json.dumps(v_config_dictionary["initial_cluster_spec"]["clusters"]),
        "p_instance_name"          : v_instance_name if v_instance_name else v_server_name,
        "p_instance_database_name" : v_filter_instance_database_name if v_filter_instance_database_name else v_database_name,
        "p_data_source_short_name" : v_data_source_short_name,
        "p_source_server_name"     : v_source_server_name,
        "p_data_source_type"       : v_data_source_type,
        "p_environment_name"       : v_environment_name
    }
    # Make sure the target schema exists before the child notebook runs.
    # Instance schemas get their own folder; client / facility schemas share
    # the client's folder.
    if v_instance_name:
        spark.sql(f"""
                CREATE SCHEMA IF NOT EXISTS {v_target_catalog_name}.{v_target_schema_name}
                MANAGED LOCATION '{v_file_location}/{v_data_source_short_name}_ods_{v_short_server_name}_{v_filter_instance_database_name}'
                """)
    else:
        spark.sql(f"""
                CREATE SCHEMA IF NOT EXISTS {v_target_catalog_name}.{v_target_schema_name}
                MANAGED LOCATION '{v_file_location}/{v_filter_client_id}'
                """)

    # The child notebook returns a JSON document carrying both pipeline ids.
    v_output_json           = dbutils.notebook.run(v_child_path, v_timeout, v_params_dict)
    v_output_dict           = json.loads(v_output_json)
    v_returned_gateway_id   = v_output_dict["gateway_pipeline_id"]
    v_returned_ingestion_id = v_output_dict["ingestion_pipeline_id"]

    # Store the new ids on the config table that matches the run scope.
    if v_instance_name:
        execute_dbconfig_stored_procedure(
            f"""
            EXEC cfg.usp_UpdateCDCPipeline @InternalProductId = {v_filter_product_id}, @SourceServerName = '{v_instance_name}', @SourceDatabaseName = '{v_filter_instance_database_name}', @TableToUpdate = 'ProductInstance', @gatewayPipelineId = '{v_returned_gateway_id}', @ingestionPipelineId = '{v_returned_ingestion_id}', @CatalogDatabaseName = '{v_target_schema_name.lower()}'
            """
            ,v_environment_name
        )
    elif v_filter_facility_id:
        execute_dbconfig_stored_procedure(
            f"""
            EXEC cfg.usp_UpdateCDCPipeline @InternalProductId = {v_filter_product_id}, @InternalClientId = {v_filter_client_id}, @InternalFacilityId = {v_filter_facility_id}, @TableToUpdate = 'ProductClientFacility', @gatewayPipelineId = '{v_returned_gateway_id}', @ingestionPipelineId = '{v_returned_ingestion_id}', @CatalogDatabaseName = '{v_target_schema_name.lower()}'
            """
            ,v_environment_name
        )
    else:
        execute_dbconfig_stored_procedure(
            f"""
            EXEC cfg.usp_UpdateCDCPipeline @InternalProductId = {v_filter_product_id}, @InternalClientId = {v_filter_client_id}, @TableToUpdate = 'ProductClient', @gatewayPipelineId = '{v_returned_gateway_id}', @ingestionPipelineId = '{v_returned_ingestion_id}', @CatalogDatabaseName = '{v_target_schema_name.lower()}'
            """
            ,v_environment_name
        )

elif v_gateway_pipeline_id and v_ingestion_pipeline_id and not (v_gateway_pipeline_name or v_ingestion_pipeline_name):
    # For existing pipelines the schema name is taken from the server process
    # list row that carries the gateway pipeline id. first() returns None when
    # nothing matches, so fail with a readable message in that case.
    v_existing_pipeline_row = v_server_process_list_df.filter(col("CDCgatewayPipelineID") == f"{v_gateway_pipeline_id}").first()
    if v_existing_pipeline_row is None:
        raise ValueError(f"No server process list entry found for gateway pipeline id '{v_gateway_pipeline_id}'.")
    v_target_schema_name = v_existing_pipeline_row["SourceServerName2"]

    v_child_path = "../ingestion/DailyIngestionPipelineUpdateClusterSpecs"
    v_params_dict = {
        "p_connection_name"        : v_connection_name,
        "p_target_catalog_name"    : v_target_catalog_name,
        "p_target_schema_name"     : v_target_schema_name.lower(),
        "p_gateway_pipeline_id"    : v_gateway_pipeline_id,
        "p_ingestion_pipeline_id"  : v_ingestion_pipeline_id,
        "p_product_id"             : v_filter_product_id,
        "p_client_id"              : v_filter_client_id,
        # Notebook arguments must be strings; a missing facility is sent as "".
        "p_facility_id"            : v_filter_facility_id or "",
        "p_cluster_spec"           : json.dumps(v_cluster_spec_dict),
        "p_instance_name"          : v_instance_name,
        "p_instance_database_name" : v_filter_instance_database_name,
        "p_data_source_short_name" : v_data_source_short_name,
        "p_source_server_name"     : v_source_server_name,
        "p_data_source_type"       : v_data_source_type,
        "p_environment_name"       : v_environment_name
    }
    dbutils.notebook.run(v_child_path, v_timeout, v_params_dict)
else:
    # Mixed state (for example only one of the two pipeline ids is recorded).
    print("incorrect input or new entry – skipping run")
    v_params_dict = {}
    v_child_path = ""