Initialize widgets and import modules

In [0]:
%run "../common/DataFabricCommonFunctions"

In [0]:
# Declare the notebook's text input widgets. Every widget defaults to an empty
# string; the second element of each pair is the label passed to the widget.
for v_widget_name, v_widget_label in (
    ("p_client_id", "Client ID"),
    ("p_product_id", "Product ID"),
    ("p_facility_id", "Facility ID"),
    ("p_historical_flag", "Historical Processing Flag"),
    ("p_instance_name", "Instance Name"),
    ("p_instance_database_name", "Instance Database Name"),
    ("p_environment_name", "Environment Name"),
):
    dbutils.widgets.text(v_widget_name, "", v_widget_label)

from datetime import datetime
from pyspark.sql import Row
from pyspark.sql import functions as F
from pyspark.sql.functions import col
import json
import re
import requests
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import catalog, jobs, pipelines

Pull data from widgets

In [0]:
def get_widget_values():
    """Read the client and product filter widgets and validate the required ones.

    Returns:
        dict: ``{"v_filter_client_id": ..., "v_filter_product_id": ...}`` holding
        the whitespace-stripped widget text.

    Raises:
        ValueError: if a required entry (currently only ``v_filter_product_id``)
            is empty after stripping.
    """
    v_key_to_widget = (
        ("v_filter_client_id", "p_client_id"),
        ("v_filter_product_id", "p_product_id"),
    )
    v_widget_values_dict = {
        v_key: dbutils.widgets.get(v_widget).strip()
        for v_key, v_widget in v_key_to_widget
    }

    # Report the first required entry that came back empty, if any.
    v_required_keys = ("v_filter_product_id",)
    v_first_missing = next(
        (v_key for v_key in v_required_keys if not v_widget_values_dict[v_key]),
        None,
    )
    if v_first_missing is not None:
        raise ValueError(f"The widget '{v_first_missing}' must be populated before continuing.")

    return v_widget_values_dict

# Read every widget into a module-level variable. A ValueError (raised by
# get_widget_values when a required widget is empty) is printed and re-raised
# so the notebook run stops.
try:
    v_widget_values_dict = get_widget_values()
    v_filter_client_id, v_filter_product_id = (
        v_widget_values_dict[v_key]
        for v_key in ("v_filter_client_id", "v_filter_product_id")
    )

    # An empty facility widget is represented as None rather than "".
    v_filter_facility_id = dbutils.widgets.get("p_facility_id").strip() or None

    # Kept as the raw widget string (not stripped, not converted to a boolean).
    v_is_historical = dbutils.widgets.get("p_historical_flag")

    v_instance_name, v_filter_instance_database_name, v_environment_name = (
        dbutils.widgets.get(v_widget).strip()
        for v_widget in ("p_instance_name", "p_instance_database_name", "p_environment_name")
    )
    print("All required widgets are populated. Proceeding with the script.")
except ValueError as v_error:
    print(f"Error: {str(v_error)}")
    raise

In [0]:
# Look up the environment's locations once, then fan the entries out into
# individual variables. Keys absent from the mapping yield None.
v_locations_dict = get_locations_by_env(v_environment_name)
(
    v_file_location,
    v_archive_location,
    v_keyvault_location,
    v_environment,
    v_unity_catalog,
) = (
    v_locations_dict.get(v_location_key)
    for v_location_key in ('source', 'archive', 'keyvault', 'environment', 'unity_catalog')
)

In [0]:
# The target catalog is the environment's Unity Catalog; the schema and pipeline
# names start empty and are filled in from the config row by later cells.
v_target_catalog_name = v_unity_catalog
v_target_schema_name = v_gateway_pipeline_name = v_ingestion_pipeline_name = ""

In [0]:
# Load every server/process row configured for the requested product, then narrow
# the list either to one source instance or to one client (optionally one facility).

# The product id comes from a widget and is interpolated into SQL text, so it
# must be a plain integer before the query string is built.
try:
    v_product_id_int = int(v_filter_product_id)
except (TypeError, ValueError):
    raise ValueError(
        f"The widget 'p_product_id' must be an integer, got '{v_filter_product_id}'."
    ) from None

v_server_process_list_df = execute_dbconfig_sql_query(
    f"SELECT * FROM cfg.vw_DataFabricServerList where InternalProductId = {v_product_id_int}",
    v_environment_name,
)

if v_instance_name:
    # Instance mode: keep rows for the named source server with DataSourceId 46.
    # NOTE(review): 46 presumably identifies the instance-level data source - confirm.
    v_server_process_list_df = (
        v_server_process_list_df
            .filter(col("SourceServerName1") == f"{v_instance_name}")
            .filter(col("DataSourceId") == 46)
    )
else:
    # Client mode: keep rows whose InternalClientId is the requested client or 0,
    # excluding DataSourceId 46.
    v_server_process_list_df = (
        v_server_process_list_df
            .filter(col("InternalClientId").isin(v_filter_client_id, 0))
            .filter(col("DataSourceId") != 46)
    )
    if v_filter_facility_id:
        v_server_process_list_df = v_server_process_list_df.filter(
            col("InternalFacilityId") == f"{v_filter_facility_id}"
        )

Select the matching config row and parse its CDC config JSON

In [0]:
# Narrow the server/process list down to the configuration row for this run and
# read the CDC settings from it. Three selection modes, checked in this order:
#   1. facility id supplied  -> product + client + facility + database
#   2. instance name supplied -> product + source server + database
#   3. otherwise              -> product + client + facility 0 + database
v_database_name_matches = F.col("SourceDatabaseName1") == f"{v_filter_instance_database_name}"

if v_filter_facility_id:
    v_df_filtered_config = (
        v_server_process_list_df
            .filter(f"InternalProductId  = {v_filter_product_id}")
            .filter(f"InternalClientId   = {v_filter_client_id}")
            .filter(f"InternalFacilityId = {v_filter_facility_id}")
            .filter(v_database_name_matches)
    )
elif v_instance_name:
    v_df_filtered_config = (
        v_server_process_list_df
            .filter(f"InternalProductId = {v_filter_product_id}")
            .filter(col("SourceServerName1") == f"{v_instance_name}")
            .filter(v_database_name_matches)
    )
else:
    v_df_filtered_config = (
        v_server_process_list_df
            .filter(f"InternalProductId = {v_filter_product_id}")
            .filter(f"InternalClientId  = {v_filter_client_id}")
            .filter(f"InternalFacilityId = 0")
            .filter(v_database_name_matches)
    )

# Fetch the matching row once and reuse it, so every value below comes from the
# same row and the query is not re-executed for each field.
v_config_row = v_df_filtered_config.first()
if v_config_row is None:
    raise ValueError(
        "No configuration row matches the supplied widgets "
        f"(product='{v_filter_product_id}', client='{v_filter_client_id}', "
        f"facility='{v_filter_facility_id}', instance='{v_instance_name}', "
        f"database='{v_filter_instance_database_name}')."
    )

v_config_dictionary = json.loads(v_config_row["CDCConfigJSON"])

# Server and database names are taken from the matched row itself rather than
# from the first row of the broader, unfiltered list.
v_server_name   = v_config_row["SourceServerName1"]
v_database_name = v_filter_instance_database_name if v_filter_instance_database_name else v_config_row["SourceDatabaseName1"]

v_process_level      = v_config_row["ProcessLevel"]
v_target_schema_name = v_config_row["CDCDestinationSchema"]

Extract the connection, timeout, wait-time and notification settings from the parsed config

In [0]:
def quote_emails(email_list):
    """Turn a delimited list of e-mail addresses into a quoted, comma-separated string.

    Addresses may be separated by commas, semicolons and/or whitespace; empty
    entries are dropped.

    Args:
        email_list: The raw address list, e.g. ``"a@x.com; b@y.com"``. ``None``
            or an empty string is treated as "no addresses".

    Returns:
        str: Each address wrapped in double quotes and joined with ``", "``,
        e.g. ``'"a@x.com", "b@y.com"'``; ``""`` when there are no addresses.
    """
    # A missing value (e.g. a JSON null) means there is nobody to notify.
    if not email_list:
        return ""

    v_parts_list = re.split(r'[,;\s]+', email_list.strip())
    # Splitting can leave empty strings at the edges (leading/trailing delimiters).
    return ', '.join(f'"{part}"' for part in v_parts_list if part)

# Pull the settings the child notebooks need out of the parsed config; every
# key is mandatory, so a missing one raises KeyError.
v_connection_name, v_timeout, v_wait_time = (
    v_config_dictionary[v_config_key]
    for v_config_key in ("connection_name", "notebook_timeout", "pipeline_wait_time")
)
# The notification addresses are reformatted into a quoted, comma-separated string.
v_my_email = quote_emails(v_config_dictionary["notification_email"])

Create the gateway and ingestion pipelines, or update the cluster specs of existing ones

In [0]:
# Decide, from the selected config row, whether the CDC gateway/ingestion
# pipelines still have to be created or already exist and only need their
# cluster specs updated, then run the matching child notebook.
#   - names present, no ids  -> create the pipelines and store the returned ids
#   - both ids, no names     -> update cluster specs on the existing pipelines
#   - anything else          -> skip


def _sql_string_literal(value):
    """Return ``value`` as a single-quoted T-SQL literal with embedded quotes doubled."""
    return "'" + str(value).replace("'", "''") + "'"


v_server_name       = v_instance_name if v_instance_name else v_server_name
v_short_server_name = v_server_name.partition(".")[0]

# The flag arrives as widget text, and any non-empty string (including "false"
# or "0") is truthy in Python. Interpret the usual "off" spellings explicitly;
# every other non-empty value still counts as historical, as before.
v_false_flag_values = {"", "0", "false", "f", "n", "no", "off", "none", "null"}
v_is_historical_run = str(v_is_historical).strip().lower() not in v_false_flag_values
v_cluster_spec_key  = "initial_cluster_spec" if v_is_historical_run else "ongoing_cluster_spec"
v_cluster_spec_dict = v_config_dictionary[v_cluster_spec_key]["clusters"]

# Fetch the config row once instead of running one Spark job per column.
v_selected_config_row = v_df_filtered_config.first()
if v_selected_config_row is None:
    raise ValueError("No configuration row is available for the requested filters.")

v_gateway_pipeline_id    = v_selected_config_row["CDCGatewayPipelineID"]
v_ingestion_pipeline_id  = v_selected_config_row["CDCIngestionPipelineID"]
v_data_source_short_name = v_selected_config_row["DataSourceShortName"]
v_data_source_id         = v_selected_config_row["DataSourceId"]
v_target_schema_name     = v_selected_config_row["CDCDestinationSchema"]

# NOTE(review): 47 presumably identifies the product-level data source - confirm.
v_data_source_type = "Product" if v_data_source_id == 47 else "Instance"

# Pipeline names are only read while at least one pipeline id is still missing.
if not (v_gateway_pipeline_id and v_ingestion_pipeline_id):
    v_gateway_pipeline_name   = v_selected_config_row["CDCGatewayPipelineName"]
    v_ingestion_pipeline_name = v_selected_config_row["CDCIngestionPipelineName"]

if v_gateway_pipeline_name and v_ingestion_pipeline_name and not (v_gateway_pipeline_id or v_ingestion_pipeline_id):
    # ---- Create path: neither pipeline exists yet ---------------------------
    v_child_path = "./CreateGatewayAndIngestionPipelines"
    v_params_dict = {
        "p_connection_name"        : v_connection_name,
        "p_target_catalog_name"    : v_target_catalog_name,
        "p_target_schema_name"     : v_target_schema_name.lower(),
        "p_gateway_pipeline_name"  : v_gateway_pipeline_name.lower(),
        "p_ingestion_pipeline_name": v_ingestion_pipeline_name.lower(),
        "p_product_id"             : v_filter_product_id,
        "p_client_id"              : v_filter_client_id,
        # NOTE(review): this is None when no facility was supplied - confirm the
        # child notebook accepts that.
        "p_facility_id"            : v_filter_facility_id,
        "p_notification_list"      : v_my_email,
        # New pipelines always start with the initial cluster spec.
        "p_cluster_spec"           : json.dumps(v_config_dictionary["initial_cluster_spec"]["clusters"]),
        # v_server_name / v_database_name already fall back from the widget
        # values to the config row, so no further fallback is needed here.
        "p_instance_name"          : v_server_name,
        "p_instance_database_name" : v_database_name,
        "p_data_source_short_name" : v_data_source_short_name,
        "p_source_server_name"     : v_server_name,
        "p_data_source_type"       : v_data_source_type,
        "p_environment_name"       : v_environment_name,
        "p_timeout_duration"       : v_wait_time
    }

    # The destination schema must exist before the pipelines are created; its
    # managed location depends on whether this is an instance or a client run.
    if v_instance_name:
        v_managed_location = f"{v_file_location}/{v_data_source_short_name}_ods_{v_short_server_name}_{v_filter_instance_database_name}"
    else:
        v_managed_location = f"{v_file_location}/{v_filter_client_id}"
    spark.sql(f"""
                CREATE SCHEMA IF NOT EXISTS {v_target_catalog_name}.{v_target_schema_name}
                MANAGED LOCATION '{v_managed_location}'
                """)

    v_output_json           = dbutils.notebook.run(v_child_path, v_timeout, v_params_dict)
    v_output_dict           = json.loads(v_output_json)
    v_returned_gateway_id   = v_output_dict["gateway_pipeline_id"]
    v_returned_ingestion_id = v_output_dict["ingestion_pipeline_id"]

    # Store the new pipeline ids on the config table that matches the process
    # level. Each entry is (scope arguments, value for @TableToUpdate).
    v_pipeline_update_scopes = {
        "Instance": (
            f"@SourceServerName = {_sql_string_literal(v_instance_name)}, "
            f"@SourceDatabaseName = {_sql_string_literal(v_filter_instance_database_name)}",
            "ProductDataSourceInstance",
        ),
        "Facility": (
            f"@InternalClientId = {v_filter_client_id}, @InternalFacilityId = {v_filter_facility_id}",
            "ProductDataSourceClientFacility",
        ),
        "Client": (
            f"@InternalClientId = {v_filter_client_id}",
            "ProductDataSourceClient",
        ),
        "Product": (
            f"@SourceServerName = {_sql_string_literal(v_server_name)}, "
            f"@SourceDatabaseName = {_sql_string_literal(v_database_name)}",
            "ProductDataSource",
        ),
    }
    if v_process_level not in v_pipeline_update_scopes:
        # Without this update the ids are lost and a later run would try to
        # create the pipelines again, so fail loudly instead of skipping.
        raise ValueError(
            f"Unknown ProcessLevel '{v_process_level}': created gateway pipeline "
            f"'{v_returned_gateway_id}' and ingestion pipeline '{v_returned_ingestion_id}' "
            "were not saved to the configuration database."
        )

    v_scope_arguments, v_table_to_update = v_pipeline_update_scopes[v_process_level]
    execute_dbconfig_stored_procedure(
        f"""
            EXEC cfg.usp_UpdateDataSourceCDCPipeline @InternalProductId = {v_filter_product_id}, {v_scope_arguments}, @TableToUpdate = '{v_table_to_update}', @gatewayPipelineId = {_sql_string_literal(v_returned_gateway_id)}, @ingestionPipelineId = {_sql_string_literal(v_returned_ingestion_id)}, @CatalogDatabaseName = {_sql_string_literal(v_target_schema_name.lower())}
            """
        ,v_environment_name
    )

elif v_gateway_pipeline_id and v_ingestion_pipeline_id and not (v_gateway_pipeline_name or v_ingestion_pipeline_name):
    # ---- Update path: both pipelines already exist --------------------------
    # NOTE(review): the schema is looked up by gateway id on the broader list; if
    # several rows share one gateway this may pick another row's schema - confirm.
    v_target_schema_name = v_server_process_list_df.filter(col("CDCgatewayPipelineID") == f"{v_gateway_pipeline_id}").first()["CDCDestinationSchema"]
    v_child_path = "../ingestion/DailyIngestionPipelineUpdateClusterSpecs"
    v_params_dict = {
        "p_connection_name"        : v_connection_name,
        "p_target_catalog_name"    : v_target_catalog_name,
        "p_target_schema_name"     : v_target_schema_name.lower(),
        "p_gateway_pipeline_id"    : v_gateway_pipeline_id,
        "p_ingestion_pipeline_id"  : v_ingestion_pipeline_id,
        "p_product_id"             : v_filter_product_id,
        "p_client_id"              : v_filter_client_id,
        "p_facility_id"            : v_filter_facility_id,
        # Historical runs use the initial cluster spec, others the ongoing one.
        "p_cluster_spec"           : json.dumps(v_cluster_spec_dict),
        "p_instance_name"          : v_instance_name,
        "p_instance_database_name" : v_filter_instance_database_name,
        "p_data_source_short_name" : v_data_source_short_name,
        "p_source_server_name"     : v_instance_name,
        "p_data_source_type"       : v_data_source_type,
        "p_environment_name"       : v_environment_name,
        "p_timeout_duration"       : v_wait_time
    }
    dbutils.notebook.run(v_child_path, v_timeout, v_params_dict)
else:
    # Only one id is set, or neither ids nor names are available.
    print("incorrect input or new entry – skipping run")
    v_params_dict = {}
    v_child_path = ""