Initialize time - this will be used as the high-water mark for the Client Split

In [0]:
from datetime import datetime
import time
import json

import pytz
from typing import Callable, Any

from pyspark.sql import Row
from pyspark.sql.functions import col, countDistinct, split, lit

import threading
import concurrent.futures
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed

# Captured once when the notebook starts. It is passed to the data split notebook
# as p_timestamp and handed to the post-run check as the new watermark value.
v_timestamp_now = datetime.now(tz=pytz.timezone("US/Central")).strftime(
    "%Y-%m-%d %H:%M:%S.%f"
)

In [0]:
# Declare the notebook's input widgets: (widget name, label). All default to "".
for v_widget_name, v_widget_label in (
    ("p_product_id", "Product ID"),
    ("p_source_servername", "Source Server Name"),
    ("p_environment_name", "Environment"),
    ("p_datasource_shortname", "Data Source Short Name"),
):
    dbutils.widgets.text(v_widget_name, "", v_widget_label)

Initialize widgets and variables

In [0]:
def get_widget_values():
    """
    Read the four input widgets, strip surrounding whitespace, and validate them.

    Returns:
        dict: Stripped widget values keyed by "filter_source_server_name",
        "filter_product_id", "filter_datasource_short_name" and
        "filter_environment_name".

    Raises:
        ValueError: If a required widget is empty after stripping. The first
        empty one in validation order is reported.
    """
    # Result key -> widget name, listed in the order the widgets are read.
    v_widget_name_by_key = {
        "filter_source_server_name": "p_source_servername",
        "filter_product_id": "p_product_id",
        "filter_datasource_short_name": "p_datasource_shortname",
        "filter_environment_name": "p_environment_name",
    }
    v_widget_values_dict = {
        v_result_key: dbutils.widgets.get(v_widget_name).strip()
        for v_result_key, v_widget_name in v_widget_name_by_key.items()
    }

    # Validation order differs from read order: the product id is checked first.
    v_validation_order = (
        "filter_product_id",
        "filter_source_server_name",
        "filter_datasource_short_name",
        "filter_environment_name",
    )
    v_first_empty_key = next(
        (v_key for v_key in v_validation_order if not v_widget_values_dict[v_key]),
        None,
    )
    if v_first_empty_key is not None:
        raise ValueError(f"The widget '{v_first_empty_key}' must be populated before continuing.")

    return v_widget_values_dict

# Resolve the widget values once and expose each one as a notebook-level variable.
# A validation failure is printed and then re-raised so the notebook stops here.
try:
    v_widget_values_dict = get_widget_values()
except ValueError as v_error:
    print(f"Error: {v_error}")
    raise
else:
    v_filter_product_id = v_widget_values_dict["filter_product_id"]
    v_filter_source_server_name = v_widget_values_dict["filter_source_server_name"]
    v_filter_datasource_short_name = v_widget_values_dict["filter_datasource_short_name"]
    v_filter_environment = v_widget_values_dict["filter_environment_name"]

    print("All required widgets are populated. Proceeding with the script.")


In [0]:
%run "../common/DataFabricCommonFunctions"

Identify workspace context

In [0]:
# Look up the environment-specific settings, then unpack the entries this notebook
# names. A key that is absent from the mapping yields None.
locations_dict = get_locations_by_env(v_filter_environment)
(
    v_file_location,
    v_archive_location,
    v_keyvault_location,
    v_environment,
    v_unity_catalog,
) = (
    locations_dict.get(v_location_key)
    for v_location_key in ("source", "archive", "keyvault", "environment", "unity_catalog")
)

Initialize parameters for JDBC connection

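Extract the data from the Product Instance Config (CFG.vw_DataFabricServerList) - filtered by the respective Source Server Name, Product ID, and Data Source Short Name, keeping only rows that have both a CDC gateway pipeline ID and a CDC ingestion pipeline ID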

In [0]:
# The widget values are spliced into T-SQL text, so make them safe first:
#   - the product id is used unquoted, so it must be a plain integer;
#   - the string filters sit inside '...' literals, so embedded single quotes are doubled.
try:
    v_product_id_sql = int(v_filter_product_id)
except ValueError as v_error:
    raise ValueError(
        f"The widget 'filter_product_id' must be an integer, got '{v_filter_product_id}'."
    ) from v_error

v_datasource_short_name_sql = v_filter_datasource_short_name.replace("'", "''")
v_source_server_name_sql = v_filter_source_server_name.replace("'", "''")

# Server list rows for this product / data source / server, joined to a per-database
# IsHistorical flag (1 if any table in the table list is flagged historical, else 0).
# Only rows with both CDC pipeline ids populated are returned.
v_query = (
    f"SELECT svr.*, COALESCE(tbl.IsHistorical, 0) AS IsHistorical "
    f"FROM CFG.vw_DataFabricServerList svr "
    f"LEFT JOIN (SELECT InternalProductId, InternalClientId, InternalFacilityId, "
    f"DataSourceID, SourceDatabaseName1, SourceServerName1, "
    f"CASE WHEN MAX(CASE WHEN IsHistorical = 1 THEN 1 ELSE 0 END)=1 THEN 1 ELSE 0 END AS IsHistorical "
    f"FROM CFG.vw_DataFabricTableList "
    f"GROUP BY InternalProductId, InternalClientId, InternalFacilityId, DataSourceID, SourceDatabaseName1, SourceServerName1) tbl "
    f"ON svr.InternalProductId=tbl.InternalProductId AND svr.InternalClientId=tbl.InternalClientId AND svr.InternalFacilityId=tbl.InternalFacilityId "
    f"AND svr.DataSourceID=tbl.DataSourceID AND svr.SourceServerName1=tbl.SourceServerName1 AND svr.SourceDatabaseName1=tbl.SourceDatabaseName1 "
    f"WHERE svr.InternalProductId={v_product_id_sql} AND svr.DataSourceShortName='{v_datasource_short_name_sql}' "
    f"AND svr.SourceServerName1 = '{v_source_server_name_sql}' AND svr.CDCGatewayPipelineID IS NOT NULL AND svr.CDCIngestionPipelineID IS NOT NULL"
)

v_process_list_df = execute_dbconfig_sql_query(v_query, v_filter_environment)

Trigger the DailyIngestionPipelineUpdateClusterSpecs notebook (the CDC update step) with the given parameters.

In [0]:
def trigger_cdc_update(
    p_connection_name,
    p_instance_name,
    p_product_id,
    p_gateway_id,
    p_ingestion_id,
    p_cluster_spec,
    p_target_schema_name,
    p_database_name,
    p_notebook_timeout
):
    """
    Run the ./DailyIngestionPipelineUpdateClusterSpecs notebook for one instance.

    The arguments are forwarded as notebook parameters. The target catalog and
    environment name are taken from the notebook-level v_unity_catalog and
    v_filter_environment.

    Args:
        p_connection_name: Forwarded as p_connection_name.
        p_instance_name: Forwarded as p_instance_name.
        p_product_id: Forwarded as p_product_id.
        p_gateway_id: Forwarded as p_gateway_pipeline_id.
        p_ingestion_id: Forwarded as p_ingestion_pipeline_id.
        p_cluster_spec: Forwarded as p_cluster_spec.
        p_target_schema_name: Forwarded as p_target_schema_name.
        p_database_name: Forwarded as p_instance_database_name.
        p_notebook_timeout: Timeout passed to dbutils.notebook.run.

    Returns:
        bool: True once the child notebook run returns.
    """
    v_params_dict = dict(
        p_connection_name=p_connection_name,
        p_instance_name=p_instance_name,
        p_product_id=p_product_id,
        p_gateway_pipeline_id=p_gateway_id,
        p_ingestion_pipeline_id=p_ingestion_id,
        p_cluster_spec=p_cluster_spec,
        p_target_schema_name=p_target_schema_name,
        p_instance_database_name=p_database_name,
        p_target_catalog_name=v_unity_catalog,
        p_environment_name=v_filter_environment,
    )

    dbutils.notebook.run(
        "./DailyIngestionPipelineUpdateClusterSpecs",
        p_notebook_timeout,
        v_params_dict,
    )
    return True


Trigger the Daily Ingestion Pipeline Gateway Controller to switch the gateway and ingestion pipelines on and off

In [0]:
def trigger_ingestion(
    p_source_server_name,
    p_product_id,
    p_gateway_id,
    p_ingestion_id,
    p_is_historical,
    p_notebook_timeout
):
    """
    Run the ./DailyIngestionPipelineGatewayController notebook for one server.

    Args:
        p_source_server_name: Forwarded as p_source_server_name.
        p_product_id: Forwarded as p_product_id.
        p_gateway_id: Forwarded as p_gateway_id.
        p_ingestion_id: Forwarded as p_ingestion_id.
        p_is_historical: Forwarded as p_is_historical.
        p_notebook_timeout: Used twice: as the timeout of the notebook run and
            as the child notebook's p_max_wait_time parameter.

    Returns:
        bool: True once the child notebook run returns.
    """
    v_params_dict = dict(
        p_source_server_name=p_source_server_name,
        p_product_id=p_product_id,
        p_gateway_id=p_gateway_id,
        p_ingestion_id=p_ingestion_id,
        p_is_historical=p_is_historical,
        p_max_wait_time=p_notebook_timeout,
    )

    dbutils.notebook.run(
        "./DailyIngestionPipelineGatewayController",
        p_notebook_timeout,
        v_params_dict,
    )
    return True


Trigger Data Split to Client Databases

In [0]:
def trigger_data_split(
    p_instance_name,
    p_product_id,
    p_notebook_timeout
):
    """
    Run the ./DataSplitToClientDatabases notebook for one CDC instance.

    The notebook-level v_timestamp_now and v_filter_environment are forwarded as
    p_timestamp and p_environment_name.

    Args:
        p_instance_name: Forwarded as p_cdc_instance_name.
        p_product_id: Forwarded as p_product_id.
        p_notebook_timeout: Timeout passed to dbutils.notebook.run.

    Returns:
        bool: True once the child notebook run returns.
    """
    v_params_dict = dict(
        p_product_id=p_product_id,
        p_timestamp=v_timestamp_now,
        p_cdc_instance_name=p_instance_name,
        p_environment_name=v_filter_environment,
    )

    dbutils.notebook.run("./DataSplitToClientDatabases", p_notebook_timeout, v_params_dict)

    return True


Determine cluster spec type

In [0]:
# Collect the distinct IsHistorical values for this product/server, keeping only
# rows flagged as historical.
v_historical_flag_list = (
    v_process_list_df
    .filter(col("InternalProductId") == v_filter_product_id)
    .filter(col("SourceServerName1") == v_filter_source_server_name)
    .filter(col("IsHistorical") == "1")
    .select("IsHistorical")
    .distinct()
    .collect()
)

# Every collected row already satisfied the IsHistorical filter above, so a
# non-empty result means at least one historical row exists. Testing for
# non-emptiness avoids re-comparing the value in Python, where the outcome would
# depend on whether the column arrives as an integer or as a string.
v_cluster_spec = (
    "initial_cluster_spec"
    if v_historical_flag_list
    else "ongoing_cluster_spec"
)


In [0]:
def pre_run_updateStatus(param_internal_product_id, param_internal_client_id, param_internal_facility_id, param_datasource_id, param_databasename):
    """
    Call CFG.UpdateProcessStatusFacility with @Status = 'P' for one combination.

    Args:
        param_internal_product_id: Value for @InternalProductId.
        param_internal_client_id: Value for @InternalClientId.
        param_internal_facility_id: Value for @InternalFacilityId.
        param_datasource_id: Value for @DataSourceId.
        param_databasename: Value for @StepName.

    Returns:
        None
    """
    # The statement text, including its surrounding whitespace, is sent as-is.
    v_exec_statement = f"""
        EXEC CFG.UpdateProcessStatusFacility @InternalProductId = {param_internal_product_id}, @InternalClientId = {param_internal_client_id}, @InternalFacilityId = {param_internal_facility_id}, @Status = 'P', @DataSourceId = {param_datasource_id}, @StepName = '{param_databasename}'
        """
    execute_dbconfig_stored_procedure(v_exec_statement, v_environment)

    return

In [0]:
def post_run_updateStatus(param_internal_product_id, param_internal_client_id, param_internal_facility_id, param_datasource_id, param_databasename, param_timestamp):
    """
    Check the 'Extract' detail rows for one combination and record the outcome.

    Queries CFG.ProcessStatusFacilityDetail for 'Extract' rows of the combination
    whose Status is not 'S'. If there are none, CFG.UpdateProcessStatusFacility is
    called with @Status = 'C' and the new watermark. Otherwise it is called with
    @Status = 'F' and an exception is raised.

    Args:
        param_internal_product_id: Internal product id of the combination.
        param_internal_client_id: Internal client id of the combination.
        param_internal_facility_id: Internal facility id of the combination.
        param_datasource_id: Data source id of the combination.
        param_databasename: Database name; matched against StepName with LIKE and
            passed as @StepName.
        param_timestamp: Timestamp string in "%Y-%m-%d %H:%M:%S.%f" format. It is
            truncated to milliseconds and used as @NewWatermarkValue.

    Returns:
        int: 1 when the run was marked complete.

    Raises:
        ValueError: If param_timestamp does not match the expected format.
        Exception: If any matching detail row has a status other than 'S'.
    """
    # Round-trip through datetime to validate the format, then drop the last three
    # digits so the fractional part is milliseconds rather than microseconds.
    v_timestamp_format = "%Y-%m-%d %H:%M:%S.%f"
    v_watermark_value = datetime.strptime(param_timestamp, v_timestamp_format).strftime(v_timestamp_format)[:-3]

    v_formatted_query = " ".join([
        "SELECT *",
        f"FROM CFG.ProcessStatusFacilityDetail WHERE InternalProductId = '{param_internal_product_id}'",
        f"and InternalClientIds = '{param_internal_client_id}'",
        f"and InternalFacilityId = '{param_internal_facility_id}'",
        f"and DataSourceId = '{param_datasource_id}'",
        f"and StepName like '%{param_databasename}%'",
        "and StepType = 'Extract'",
        "and Status <> 'S'",
    ])
    v_unsuccessful_rows_df = execute_dbconfig_sql_query(v_formatted_query, v_environment)

    v_combination_id_str = (
        f"ProductId: {param_internal_product_id} - ClientId: {param_internal_client_id} - "
        f"FacilityId: {param_internal_facility_id} - DataSourceId: {param_datasource_id} - "
        f"DatabaseName: {param_databasename}"
    )

    # Failure path first: at least one detail row is not 'S'.
    if not v_unsuccessful_rows_df.isEmpty():
        v_failed_statement = f"""
            EXEC CFG.UpdateProcessStatusFacility @InternalProductId = {param_internal_product_id}, @InternalClientId = {param_internal_client_id}, @InternalFacilityId = {param_internal_facility_id}, @Status = 'F', @DataSourceId = {param_datasource_id}, @StepName = '{param_databasename}'
            """
        execute_dbconfig_stored_procedure(v_failed_statement, v_environment)
        print(f"Run Failed - {v_combination_id_str}")
        raise Exception(f"Post Run Check - Kindly check the Process Status Facility Details for - {v_combination_id_str}")

    # Success path: mark complete and move the watermark forward.
    v_complete_statement = f"""
            EXEC CFG.UpdateProcessStatusFacility @InternalProductId = {param_internal_product_id}, @InternalClientId = {param_internal_client_id}, @InternalFacilityId = {param_internal_facility_id}, @Status = 'C',@NewWatermarkValue = '{v_watermark_value}', @DataSourceId = {param_datasource_id}, @StepName = '{param_databasename}'
            """
    execute_dbconfig_stored_procedure(v_complete_statement, v_environment)
    print(f"Run Successfully - {v_combination_id_str}")
    return 1

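Process: For each row returned from the server list, parse CDCConfigJSON and run CDC Update → Ingestion → Data Split, followed by the post-run status check. Rows are processed in parallel threads, and the run fails if any row raises an error. Note: this section was originally described as wrapping each step in with_retry (exponential backoff, total attempts = 1 + ingestion_retry), but the code below does not currently apply any retry and the `ingestion_retry` setting has no effect — TODO confirm whether retries should be reinstated.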

In [0]:
def main(v_row):
    """
    Run the CDC sequence for one row of the server list.

    Steps, in order: update the pipeline cluster specs, run the gateway/ingestion
    controller, mark the facility status as 'P', run the data split, and finally
    run the post-run status check.

    Args:
        v_row: A row of v_process_list_df. The fields read are InternalFacilityId,
            InternalClientId, InternalProductId, SourceServerName1,
            CDCDestinationSchema, SourceDatabaseName1, CDCGatewayPipelineID,
            CDCIngestionPipelineID, DataSourceId and CDCConfigJSON.

    Raises:
        Exception: "One or more failed: ..." when config parsing or any step
            fails. The message names the row, and the original exception is
            chained as the cause so its traceback is kept.
    """
    v_internal_facility_id  = v_row["InternalFacilityId"]
    v_internal_client_id    = v_row["InternalClientId"]
    v_internal_product_id   = v_row["InternalProductId"]
    # Only the part of the server name before the first "." is used.
    v_source_server_name1   = v_row["SourceServerName1"].split(".")[0]
    v_source_server_name2   = v_row["CDCDestinationSchema"]
    v_source_database_name1 = v_row["SourceDatabaseName1"]
    v_gateway_pipeline_id   = v_row["CDCGatewayPipelineID"]
    v_ingestion_pipeline_id = v_row["CDCIngestionPipelineID"]
    v_datasource_id         = v_row["DataSourceId"]
    v_context_info = f"{v_internal_product_id}-{v_source_server_name1}-{v_source_server_name2}-{v_source_database_name1}-{v_gateway_pipeline_id}-{v_ingestion_pipeline_id}"

    try:
        v_cdc_config_json  = json.loads(v_row["CDCConfigJSON"])
        v_clusters         = v_cdc_config_json.get(v_cluster_spec, {}).get("clusters")
        v_connection_name  = v_cdc_config_json.get("connection_name")
        v_notebook_timeout = v_cdc_config_json.get("notebook_timeout")

        if v_clusters is None:
            raise ValueError(f"{v_context_info} | Missing cluster spec ({v_cluster_spec})")

        trigger_cdc_update(
            v_connection_name,
            v_source_server_name1,
            v_internal_product_id,
            v_gateway_pipeline_id,
            v_ingestion_pipeline_id,
            str(v_clusters),
            v_source_server_name2,
            v_source_database_name1,
            v_notebook_timeout
        )

        # NOTE(review): the cluster spec name is passed in the p_is_historical
        # position - confirm this is what the controller notebook expects.
        trigger_ingestion(
            v_source_server_name1,
            v_internal_product_id,
            v_gateway_pipeline_id,
            v_ingestion_pipeline_id,
            str(v_cluster_spec),
            v_notebook_timeout
        )

        pre_run_updateStatus(v_internal_product_id, v_internal_client_id, v_internal_facility_id, v_datasource_id, v_source_database_name1)

        # A data split failure is only logged here; the post-run check below decides
        # the final status from the detail rows.
        # NOTE(review): if the split fails without leaving any non-'S' detail row,
        # the post-run check will mark the run complete - confirm this is intended.
        try:
            trigger_data_split(
                v_source_server_name2,
                v_internal_product_id,
                v_notebook_timeout
            )
        except Exception as v_error:
            print(f"{v_context_info} | Data Split failed: {v_error}")

        post_run_updateStatus(v_internal_product_id, v_internal_client_id, v_internal_facility_id, v_datasource_id, v_source_database_name1, v_timestamp_now)

    except Exception as v_error:
        # Name the failing row unless the message already carries it, and chain the
        # original exception so its traceback is not lost.
        v_error_text = str(v_error)
        if v_context_info not in v_error_text:
            v_error_text = f"{v_context_info} | {v_error_text}"
        raise Exception("One or more failed:\n- " + v_error_text) from v_error



In [0]:
v_lock = threading.Lock()

# Run main() for every distinct server-list row on a thread pool, then inspect every
# future so that all failures are printed, not just the first one encountered.
with concurrent.futures.ThreadPoolExecutor() as v_executor:
    v_futures_list = [
        v_executor.submit(main, v_process_row)
        for v_process_row in v_process_list_df.distinct().collect()
    ]

    concurrent.futures.wait(v_futures_list)

    v_failed_errors_list = []
    for v_future in v_futures_list:
        try:
            v_future.result()
        except Exception as v_task_error:
            traceback.print_exc()
            v_failed_errors_list.append(v_task_error)

# Fail the notebook if any row failed; the first failure is chained as the cause.
if v_failed_errors_list:
    print(f"{len(v_failed_errors_list)} of {len(v_futures_list)} threaded task(s) failed.")
    raise RuntimeError("Threaded task failed") from v_failed_errors_list[0]