### What this notebook does:
This notebook configures and creates a Gateway pipeline, Managed Ingestion pipeline, and a scheduled job that runs the managed ingestion pipeline every 30 minutes. These are used for Lakeflow Connect to extract data from SQL Server and ingest it into Databricks.

Cell List:
* Update databricks-sdk
* Widget Initialization
* Create Gateway Pipeline
* Tables to be Replicated
* Table Level - Create Managed Ingestion Pipeline

##### Note: If we want to update an existing pipeline, please refer to notebook 03 Update Gateway and Ingestion Pipelines

### Instructions:

##### Note: A UC connection configured to connect to a data source is required before continuing (please see 01 Pre Requirements notebook)

The widgets at the top of the page are all the configuration options that can be set in this notebook.

1. Run the cell below labeled "Widget Initialization" to generate the widgets at the top of the screen
2. Fill out the widgets at the top of the screen (be sure to click out of the last widget)
3. Populate the cell "Tables to be Replicated" below with tables we would like to replicate following the examples given
4. Click the "Run all" button in the top right hand corner of the screen

In [0]:
%run "../common/DataFabricCommonFunctions"

### Widget initialization (run first)

In [0]:
import json
import re
import requests
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import catalog, jobs, pipelines
from pyspark.sql.functions import col, lit
import subprocess
subprocess.run(["pip", "install", "pymssql"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
import pymssql
from pymssql import OperationalError
import time

# SDK client plus the raw REST details (token, base URL, headers) that the
# later cells use when calling the pipelines API directly.
w = WorkspaceClient()

notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
api_token = notebook_context.apiToken().get()
databricks_url = notebook_context.apiUrl().get()
pipelines_api_url = f"{databricks_url}/api/2.0/pipelines"
api_headers = {
    'Authorization': f'Bearer {api_token}',
    "Content-Type": "application/json",
}

queries = w.queries.list()
current_user = w.current_user.me().user_name

# One free-text widget per notebook parameter, each defaulting to an empty string.
for _widget_name in (
    "p_connection_name",
    "p_target_catalog_name",
    "p_target_schema_name",
    "p_gateway_pipeline_name",
    "p_ingestion_pipeline_name",
    "p_product_id",
    "p_client_id",
    "p_facility_id",
    "p_notification_list",
    "p_cluster_spec",
    "p_instance_name",
    "p_instance_database_name",
    "p_data_source_short_name",
    "p_source_server_name",
    "p_data_source_type",
    "p_environment_name",
    "p_timeout_duration",
):
    dbutils.widgets.text(_widget_name, "")

### Widget Descriptions:
* 001_v_ownership_filter: Used to select what connections are populated in the drop down under connection_name
* 002_connection_name: UC connection name that will connect to the data source 
* 003_target_catalog_name: UC catalog where the extracted data from the source will be stored finally
* 004_target_schema_name: UC schema where the extracted data from the source will be stored
* 005_gateway_pipeline_name: Custom name of your gateway pipeline you will see this name in the Databricks UI
* 006_ingestion_pipeline_name: Custom name of your ingestion pipeline you will see this name in the Databricks UI
* 007_policy_selection: Lets you select which policies the policy_id widget shows: all of them, or only the ones related to your username
* 008_policy_id: It's required that we have unrestricted permissions to create clusters, or use a custom policy, this is the ID of the policy that will be used  
* 009_gateway_image: This is an optional parameter that does not need to be set unless instructed by Databricks to do so
* 010_log_level: Level of information shown in the stdout of the cluster logs, DEBUG should only be used when trying to troubleshoot issues


In [0]:
def _widget_value(widget_name):
    """Return the current text of a notebook widget with surrounding whitespace removed."""
    return dbutils.widgets.get(widget_name).strip()


def _parse_cluster_spec(raw_spec):
    """Parse the p_cluster_spec widget text into Python objects.

    Strict JSON is tried first so that values containing an apostrophe are kept
    intact. If that fails, the legacy behavior is applied: every single quote is
    replaced with a double quote and the result is parsed as JSON, which accepts
    Python-repr style input such as "[{'label': 'default'}]".

    Raises:
        ValueError: if the text is empty or cannot be parsed either way.
    """
    if not raw_spec.strip():
        raise ValueError("p_cluster_spec is empty; provide the gateway cluster spec as JSON")
    try:
        return json.loads(raw_spec)
    except json.JSONDecodeError:
        return json.loads(raw_spec.replace("'", '"'))


# Fixed settings that are not exposed as widgets in this notebook.
v_ownership_filter         = "All"
v_policy_selection         = "My Policies"
v_policy_id                = "No policy selected"
v_gateway_image            = ""
v_log_level                = "INFO"

# Widget-driven settings (whitespace-trimmed).
v_connection_name          = _widget_value("p_connection_name")
v_target_catalog_name      = _widget_value("p_target_catalog_name")
v_target_schema_name       = _widget_value("p_target_schema_name")
v_gateway_pipeline_name    = _widget_value("p_gateway_pipeline_name")
v_ingestion_pipeline_name  = _widget_value("p_ingestion_pipeline_name")
v_product_id               = _widget_value("p_product_id")
v_client_id                = _widget_value("p_client_id")
v_facility_id              = _widget_value("p_facility_id")
v_my_email                 = _widget_value("p_notification_list")
v_initial_cluster_spec     = _parse_cluster_spec(dbutils.widgets.get("p_cluster_spec"))
v_instance_name            = _widget_value("p_instance_name")
v_instance_database_name   = _widget_value("p_instance_database_name")
v_data_source_short_name   = _widget_value("p_data_source_short_name")
v_source_server_name       = _widget_value("p_source_server_name")
v_data_source_type         = _widget_value("p_data_source_type")
v_environment_name         = _widget_value("p_environment_name")

# The timeout is used later as a number of seconds (notebook-run timeout and sleep),
# so fail with a readable message when the widget is blank or not numeric.
_raw_timeout_duration = _widget_value("p_timeout_duration")
try:
    v_timeout_duration = int(_raw_timeout_duration)
except ValueError as exc:
    raise ValueError(
        f"p_timeout_duration must be a whole number of seconds, got {_raw_timeout_duration!r}"
    ) from exc

In [0]:
# Look up the environment-specific locations once, then unpack the entries the
# notebook refers to by name (missing keys yield None).
v_locations_dict = get_locations_by_env(v_environment_name)
(
    v_file_location,
    v_archive_location,
    v_key_Vault_location,
    v_unity_catalog,
) = (
    v_locations_dict.get(location_key)
    for location_key in ("source", "archive", "keyvault", "unity_catalog")
)

In [0]:
# Full reads of the two process-status configuration tables for this environment.
# NOTE(review): neither result is referenced by the other cells visible in this
# notebook — confirm whether these reads are still needed.
v_df_process_status_facility_detail = execute_dbconfig_sql_query(
    "SELECT * FROM cfg.processstatusfacilitydetail",
    v_environment_name,
)
v_df_process_status_facility = execute_dbconfig_sql_query(
    "SELECT * FROM cfg.processstatusfacility",
    v_environment_name,
)

### Tables to be Replicated
The cell below builds the list of tables that will be replicated from the source to the target.

* source_catalog: This is the name of our catalog or database from our SQL Server source database
* source_schema: This is the schema name of our catalog from our SQL Server source database
* source_table: This is the table name that will be replicated from our SQL Server source database 

##### IMPORTANT: The letter case of the catalog, schema, and table names MUST MATCH the case used in the source database system tables
##### Note: Each table should be encapsulated in a set of { } with a comma after each entry
##### Note: Currently the recommended number of supported tables per pipeline is 100.


In [0]:
# The server and database names come from notebook widgets and are embedded in
# SQL string literals below, so double any single quote to keep a stray or
# malicious quote from terminating the literal early.
# NOTE(review): assumes the config database uses standard SQL quote doubling — confirm.
_source_server_literal = v_source_server_name.replace("'", "''")
_instance_database_literal = v_instance_database_name.replace("'", "''")

# Distinct (database, table) pairs configured for this source server/database.
v_cdc_table_list = execute_dbconfig_sql_query(
    "(SELECT DISTINCT SourceDatabasename1 as DatabaseName, SourceTable as TableName "
    "from cfg.vw_DataFabricTableList "
    f"where SourceServerName1 = '{_source_server_literal}' "
    f"and SourceDatabaseName1 = '{_instance_database_literal}')",
    v_environment_name,
)

# Every table is addressed through the same fixed source schema.
v_source_schema = "dbo"

# One {source_catalog, source_schema, source_table} entry per configured table;
# this is the shape consumed when the ingestion pipeline objects are built.
v_table_objects_list = [
    {
        "source_catalog": str(row["DatabaseName"]),
        "source_schema": v_source_schema,
        "source_table": str(row["TableName"]),
    }
    for row in v_cdc_table_list.collect()
]

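The next cell was written to check whether CDC is enabled on the server and, if not, to execute a stored procedure to enable it.

Once CDC is enabled (or if it was already enabled), the list of tables is processed to turn on CDC for each table.

##### Note: All code in the next cell is currently commented out, so running it performs no CDC checks or changes.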


In [0]:
# def execute_sql_query(query, jdbc_url, connection_properties):
#     formatted_query = f"({query}) as tmp"
#     return spark.read.jdbc(url=jdbc_url, table=formatted_query, properties=connection_properties)

# def enable_db_cdc(db_to_enable):
#     with pymssql.connect(server=Instance_jdbc_hostname, user=Instance_jdbc_username, password=Instance_jdbc_password, database=db_to_enable) as conn:
#         with conn.cursor() as cursor:
#             cursor.execute('EXEC sys.sp_cdc_enable_db')
#             conn.commit()

# #def run_ddlSupportObjects_cdc(db_to_enable):
# #    with pymssql.connect(server=Instance_jdbc_hostname, user=Instance_jdbc_username, password=Instance_jdbc_password, database=db_to_enable) as conn:
# #        with conn.cursor() as cursor:
# #            cursor.execute(ddl_query)
# #            conn.commit()
  

# def enable_cdc_table(catalog, schema, table_name):
#     query = f"""EXEC sys.sp_cdc_enable_table @source_schema = '{schema}', @source_name = '{table_name}', @role_name = NULL"""
#     with pymssql.connect(server=Instance_jdbc_hostname, user=Instance_jdbc_username, password=Instance_jdbc_password, database=f"{catalog}") as conn:
#         with conn.cursor() as cursor:
#             cursor.execute(query)
#             conn.commit()

# def is_cdc_enabled_table(schema, table_name):
#     query = f"SELECT is_tracked_by_cdc FROM sys.tables WHERE name = '{table_name}'"
#     result = execute_sql_query(query, Instance_jdbc_url, Instance_connection_properties)
#     return result

# Instance_jdbc_username = "sql_cdc" 
# Instance_jdbc_password = dbutils.secrets.get(scope = v_key_Vault_location, key = "EtlSqlCDCSecret")
# Instance_jdbc_hostname = v_source_server_name_1 #- make sure this is the correct url  --lewvpalyedb04.nthext.com
# Instance_jdbc_port = 1433

# Instance_connection_properties = {
#     "user": Instance_jdbc_username,
#     "password": Instance_jdbc_password,
#     "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
# }

# #for catalog in distinct_catalogs:
# Instance_jdbc_database = v_instance_database_name
# Instance_jdbc_url = f"jdbc:sqlserver://{Instance_jdbc_hostname}:{Instance_jdbc_port};databaseName={Instance_jdbc_database};user={Instance_jdbc_username};password={Instance_jdbc_password};encrypt=true;trustServerCertificate=true;loginTimeout=30;"

# query = f"""SELECT name, is_cdc_enabled FROM sys.databases WHERE name = '{v_instance_database_name}'"""
# result = execute_sql_query(query, Instance_jdbc_url, Instance_connection_properties)
# rows = result.select('is_cdc_enabled').collect()

# #if rows:  # Only proceed if there's at least one row
#     #is_cdc_enabled_catalog = rows[0]['is_cdc_enabled']
#     #if not is_cdc_enabled_catalog:
#     #    enable_db_cdc(catalog)
#     #run_ddlSupportObjects_cdc(catalog)
# #else:
# #    print(f"Catalog '{catalog}' not found or returned no data.")


# for row in table_objects:
#     catalog, schema, table_name = row["source_catalog"], row["source_schema"], row["source_table"]
#     result_df = is_cdc_enabled_table(schema, table_name).select('is_tracked_by_cdc')
#     rows = result_df.collect()

#     if rows:  # Only proceed if there's at least one row
#         is_tracked_by_cdc_flag = rows[0]['is_tracked_by_cdc']
#         if not is_tracked_by_cdc_flag:
#             enable_cdc_table(catalog, schema, table_name)
#     else:
#         print(f"No CDC tracking info found for {schema}.{table_name}")

### Gateway Pipeline Creation
The Gateway Pipeline is used to connect to the data source, extract data, and stage it into a UC volume. The variables/widgets below are related to the gateway pipeline. The cell will output the gateway_pipeline_id and a link to the UI for the pipeline.

- gateway_pipeline_name: This is the displayed name of the Gateway Pipeline that you will see inside of Databricks.
- gateway_image: If you have been instructed by Databricks to use a custom image name, you will uncomment and fill in this line with a value given.


In [0]:
# Optional custom gateway image; None when no image is configured.
# NOTE(review): v_dbr_version is not referenced by the pipeline spec below.
v_dbr_version = v_gateway_image or None

# Gateway-specific settings: the UC connection to read from and the
# catalog/schema used for gateway storage.
_gateway_definition = {
    "connection_name": v_connection_name,
    "gateway_storage_catalog": v_target_catalog_name,
    "gateway_storage_schema": v_target_schema_name,
}

v_gateway_pipeline_spec_dict = {
    "pipeline_type": "INGESTION_GATEWAY",
    "name": v_gateway_pipeline_name,
    "catalog": v_target_catalog_name,
    "target": v_target_schema_name,
    "photon": False,
    "serverless": False,
    "continuous": True,
    "clusters": v_initial_cluster_spec,
    "gateway_definition": _gateway_definition,
}

# Create the target schema (at its managed location) if it is missing, before
# calling the pipelines API.
_create_schema_sql = (
    f"CREATE SCHEMA IF NOT EXISTS {v_target_catalog_name}.{v_target_schema_name} "
    f"MANAGED LOCATION '{v_file_location}{v_target_schema_name}'"
)
spark.sql(_create_schema_sql)

r = requests.post(pipelines_api_url, json=v_gateway_pipeline_spec_dict, headers=api_headers)

# Anything other than HTTP 200 aborts the notebook with the API's response text.
if r.status_code != 200:
    raise Exception(f"Error creating Gateway pipeline {v_gateway_pipeline_name} (HTTP {r.status_code}): {r.text}")

v_gateway_pipeline_id = r.json()["pipeline_id"]
print(f"Gateway pipeline {v_gateway_pipeline_name} created: {v_gateway_pipeline_id}/{v_gateway_image}")
print(f"{v_gateway_pipeline_name} Gateway UI available at {databricks_url}/pipelines/{v_gateway_pipeline_id}")

In [0]:
# Record the "PreExtract" step by running the shared InsertProcessStatusRecords notebook.
#
# Parameters sent, by data source type:
#   Instance -> environment, instance name, product id
#   Product  -> the above plus client id, and facility id unless it is the literal "null"
# Any other type is rejected up front; otherwise the parameter dict would be
# unbound (or left over from a previous run of a cell) when the child notebook is called.
if v_data_source_type not in ("Instance", "Product"):
    raise ValueError(
        f"Unsupported p_data_source_type {v_data_source_type!r}; expected 'Instance' or 'Product'"
    )

p_write_parameters_dict = {
    "p_environment": v_environment_name,
    "p_instance_name": v_instance_name,
    "p_product_id": v_product_id,
}
if v_data_source_type == "Product":
    p_write_parameters_dict["p_client_id"] = v_client_id
    # The string "null" is the marker for "no facility"; the key is omitted in that case.
    if v_facility_id != "null":
        p_write_parameters_dict["p_facility_id"] = v_facility_id
p_write_parameters_dict["p_step_type"] = "PreExtract"
p_write_parameters_dict["p_data_source_type"] = v_data_source_type

v_child_path = "../common/InsertProcessStatusRecords"
dbutils.notebook.run(v_child_path, v_timeout_duration, p_write_parameters_dict)

### Managed Ingestion Pipeline Creation

The Managed Ingestion Pipeline parses the extracted data from our UC staging volume and ingests it into our target catalog and schema.

The below variables/widgets are related to the Managed Ingestion pipeline. The cell will output the ingestion_pipeline_id and also a link to the UI for the pipeline.
- ingestion_pipeline_name: This is the displayed name of the Managed Ingestion Pipeline that will be seen inside of Databricks
- source_catalog: This is the name of our catalog or database from our SQL Server source database
- source_schema: This is the schema name of our catalog from our SQL Server source database
- source_table: This is the table name that will be replicated from our SQL Server source database 

##### Note: The first run of the Managed Ingestion Pipeline might fail due to waiting for the Gateway Pipeline to get resources and initialize. It will be successful once the Gateway Pipeline is up and running.

In [0]:
# def switch_pipeline(apiToken, ApiUrl, PipelineID, action="start", fullRefresh=True, cause="Triggered by API"):
 
#     headers = {
#         "Authorization": f"Bearer {apiToken}",
#         "Content-Type": "application/json"
#     }

#     endpoint = f"{ApiUrl}/{PipelineID}/updates"
#     payload = {
#         "full_refresh": fullRefresh,
#         "cause": cause
#     }

#     response = requests.post(endpoint, headers=headers, data=json.dumps(payload))
#     print(f"{action.capitalize()} response status:", response.status_code)
#     try:
#         print("Response body:", response.json())
#     except Exception as e:
#         print("No JSON response:", response.text)

In [0]:
# One "table" object per source table, all landing in the same target catalog/schema.
_ingestion_objects = [
    {
        "table": {
            "source_catalog": table["source_catalog"],
            "source_schema": table["source_schema"],
            "source_table": table["source_table"],
            "destination_catalog": v_target_catalog_name,
            "destination_schema": v_target_schema_name,
        }
    }
    for table in v_table_objects_list
]

# Spec for the managed ingestion pipeline: serverless, triggered (not continuous),
# reading through the gateway pipeline created earlier in this notebook.
ingestion_pipeline_spec = {
    "pipeline_type": "MANAGED_INGESTION",
    "name": v_ingestion_pipeline_name,
    "photon": True,
    "serverless": True,
    "continuous": False,
    # Ingestion-specific configuration
    "ingestion_definition": {
        "ingestion_gateway_id": v_gateway_pipeline_id,
        "source_type": "SQLSERVER",
        "objects": _ingestion_objects,
    },
}

r = requests.post(pipelines_api_url, json=ingestion_pipeline_spec, headers=api_headers)

# Anything other than HTTP 200 aborts the notebook with the API's response text.
# The message names the managed ingestion pipeline (not the gateway) so that
# failures of the two creation steps can be told apart in logs.
if r.status_code != 200:
    raise Exception(
        f"Error creating Managed Ingestion pipeline {v_ingestion_pipeline_name} (HTTP {r.status_code}): {r.text}"
    )

ingestion_pipeline_id = r.json()["pipeline_id"]
print(f"Managed Ingestion pipeline {v_ingestion_pipeline_name} created: {ingestion_pipeline_id}")
print(f"{v_ingestion_pipeline_name} Managed Ingestion Pipeline UI available at {databricks_url}/pipelines/{ingestion_pipeline_id}")


In [0]:
# Record the "Extract" step by running the shared InsertProcessStatusRecords notebook.
#
# Parameters sent, by data source type:
#   Instance -> environment, instance name, product id
#   Product  -> the above plus client id, and facility id unless it is the literal "null"
# Any other type is rejected up front; otherwise the parameter dict would be
# unbound, or a dict left over from an earlier cell would be sent with the wrong step type.
if v_data_source_type not in ("Instance", "Product"):
    raise ValueError(
        f"Unsupported p_data_source_type {v_data_source_type!r}; expected 'Instance' or 'Product'"
    )

p_write_parameters_dict = {
    "p_environment": v_environment_name,
    "p_instance_name": v_instance_name,
    "p_product_id": v_product_id,
}
if v_data_source_type == "Product":
    p_write_parameters_dict["p_client_id"] = v_client_id
    # The string "null" is the marker for "no facility"; the key is omitted in that case.
    if v_facility_id != "null":
        p_write_parameters_dict["p_facility_id"] = v_facility_id
p_write_parameters_dict["p_step_type"] = "Extract"
p_write_parameters_dict["p_data_source_type"] = v_data_source_type

v_child_path = "../common/InsertProcessStatusRecords"
dbutils.notebook.run(v_child_path, v_timeout_duration, p_write_parameters_dict)

In [0]:
# Pause before triggering the ingestion pipeline. The wait reuses v_timeout_duration,
# the same value passed as the timeout to dbutils.notebook.run in earlier cells.
# NOTE(review): presumably this gives the gateway pipeline time to initialize —
# confirm that reusing the notebook timeout as the wait is intended.
time.sleep(v_timeout_duration)
# NOTE(review): switch_pipeline is not defined in any active cell of this notebook
# (the only definition here is commented out); presumably it is provided by the
# %run of ../common/DataFabricCommonFunctions — confirm, otherwise this raises NameError.
# If its signature matches the commented-out definition, 'start' is the action and
# False is the full-refresh flag.
switch_pipeline(api_token, pipelines_api_url, ingestion_pipeline_id, 'start', False)

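The next cell assigns the owner and the `CAN_MANAGE` group permission on both pipelines for the current environment. The final cell then returns the gateway and ingestion pipeline IDs to the orchestrator.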

In [0]:
# Environment -> (pipeline owner user, group granted CAN_MANAGE).
# NOTE(review): the prod/stg owner looks like a personal account, while dev/qa use
# a service account — confirm this is intended.
_PIPELINE_ACL_BY_ENV = {
    "dev": ("svc_analytics@nthrive.com", "sec_nprod_dev_analytics_databricks_workspace_user"),
    "qa": ("svc_analytics@nthrive.com", "sec_nprod_qae_analytics_databricks_workspace_user"),
    "prod": ("sekkati@nthrive.com", "sec_prod_prd_analytics_databricks_workspace_user"),
    "stg": ("sekkati@nthrive.com", "sec_prod_stg_analytics_databricks_workspace_user"),
}

# Best-effort step: any failure is reported but does not stop the notebook, so the
# pipeline IDs are still returned to the caller by the final cell.
try:
    notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
    api_token = notebook_context.apiToken().get()
    databricks_url = notebook_context.apiUrl().get()
    # Kept in its own name so the pipelines API URL used by earlier cells is not
    # overwritten (re-running those cells would otherwise post to the permissions API).
    permissions_api_url = f"{databricks_url}/api/2.0/permissions/pipelines"
    headers = {
        "Authorization": f"Bearer {api_token}",
        "Content-Type": "application/json"
    }

    _acl_for_env = _PIPELINE_ACL_BY_ENV.get(v_environment_name)
    if _acl_for_env is None:
        # Do not send an empty access control list: a PUT replaces the existing
        # permissions, so an empty body could strip them instead of leaving them alone.
        print(f"Error unknown environment '{v_environment_name}': pipeline permissions were not changed")
    else:
        _owner_user, _manager_group = _acl_for_env
        payload = {
            "access_control_list": [
                {
                    "user_name": _owner_user,
                    "permission_level": "IS_OWNER"
                },
                {
                    "group_name": _manager_group,
                    "permission_level": "CAN_MANAGE"
                }
            ]
        }

        # Send both requests first, then report, so a problem while printing one
        # response can never prevent the other pipeline from being updated.
        _permission_responses = [
            (
                _pipeline_label,
                requests.put(
                    f"{permissions_api_url}/{_pipeline_id}",
                    headers=headers,
                    data=json.dumps(payload)
                ),
            )
            for _pipeline_label, _pipeline_id in (
                ("gateway", v_gateway_pipeline_id),
                ("ingestion", ingestion_pipeline_id),
            )
        ]

        for _pipeline_label, _response in _permission_responses:
            print("Create response:", _response.status_code)
            # Error responses are not guaranteed to be JSON; fall back to the raw text.
            try:
                print(_response.json())
            except ValueError:
                print(_response.text)
            if not _response.ok:
                print(f"Error setting permissions on {_pipeline_label} pipeline (HTTP {_response.status_code})")

except Exception as e:
    print(f"Error {e}")

In [0]:
# Hand both pipeline IDs back to the calling notebook/job as a JSON string.
output = dict(
    gateway_pipeline_id=v_gateway_pipeline_id,
    ingestion_pipeline_id=ingestion_pipeline_id,
)

_exit_payload = json.dumps(output)
dbutils.notebook.exit(_exit_payload)