In [0]:
# Notebook parameters. Both are free-text widgets that default to an empty
# string; they are read back with dbutils.widgets.get() further down.
dbutils.widgets.text("p_environment_name", "", "Environment Name")
dbutils.widgets.text("p_product_code", "", "Product Code")

In [0]:
from pyspark.sql import functions as F, types as T
from pyspark.sql.functions import col
from functools import reduce
import uuid, datetime, os, json, re
import hashlib, uuid, datetime, os
import re
import json

In [0]:
%run "../common/DataFabricCommonFunctions"

In [0]:
# Read the notebook parameters, trimming surrounding whitespace.
v_environment_name = dbutils.widgets.get("p_environment_name").strip()
v_product_code = dbutils.widgets.get("p_product_code").strip()

# Resolve the locations configured for this environment
# (get_locations_by_env comes from the %run'd common notebook).
v_locations_dict = get_locations_by_env(v_environment_name)

v_file_location = v_locations_dict["source"]
v_archive_location = v_locations_dict["archive"]
v_key_Vault_location = v_locations_dict["keyvault"]

In [0]:
# Load the cfg.* configuration tables that the upsert cells below compare
# the incoming CSV rows against.
v_df_client_source = execute_dbconfig_sql_query("SELECT * from cfg.Client", v_environment_name)
v_df_product_source = execute_dbconfig_sql_query("SELECT * from cfg.Product", v_environment_name)
v_df_facility_source = execute_dbconfig_sql_query("SELECT * from cfg.Facility", v_environment_name)
v_df_productclient_source = execute_dbconfig_sql_query("SELECT * from cfg.ProductClient", v_environment_name)
v_df_productclientfacility_source = execute_dbconfig_sql_query("SELECT * from cfg.ProductClientFacility", v_environment_name)
v_df_productinstance_source = execute_dbconfig_sql_query("SELECT * from cfg.ProductInstance", v_environment_name)

# Resolve the product code widget to its InternalProductId. first() returns
# None when nothing matches, so fail with an actionable message instead of
# "'NoneType' object is not subscriptable".
v_product_row = v_df_product_source.filter(col("ProductCode") == v_product_code).first()
if v_product_row is None:
    raise ValueError(
        f"Product code '{v_product_code}' was not found in cfg.Product; check the p_product_code widget value."
    )
v_input_internal_product_id = v_product_row["InternalProductId"]

Set up variables for paths and expected schemas

In [0]:
# The environment name comes from a widget; double any single quotes so it
# cannot terminate the SQL string literal early.
v_environment_name_sql = v_environment_name.replace("'", "''")
v_attributesystem_df = execute_dbconfig_sql_query(
    f"SELECT * from cfg.AttributeSystem where Environment = '{v_environment_name_sql}'",
    v_environment_name,
)


def get_system_attribute(attribute_name):
    """Return the AttributeValue stored for ``attribute_name`` in ``v_attributesystem_df``.

    Args:
        attribute_name: value to match against the AttributeName column.

    Returns:
        The AttributeValue of the first matching row.

    Raises:
        ValueError: if no row has that AttributeName (previously this surfaced
            as "'NoneType' object is not subscriptable").
    """
    attribute_row = v_attributesystem_df.filter(col("AttributeName") == attribute_name).first()
    if attribute_row is None:
        raise ValueError(
            f"AttributeName '{attribute_name}' was not found in cfg.AttributeSystem "
            f"for environment '{v_environment_name}'."
        )
    return attribute_row["AttributeValue"]


# Folder paths are stored relative to the source location.
v_source_directory = v_file_location + get_system_attribute("AdlsImplementationConfigPath")
v_error_path = v_file_location + get_system_attribute("AdlsImplementationConfigErrorPath")
v_archive_path = v_file_location + get_system_attribute("AdlsImplementationConfigArchivePath")

# Expected column lists for each kind of implementation-config CSV.
v_productclient_schema = get_system_attribute("ProductClientSchema")
v_productclientfacility_schema = get_system_attribute("ProductClientFacilitySchema")
v_productinstance_schema = get_system_attribute("ProductInstanceSchema")
v_client_schema = get_system_attribute("ClientSchema")
v_facility_schema = get_system_attribute("FacilitySchema")

In [0]:
# Header-matching mode used by the routing loop below: when True a file matches
# a schema only if its normalized header equals the schema's normalized column
# list in the same order; when False only the set of column names must match.
# NOTE(review): read_single_with_schema applies the expected column list as the
# read schema - presumably by position - so confirm before switching to False.
v_require_exact_order = True

def coerce_columns(schema_val):
    """Turn a configured schema value into a list of column names.

    Accepted inputs:
      * a list/tuple whose items are all strings (returned as a new list);
      * a bracketed string such as ``"['A', 'B']"`` - parsed as JSON after
        swapping single quotes for double quotes, falling back to a manual
        comma split (with surrounding quotes stripped) if that fails;
      * a plain comma-separated string such as ``"A, B"``.

    Empty names are dropped. Any other input raises ValueError.
    """
    if isinstance(schema_val, str):
        text = schema_val.strip()
        if text.startswith("[") and text.endswith("]"):
            try:
                decoded = json.loads(text.replace("'", '"'))
            except Exception:
                # Not valid JSON (e.g. unquoted names); use the manual split below.
                decoded = None
            if isinstance(decoded, list):
                return list(map(str, decoded))
            tokens = (tok.strip().strip('"').strip("'") for tok in text[1:-1].split(","))
        else:
            tokens = (tok.strip() for tok in text.split(","))
        return [tok for tok in tokens if tok]

    if isinstance(schema_val, (list, tuple)) and all(isinstance(c, str) for c in schema_val):
        return list(schema_val)

    raise ValueError("Each v_*_schema must be a list[str] or a string describing columns")


# Expected column names per schema key, parsed from the configured values.
v_schema_cols_map = {
    schema_key: coerce_columns(raw_schema)
    for schema_key, raw_schema in (
        ("client", v_client_schema),
        ("facility", v_facility_schema),
        ("productclient", v_productclient_schema),
        ("productclientfacility", v_productclientfacility_schema),
        ("productinstance", v_productinstance_schema),
    )
}

# Zero-width space / non-joiner / joiner and the BOM character.
ZERO_WIDTH = "\u200B\u200C\u200D\uFEFF"
ZW_RE = re.compile("[" + ZERO_WIDTH + "]")


def clean_name(s: str) -> str:
    """Normalize a column name for comparison.

    Non-breaking spaces become regular spaces, zero-width characters are
    removed, surrounding whitespace and then surrounding double and single
    quotes are stripped, runs of whitespace collapse to one space, and the
    result is lower-cased. ``None`` yields an empty string.
    """
    if s is None:
        return ""
    without_invisibles = ZW_RE.sub("", s.replace("\xa0", " "))
    unquoted = without_invisibles.strip().strip('"').strip("'")
    return re.sub(r"\s+", " ", unquoted).lower()
 
def normalize_cols(cols):
    """Return the cleaned form of every column name in ``cols``, in order."""
    return list(map(clean_name, cols))
 
# Per schema key: the original column names ("orig"), their normalized form in
# order ("seq") and the same normalized names as a set ("set").
v_schema_norm_map = {
    schema_key: {"orig": schema_cols, "seq": norm_seq, "set": set(norm_seq)}
    for schema_key, schema_cols, norm_seq in (
        (k, c, normalize_cols(c)) for k, c in v_schema_cols_map.items()
    )
}
 
def csv_schema_for(cols):
    """Build an all-string, nullable StructType for ``cols`` plus ``__corrupt_record``."""
    field_names = [*cols, "__corrupt_record"]
    return T.StructType([T.StructField(name, T.StringType(), True) for name in field_names])

# Options passed to spark.read.format("csv").options(...) for every read in
# this cell. The header probe in the routing loop passes everything except
# "columnNameOfCorruptRecord"; read_single_with_schema passes the full set.
CSV_OPTS = {
    "header": "true",
    "inferSchema": "false",
    "encoding": "UTF-8",
    "sep": ",",
    "quote": '"',
    "escape": '"',
    "multiLine": "true",
    "ignoreLeadingWhiteSpace": "true",
    "ignoreTrailingWhiteSpace": "true",
    "mode": "PERMISSIVE",
    # Same name as the extra field that csv_schema_for appends to the schema.
    "columnNameOfCorruptRecord": "__corrupt_record",
}

# List the files under the source directory and keep only paths ending in
# ".csv" (case-insensitive).
v_files_df = (
    spark.read.format("binaryFile")
    .load(v_source_directory)
    .select("path")
    .where(F.lower(F.col("path")).endswith(".csv"))
)
v_paths_list = [file_row["path"] for file_row in v_files_df.collect()]
print(f"Found {len(v_paths_list)} CSV files in {v_source_directory} (no subfolders)")

# Route every CSV file to one of the expected schemas by comparing its
# normalized header with each schema's normalized column list. One dict per
# file is appended to v_routing_rows_list, whether it matched, did not match,
# or could not be read at all.
v_routing_rows_list = []
for p in v_paths_list:
    try:
        # Only the header is needed here, so the corrupt-record option is left
        # out and the read is limited to a single row.
        v_head_df = (spark.read.format("csv")
                   .options(**{k: v for k, v in CSV_OPTS.items() if k != "columnNameOfCorruptRecord"})
                   .load(p)
                   .limit(1))
        v_incoming_cols_list = v_head_df.columns
        v_inc_norm_seq_list = normalize_cols(v_incoming_cols_list)
        v_inc_norm_set = set(v_inc_norm_seq_list)

        # First schema (in map order) whose columns match the header wins.
        v_matched_key = None
        for key, info in v_schema_norm_map.items():
            if v_require_exact_order:
                v_is_match = v_inc_norm_seq_list == info["seq"]
            else:
                v_is_match = v_inc_norm_set == info["set"]
            if v_is_match:
                v_matched_key = key
                break

        # For unmatched files, describe the difference against the schema with
        # the largest column overlap to help diagnose the header problem.
        v_diff = None
        if v_matched_key is None and v_incoming_cols_list:
            v_best_key, v_best_overlap = None, -1
            for key, info in v_schema_norm_map.items():
                overlap = len(v_inc_norm_set & info["set"])
                if overlap > v_best_overlap:
                    v_best_key, v_best_overlap = key, overlap
            if v_best_key:
                # Compare against the *closest* schema. The previous code used
                # the leftover loop variable `info`, i.e. always the last schema
                # in the map, so missing/extra did not describe `closest`.
                v_best_set = v_schema_norm_map[v_best_key]["set"]
                v_missing_list = sorted(v_best_set - v_inc_norm_set)
                v_extra_list = sorted(v_inc_norm_set - v_best_set)
                v_diff = f"closest={v_best_key}; missing={v_missing_list}; extra={v_extra_list}"

        v_routing_rows_list.append({
            "path": p,
            "matched_schema": v_matched_key,
            "incoming_cols_raw": ",".join(v_incoming_cols_list) if v_incoming_cols_list else None,
            "incoming_cols_norm": ",".join(v_inc_norm_seq_list) if v_incoming_cols_list else None,
            "debug": v_diff
        })
    except Exception as e:
        # Best effort: a file whose header cannot be read is recorded with the
        # error text and the loop moves on to the next file.
        v_routing_rows_list.append({
            "path": p,
            "matched_schema": None,
            "incoming_cols_raw": None,
            "incoming_cols_norm": None,
            "debug": f"header_read_error={str(e)}"
        })

# Summary DataFrame of the routing decisions: one all-string row per file.
v_routing_schema = T.StructType([
    T.StructField(field_name, T.StringType(), True)
    for field_name in ("path", "matched_schema", "incoming_cols_raw", "incoming_cols_norm", "debug")
])
v_routing_summary_df = spark.createDataFrame(v_routing_rows_list, v_routing_schema)

# Paths grouped by the schema they matched.
v_paths_by_schema_dict = {
    schema_key: [entry["path"] for entry in v_routing_rows_list if entry["matched_schema"] == schema_key]
    for schema_key in v_schema_cols_map
}
# Files with no usable header (read error or no columns).
v_failed_paths_list = [
    entry["path"] for entry in v_routing_rows_list if entry["incoming_cols_norm"] is None
]
# Files whose header was read but matched none of the schemas.
v_unmatched_paths_list = [
    entry["path"]
    for entry in v_routing_rows_list
    if entry["incoming_cols_norm"] is not None and entry["matched_schema"] is None
]

print("Matched counts:", {schema_key: len(paths) for schema_key, paths in v_paths_by_schema_dict.items()})
print("Unmatched files (read OK):", len(v_unmatched_paths_list))
print("Failed header reads:", len(v_failed_paths_list))

def read_single_with_schema(p, cols):
    """Read one CSV file with the expected all-string schema.

    Args:
        p: path of the CSV file.
        cols: expected column names, used to build the read schema.

    Returns:
        A DataFrame with ``cols`` cast to string, followed by ``__source_file``
        and ``__corrupt_record``. ``__source_file`` comes from
        ``_metadata.file_path`` when ``_metadata`` is listed in the DataFrame's
        columns, otherwise it is the literal path ``p``.
    """
    reader = spark.read.format("csv").options(**CSV_OPTS).schema(csv_schema_for(cols))
    df = reader.load(p)

    if "_metadata" not in df.columns:
        with_source = df.withColumn("__source_file", F.lit(p))
    else:
        with_source = df.select("*", F.col("_metadata.file_path").alias("__source_file"))

    projected = [F.col(name).cast("string") for name in cols]
    projected.extend(["__source_file", "__corrupt_record"])
    return with_source.select(projected)

def combine_group(paths_list, cols):
    """Read every file in ``paths_list`` and union the results by column name.

    With no paths, an empty DataFrame is returned whose string columns are
    ``cols`` followed by ``__source_file`` and ``__corrupt_record``.
    """
    if paths_list:
        frames = [read_single_with_schema(path, cols) for path in paths_list]
        combined = frames[0]
        for frame in frames[1:]:
            combined = combined.unionByName(frame, allowMissingColumns=False)
        return combined

    field_names = [*cols, "__source_file", "__corrupt_record"]
    empty_schema = T.StructType([T.StructField(name, T.StringType(), True) for name in field_names])
    return spark.createDataFrame([], empty_schema)

# One combined DataFrame per schema key, built from the files routed to that
# schema; combine_group returns an empty DataFrame for a schema with no files.
v_client_df                = combine_group(v_paths_by_schema_dict["client"],                v_schema_cols_map["client"])
v_facility_df              = combine_group(v_paths_by_schema_dict["facility"],              v_schema_cols_map["facility"])
v_productclient_df         = combine_group(v_paths_by_schema_dict["productclient"],         v_schema_cols_map["productclient"])
v_productclientfacility_df = combine_group(v_paths_by_schema_dict["productclientfacility"], v_schema_cols_map["productclientfacility"])
v_productinstance_df       = combine_group(v_paths_by_schema_dict["productinstance"],       v_schema_cols_map["productinstance"])

def show_df(name, df):
    """Print the row count of ``df`` and, when it has rows, display the first 20."""
    row_total = df.count()
    print(f"{name}: {row_total} rows")
    if row_total <= 0:
        return
    display(df.limit(20))

In [0]:
def client_sql_text(value):
    """Escape ``value`` for embedding between single quotes in a T-SQL statement.

    Single quotes are doubled, so a CSV value such as ``St. Mary's`` can no
    longer end the string literal early (broken statement / SQL injection).
    """
    return str(value).replace("'", "''")


# Upsert every row of the client CSVs into cfg.Client via cfg.InsertOrUpdateClient.
# Any error stops processing of the remaining rows and is printed (best effort).
try:
    # Iterate the collected rows directly: an empty DataFrame yields no rows, so
    # the former count() pre-check (a second full read) is unnecessary.
    for row in v_client_df.collect():
        v_client_client_code = row["ClientCode"] if row["ClientCode"] is not None else ""
        v_client_client_name = row["ClientName"] if row["ClientName"] is not None else ""
        v_client_salesforce_client_id = row["SalesforceClientId"] if row["SalesforceClientId"] is not None else ""
        v_client_pas = row["PAS"] if row["PAS"] is not None else ""
        v_client_pas_code = row["PASCode"] if row["PASCode"] is not None else ""
        v_client_zone = row["Zone"] if row["Zone"] is not None else ""
        v_client_state = row["State"] if row["State"] is not None else ""
        v_client_zip = row["ZIP"] if row["ZIP"] is not None else ""
        # Read from the file but not passed to the stored procedure.
        v_client_source_client_id = row["SourceClientId"] if row["SourceClientId"] is not None else ""
        v_client_dwh_server_name = row["DWHServerName"] if row["DWHServerName"] is not None else ""
        v_client_dwh_database_name = row["DwhDatabaseName"] if row["DwhDatabaseName"] is not None else ""
        v_client_dwh_database_username = row["DwhDatabaseUserName"] if row["DwhDatabaseUserName"] is not None else ""

        # Existing client? first() returns None when no row matches. Only that
        # case means "insert"; a failing lookup is no longer mistaken for a
        # missing client.
        v_existing_client_row = v_df_client_source.filter(col("ClientCode") == v_client_client_code).first()
        v_derived_internal_client_id = (
            v_existing_client_row["InternalClientId"] if v_existing_client_row is not None else None
        )

        # Arguments shared by the update and insert calls. Every CSV-derived
        # value is quoted and escaped; @SalesforceClientId used to be unquoted
        # in the insert branch only.
        v_client_shared_args = ", ".join([
            f"@SalesforceClientId = '{client_sql_text(v_client_salesforce_client_id)}'",
            f"@ClientCode = '{client_sql_text(v_client_client_code)}'",
            f"@ClientName = '{client_sql_text(v_client_client_name)}'",
            f"@PAS = '{client_sql_text(v_client_pas)}'",
            f"@PASCode = '{client_sql_text(v_client_pas_code)}'",
            f"@Zone = '{client_sql_text(v_client_zone)}'",
            f"@State = '{client_sql_text(v_client_state)}'",
            f"@Zip = '{client_sql_text(v_client_zip)}'",
            f"@DWHServerName = '{client_sql_text(v_client_dwh_server_name)}'",
            f"@DwhDatabaseName = '{client_sql_text(v_client_dwh_database_name)}'",
            f"@DwhDataloadUserName = '{client_sql_text(v_client_dwh_database_username)}'",
        ])

        if v_derived_internal_client_id is not None:
            print(f"Updating cfg.Client for ClientCode {v_client_client_code}")
            v_client_key_args = f"@IsClientExistsInDb = 1, @InternalClientId = '{v_derived_internal_client_id}'"
        else:
            # The next id is only needed for inserts, so it is queried here
            # rather than once per row.
            v_new_internal_client_id = execute_dbconfig_sql_query("Select Max(InternalClientId) + 1 As InternalClientId From cfg.Client Where InternalClientId < 700000", v_environment_name).first()["InternalClientId"]
            if v_new_internal_client_id is None:
                raise ValueError("Could not derive a new InternalClientId: cfg.Client has no rows with InternalClientId < 700000.")
            print(f"Inserting cfg.Client for ClientCode {v_client_client_code}")
            v_client_key_args = f"@IsClientExistsInDb = 0, @InternalClientId = {v_new_internal_client_id}"

        execute_dbconfig_stored_procedure(
            f"EXEC cfg.InsertOrUpdateClient {v_client_key_args}, {v_client_shared_args}",
            v_environment_name,
        )
except Exception as e:
    print(f"Skipping Client. Error: {e}")

In [0]:
def facility_sql_text(value):
    """Escape ``value`` for embedding between single quotes in a T-SQL statement.

    Single quotes are doubled, so a CSV value such as ``St. Mary's`` can no
    longer end the string literal early (broken statement / SQL injection).
    """
    return str(value).replace("'", "''")


# Upsert every row of the facility CSVs into cfg.Facility via
# cfg.InsertOrUpdateFacility. Any error stops processing of the remaining rows
# and is printed (best effort).
try:
    # Iterate the collected rows directly: an empty DataFrame yields no rows, so
    # the former count() pre-check (a second full read) is unnecessary.
    for row in v_facility_df.collect():
        v_facility_client_code = row["ClientCode"] if row["ClientCode"] is not None else ""
        v_facility_client_name = row["ClientName"] if row["ClientName"] is not None else ""
        v_facility_facility_code = row["FacilityCode"] if row["FacilityCode"] is not None else ""
        v_facility_facility_name = row["FacilityName"] if row["FacilityName"] is not None else ""
        v_facility_facility_type = row["FacilityType"] if row["FacilityType"] is not None else ""
        v_facility_salesforce_facility_id = row["SalesforceFacilityId"] if row["SalesforceFacilityId"] is not None else ""
        v_facility_pas = row["PAS"] if row["PAS"] is not None else ""
        v_facility_pas_code = row["PASCode"] if row["PASCode"] is not None else ""
        v_facility_pas_category = row["PASCategory"] if row["PASCategory"] is not None else ""
        v_facility_pas_subcategory = row["PASSubCategory"] if row["PASSubCategory"] is not None else ""
        v_facility_zone = row["Zone"] if row["Zone"] is not None else ""
        v_facility_state = row["State"] if row["State"] is not None else ""
        v_facility_zip = row["ZIP"] if row["ZIP"] is not None else ""
        v_facility_group_name = row["FacilityGroupName"] if row["FacilityGroupName"] is not None else ""

        # The owning client must already exist; report that clearly instead of
        # failing with "'NoneType' object is not subscriptable".
        v_facility_client_row = v_df_client_source.filter(col("ClientCode") == v_facility_client_code).first()
        if v_facility_client_row is None:
            raise ValueError(f"The client code {v_facility_client_code} does not exist in cfg.Client. Please create a record there first.")
        v_derived_internal_client_id = v_facility_client_row["InternalClientId"]

        # Existing facility for this client? The name is escaped so an
        # apostrophe no longer breaks the query. Previously such a failure was
        # swallowed and treated as "facility not found", leading to an insert.
        v_existing_facility_row = execute_dbconfig_sql_query(
            f"Select InternalFacilityId From cfg.Facility Where InternalClientId = {v_derived_internal_client_id} and FacilityName = '{facility_sql_text(v_facility_facility_name)}'",
            v_environment_name,
        ).first()
        v_derived_internal_facility_id = (
            v_existing_facility_row["InternalFacilityId"] if v_existing_facility_row is not None else None
        )

        if v_derived_internal_facility_id is not None:
            print(f"Updating cfg.Facility for FacilityName {v_facility_facility_name}")
            v_facility_exists_flag = 1
            v_facility_target_id = v_derived_internal_facility_id
        else:
            # The next id is only needed for inserts, so it is queried here
            # rather than once per row.
            v_new_internal_facility_id = execute_dbconfig_sql_query("Select Max(InternalFacilityId) + 1 As InternalFacilityId From cfg.Facility Where InternalFacilityId < 20000", v_environment_name).first()["InternalFacilityId"]
            if v_new_internal_facility_id is None:
                raise ValueError("Could not derive a new InternalFacilityId: cfg.Facility has no rows with InternalFacilityId < 20000.")
            print(f"Inserting cfg.Facility for FacilityName {v_facility_facility_name}")
            v_facility_exists_flag = 0
            v_facility_target_id = v_new_internal_facility_id

        # Every CSV-derived value is quoted and escaped. @SalesforceFacilityId
        # used to be interpolated without quotes, which broke the statement for
        # empty values and allowed injection.
        v_facility_exec_args = ", ".join([
            f"@IsFacilityExistsInDb = {v_facility_exists_flag}",
            f"@InternalClientId = {v_derived_internal_client_id}",
            f"@InternalFacilityId = {v_facility_target_id}",
            f"@SalesforceFacilityId = '{facility_sql_text(v_facility_salesforce_facility_id)}'",
            f"@FacilityCode = '{facility_sql_text(v_facility_facility_code)}'",
            f"@FacilityName = '{facility_sql_text(v_facility_facility_name)}'",
            f"@FacilityType = '{facility_sql_text(v_facility_facility_type)}'",
            f"@PAS = '{facility_sql_text(v_facility_pas)}'",
            f"@PASCode = '{facility_sql_text(v_facility_pas_code)}'",
            f"@PASCategory = '{facility_sql_text(v_facility_pas_category)}'",
            f"@PASSubCategory = '{facility_sql_text(v_facility_pas_subcategory)}'",
            f"@Zone = '{facility_sql_text(v_facility_zone)}'",
            f"@State = '{facility_sql_text(v_facility_state)}'",
            f"@Zip = '{facility_sql_text(v_facility_zip)}'",
            f"@FacilityGroupName = '{facility_sql_text(v_facility_group_name)}'",
        ])
        execute_dbconfig_stored_procedure(
            f"EXEC cfg.InsertOrUpdateFacility {v_facility_exec_args}",
            v_environment_name,
        )
except Exception as e:
    print(f"Skipping Facility. Error: {e}")

In [0]:
def productclient_sql_text(value):
    """Escape ``value`` for embedding between single quotes in a T-SQL statement.

    Single quotes are doubled, so a CSV value containing an apostrophe can no
    longer end the string literal early (broken statement / SQL injection).
    """
    return str(value).replace("'", "''")


# Upsert every row of the product-client CSVs into cfg.ProductClient via
# cfg.InsertOrUpdateProductClient for the product selected by the widget.
# Any error stops processing of the remaining rows and is printed (best effort).
try:
    # Iterate the collected rows directly: an empty DataFrame yields no rows, so
    # the former count() pre-check (a second full read) is unnecessary.
    for row in v_productclient_df.collect():
        v_productclient_client_code = row["ClientCode"] if row["ClientCode"] is not None else ""
        v_productclient_client_name = row["ClientName"] if row["ClientName"] is not None else ""
        v_productclient_source_client_id = row["SourceClientId"] if row["SourceClientId"] is not None else ""
        v_productclient_source_server_name1 = row["SourceServerName1"] if row["SourceServerName1"] is not None else ""
        v_productclient_source_database_name1 = row["SourceDatabaseName1"] if row["SourceDatabaseName1"] is not None else ""
        v_productclient_source_server_name2 = row["SourceServerName2"] if row["SourceServerName2"] is not None else ""
        v_productclient_source_database_name2 = row["SourceDatabaseName2"] if row["SourceDatabaseName2"] is not None else ""
        v_productclient_source_server_name3 = row["SourceServerName3"] if row["SourceServerName3"] is not None else ""
        v_productclient_source_database_name3 = row["SourceDatabaseName3"] if row["SourceDatabaseName3"] is not None else ""
        v_productclient_source_server_name4 = row["SourceServerName4"] if row["SourceServerName4"] is not None else ""
        v_productclient_source_database_name4 = row["SourceDatabaseName4"] if row["SourceDatabaseName4"] is not None else ""
        v_productclient_source_server_name5 = row["SourceServerName5"] if row["SourceServerName5"] is not None else ""
        v_productclient_source_database_name5 = row["SourceDatabaseName5"] if row["SourceDatabaseName5"] is not None else ""
        v_productclient_process_all_facilities_together = row["ProcessAllFacilitiesTogether"] if row["ProcessAllFacilitiesTogether"] is not None else ""
        v_productclient_wait_for_all_facilities = row["WaitForAllFacilities"] if row["WaitForAllFacilities"] is not None else ""
        v_productclient_min_watermark_value = row["MinWatermarkValue"] if row["MinWatermarkValue"] is not None else ""
        v_productclient_datafactory_name = row["DatafactoryName"] if row["DatafactoryName"] is not None else ""
        v_productclient_databricks_cluster_id = row["DatabricksClusterId"] if row["DatabricksClusterId"] is not None else ""
        v_productclient_deployment_group_number = row["DeploymentGroupNumber"] if row["DeploymentGroupNumber"] is not None else ""

        # The client must already exist in cfg.Client. first() returns None
        # when nothing matches; other lookup failures keep their own message.
        v_productclient_client_row = v_df_client_source.filter(col("ClientCode") == v_productclient_client_code).first()
        if v_productclient_client_row is None:
            raise ValueError(f"The client code {v_productclient_client_code} does not exist in cfg.Client. Please create a record there first.")
        v_derived_internal_client_id = v_productclient_client_row["InternalClientId"]

        # Is there already a cfg.ProductClient row for this product and client?
        v_existing_productclient_row = (
            v_df_productclient_source
            .filter(col("InternalClientId") == f"{v_derived_internal_client_id}")
            .filter(col("InternalProductId") == f"{v_input_internal_product_id}")
            .first()
        )
        v_derived_productclient_internal_client_id = (
            v_existing_productclient_row["InternalClientId"] if v_existing_productclient_row is not None else None
        )

        if v_derived_productclient_internal_client_id is not None:
            print(f"Updating cfg.ProductClient for Client {v_derived_productclient_internal_client_id}")
            v_productclient_exists_flag = 1
        else:
            print(f"Inserting cfg.ProductClient for Client {v_derived_internal_client_id}")
            v_productclient_exists_flag = 0

        # Update and insert differ only in the exists flag; every CSV-derived
        # value is escaped before being placed inside its quotes.
        v_productclient_exec_args = ", ".join([
            f"@IsProductClientExistsInDb = {v_productclient_exists_flag}",
            f"@InternalProductId = '{v_input_internal_product_id}'",
            f"@InternalClientId = '{v_derived_internal_client_id}'",
            f"@SourceClientId = '{productclient_sql_text(v_productclient_source_client_id)}'",
            f"@SourceServerName1 = '{productclient_sql_text(v_productclient_source_server_name1)}'",
            f"@SourceDatabaseName1 = '{productclient_sql_text(v_productclient_source_database_name1)}'",
            f"@SourceServerName2 = '{productclient_sql_text(v_productclient_source_server_name2)}'",
            f"@SourceDatabaseName2 = '{productclient_sql_text(v_productclient_source_database_name2)}'",
            f"@SourceServerName3 = '{productclient_sql_text(v_productclient_source_server_name3)}'",
            f"@SourceDatabaseName3 = '{productclient_sql_text(v_productclient_source_database_name3)}'",
            f"@SourceServerName4 = '{productclient_sql_text(v_productclient_source_server_name4)}'",
            f"@SourceDatabaseName4 = '{productclient_sql_text(v_productclient_source_database_name4)}'",
            f"@SourceServerName5 = '{productclient_sql_text(v_productclient_source_server_name5)}'",
            f"@SourceDatabaseName5 = '{productclient_sql_text(v_productclient_source_database_name5)}'",
            f"@ProcessAllFacilitiesTogether = '{productclient_sql_text(v_productclient_process_all_facilities_together)}'",
            f"@WaitForAllFacilities = '{productclient_sql_text(v_productclient_wait_for_all_facilities)}'",
            f"@MinWatermarkValue = '{productclient_sql_text(v_productclient_min_watermark_value)}'",
            f"@DatafactoryName = '{productclient_sql_text(v_productclient_datafactory_name)}'",
            f"@DatabricksClusterId = '{productclient_sql_text(v_productclient_databricks_cluster_id)}'",
            f"@DeploymentGroupNumber = '{productclient_sql_text(v_productclient_deployment_group_number)}'",
        ])
        execute_dbconfig_stored_procedure(
            f"EXEC cfg.InsertOrUpdateProductClient {v_productclient_exec_args}",
            v_environment_name,
        )
except Exception as e:
    print(f"Skipping ProductClient. Error: {e}")

In [0]:
def productclientfacility_sql_text(value):
    """Escape ``value`` for embedding between single quotes in a T-SQL statement.

    Single quotes are doubled, so a CSV value containing an apostrophe can no
    longer end the string literal early (broken statement / SQL injection).
    """
    return str(value).replace("'", "''")


# Upsert every row of the product-client-facility CSVs into
# cfg.ProductClientFacility via cfg.InsertOrUpdateProductClientFacility for the
# product selected by the widget. Any error stops processing of the remaining
# rows and is printed (best effort).
try:
    # Iterate the collected rows directly: an empty DataFrame yields no rows, so
    # the former count() pre-check (a second full read) is unnecessary.
    for row in v_productclientfacility_df.collect():
        v_productclientfacility_client_code = row["ClientCode"] if row["ClientCode"] is not None else ""
        v_productclientfacility_client_name = row["ClientName"] if row["ClientName"] is not None else ""
        v_productclientfacility_facility_code = row["FacilityCode"] if row["FacilityCode"] is not None else ""
        v_productclientfacility_facility_name = row["FacilityName"] if row["FacilityName"] is not None else ""
        v_productclientfacility_source_facility_id = row["SourceFacilityId"] if row["SourceFacilityId"] is not None else ""
        # Fixed: the null check used to test SourceFacilityId instead of
        # SourceFacilityCode.
        v_productclientfacility_source_facility_code = row["SourceFacilityCode"] if row["SourceFacilityCode"] is not None else ""
        v_productclientfacility_source_server_name1 = row["SourceServerName1"] if row["SourceServerName1"] is not None else ""
        v_productclientfacility_source_database_name1 = row["SourceDatabaseName1"] if row["SourceDatabaseName1"] is not None else ""
        v_productclientfacility_source_server_name2 = row["SourceServerName2"] if row["SourceServerName2"] is not None else ""
        v_productclientfacility_source_database_name2 = row["SourceDatabaseName2"] if row["SourceDatabaseName2"] is not None else ""
        v_productclientfacility_source_server_name3 = row["SourceServerName3"] if row["SourceServerName3"] is not None else ""
        v_productclientfacility_source_database_name3 = row["SourceDatabaseName3"] if row["SourceDatabaseName3"] is not None else ""
        v_productclientfacility_source_server_name4 = row["SourceServerName4"] if row["SourceServerName4"] is not None else ""
        v_productclientfacility_source_database_name4 = row["SourceDatabaseName4"] if row["SourceDatabaseName4"] is not None else ""
        v_productclientfacility_project_id = row["ProjectId"] if row["ProjectId"] is not None else ""
        v_productclientfacility_instance_id = row["InstanceId"] if row["InstanceId"] is not None else ""
        v_productclientfacility_data_source_id = row["DataSourceId"] if row["DataSourceId"] is not None else ""
        v_productclientfacility_extract_file_name_pattern = row["ExtractFileNamePattern"] if row["ExtractFileNamePattern"] is not None else ""

        # Both the client and the facility must already exist. first() returns
        # None when nothing matches; other lookup failures keep their own message.
        v_productclientfacility_client_row = v_df_client_source.filter(col("ClientCode") == v_productclientfacility_client_code).first()
        if v_productclientfacility_client_row is None:
            raise ValueError(f"The client code {v_productclientfacility_client_code} doesn't exist in cfg.Client. Please create it there first.")
        v_derived_internal_client_id = v_productclientfacility_client_row["InternalClientId"]

        v_productclientfacility_facility_row = (
            v_df_facility_source
            .filter(col("InternalClientId") == f"{v_derived_internal_client_id}")
            .filter(col("FacilityName") == v_productclientfacility_facility_name)
            .first()
        )
        if v_productclientfacility_facility_row is None:
            # Message corrected: the missing record is in cfg.Facility, not cfg.Client.
            raise ValueError(f"The facility name {v_productclientfacility_facility_name} doesn't exist in cfg.Facility. Please create it there first.")
        v_derived_internal_facility_id = v_productclientfacility_facility_row["InternalFacilityId"]

        v_source_table_count = (
            v_df_productclientfacility_source
            .filter(col("InternalProductId") == f"{v_input_internal_product_id}")
            .filter(col("InternalClientId") == f"{v_derived_internal_client_id}")
            .filter(col("InternalFacilityId") == f"{v_derived_internal_facility_id}")
            .count()
        )

        if v_source_table_count > 1:
            # Ambiguous target: report it and leave the existing rows untouched.
            print("There are more than 1 records of this Client and Facility combination in cfg.ProductClientFacility")
            continue

        if v_source_table_count == 1:
            print(f"Updating cfg.ProductClientFacility for FacilityName {v_productclientfacility_facility_name}")
            v_productclientfacility_exists_flag = 1
        else:
            # Client and facility ids are guaranteed by the checks above, so the
            # former "combination doesn't exist" branch was unreachable.
            print(f"Inserting into cfg.ProductClientFacility for FacilityName {v_productclientfacility_facility_name}")
            v_productclientfacility_exists_flag = 0

        # NOTE(review): @SourceFacilityCode is now fed from the SourceFacilityCode
        # column. It used to receive FacilityCode while the SourceFacilityCode
        # value read above was never used - confirm this matches the procedure's
        # intent. Every CSV-derived value is escaped before being quoted.
        v_productclientfacility_exec_args = ", ".join([
            f"@IsProductClientFacilityExistsInDb = {v_productclientfacility_exists_flag}",
            f"@InternalProductId = {v_input_internal_product_id}",
            f"@InternalClientId = {v_derived_internal_client_id}",
            f"@InternalFacilityId = {v_derived_internal_facility_id}",
            f"@SourceFacilityId = '{productclientfacility_sql_text(v_productclientfacility_source_facility_id)}'",
            f"@SourceFacilityCode = '{productclientfacility_sql_text(v_productclientfacility_source_facility_code)}'",
            f"@SourceServerName1 = '{productclientfacility_sql_text(v_productclientfacility_source_server_name1)}'",
            f"@SourceDatabaseName1 = '{productclientfacility_sql_text(v_productclientfacility_source_database_name1)}'",
            f"@SourceServerName2 = '{productclientfacility_sql_text(v_productclientfacility_source_server_name2)}'",
            f"@SourceDatabaseName2 = '{productclientfacility_sql_text(v_productclientfacility_source_database_name2)}'",
            f"@SourceServerName3 = '{productclientfacility_sql_text(v_productclientfacility_source_server_name3)}'",
            f"@SourceDatabaseName3 = '{productclientfacility_sql_text(v_productclientfacility_source_database_name3)}'",
            f"@SourceServerName4 = '{productclientfacility_sql_text(v_productclientfacility_source_server_name4)}'",
            f"@SourceDatabaseName4 = '{productclientfacility_sql_text(v_productclientfacility_source_database_name4)}'",
            f"@ProjectId = '{productclientfacility_sql_text(v_productclientfacility_project_id)}'",
            f"@InstanceId = '{productclientfacility_sql_text(v_productclientfacility_instance_id)}'",
            f"@DataSourceId = '{productclientfacility_sql_text(v_productclientfacility_data_source_id)}'",
            f"@ExtractFileNamePattern = '{productclientfacility_sql_text(v_productclientfacility_extract_file_name_pattern)}'",
        ])
        execute_dbconfig_stored_procedure(
            f"EXEC cfg.InsertOrUpdateProductClientFacility {v_productclientfacility_exec_args}",
            v_environment_name,
        )
except Exception as e:
    print(f"Skipping ProductClientFacility. Error: {e}")

In [0]:
try:
    if v_productinstance_df.count() > 0:
        for row in v_productinstance_df.collect():
            v_productinstance_internal_product_id = row["InternalProductId"] if row["InternalProductId"] is not None else ""
            v_productinstance_data_source_id = row["DataSourceId"] if row["DataSourceId"] is not None else ""
            v_productinstance_source_server_name1 = row["SourceServerName1"] if row["SourceServerName1"] is not None else ""
            v_productinstance_source_database_name1 = row["SourceDatabaseName1"] if row["SourceDatabaseName1"] is not None else ""
            v_productinstance_source_server_name2 = row["SourceServerName2"] if row["SourceServerName2"] is not None else ""
            v_productinstance_source_database_name2 = row["SourceDatabaseName2"] if row["SourceDatabaseName2"] is not None else ""
            v_productinstance_source_server_name3 = row["SourceServerName3"] if row["SourceServerName3"] is not None else ""
            v_productinstance_source_database_name3 = row["SourceDatabaseName3"] if row["SourceDatabaseName3"] is not None else ""
            v_productinstance_source_server_name4 = row["SourceServerName4"] if row["SourceServerName4"] is not None else ""
            v_productinstance_source_database_name4 = row["SourceDatabaseName4"] if row["SourceDatabaseName4"] is not None else ""
            v_productinstance_source_server_name5 = row["SourceServerName5"] if row["SourceServerName5"] is not None else ""
            v_productinstance_source_database_name5 = row["SourceDatabaseName5"] if row["SourceDatabaseName5"] is not None else ""
            v_productinstance_min_watermark_value = row["MinWatermarkValue"] if row["MinWatermarkValue"] is not None else ""
            v_productinstance_datafactory_name = row["DatafactoryName"] if row["DatafactoryName"] is not None else ""
            v_productinstance_databricks_cluster_name = row["DatabricksClusterName"] if row["DatabricksClusterName"] is not None else ""
            v_source_table_count = v_df_productinstance_source.filter(col("InternalProductId") == f"{v_productinstance_internal_product_id}").filter(col("SourceServerName1") == f"{v_productinstance_source_server_name1}").filter(col("SourceDatabaseName1") == f"{v_productinstance_source_database_name1}").count()

            if v_source_table_count > 1:
                print("There are more than 1 records of this Product and SourceServerName1 in cfg.ProductInstance")
            elif v_source_table_count == 1:
                print(f"Updating cfg.ProductInstance for this Product {v_productinstance_internal_product_id} and SourceServerName1 {v_productinstance_source_server_name1}")
                execute_dbconfig_stored_procedure(
                    f"""
                    EXEC cfg.InsertOrUpdateProductInstance @IsProductInstanceExists = 1, @InternalProductId = {v_input_internal_product_id}, @DataSourceId = {v_productinstance_data_source_id}, @SourceServerName1 = '{v_productinstance_source_server_name1}', @SourceDatabaseName1 = '{v_productinstance_source_database_name1}', @SourceServerName2 = '{v_productinstance_source_server_name2}', @SourceDatabaseName2 = '{v_productinstance_source_database_name2}', @SourceServerName3 = '{v_productinstance_source_server_name3}', @SourceDatabaseName3 = '{v_productinstance_source_database_name3}', @SourceServerName4 = '{v_productinstance_source_server_name4}', @SourceDatabaseName4 = '{v_productinstance_source_database_name4}', @SourceServerName5 = '{v_productinstance_source_server_name5}', @SourceDatabaseName5 = '{v_productinstance_source_database_name5}', @MinWatermarkValue = '{v_productinstance_min_watermark_value}', @DatafactoryName = '{v_productinstance_datafactory_name}', @DatabricksClusterName = '{v_productinstance_databricks_cluster_name}'
                    """, v_environment_name
                )
            else:
                print(f"Inserting cfg.ProductInstance for this Product {v_productinstance_internal_product_id} and SourceServerName1 {v_productinstance_source_server_name1}")
                execute_dbconfig_stored_procedure(
                    f"""
                    EXEC cfg.InsertOrUpdateProductInstance @IsProductInstanceExists = 0, @InternalProductId = {v_input_internal_product_id}, @DataSourceId = {v_productinstance_data_source_id}, @SourceServerName1 = '{v_productinstance_source_server_name1}', @SourceDatabaseName1 = '{v_productinstance_source_database_name1}', @SourceServerName2 = '{v_productinstance_source_server_name2}', @SourceDatabaseName2 = '{v_productinstance_source_database_name2}', @SourceServerName3 = '{v_productinstance_source_server_name3}', @SourceDatabaseName3 = '{v_productinstance_source_database_name3}', @SourceServerName4 = '{v_productinstance_source_server_name4}', @SourceDatabaseName4 = '{v_productinstance_source_database_name4}', @SourceServerName5 = '{v_productinstance_source_server_name5}', @SourceDatabaseName5 = '{v_productinstance_source_database_name5}', @MinWatermarkValue = '{v_productinstance_min_watermark_value}', @DatafactoryName = '{v_productinstance_datafactory_name}', @DatabricksClusterName = '{v_productinstance_databricks_cluster_name}'
                    """, v_environment_name
                )
except Exception as e:
    print(f"Skipping ProductInstance. Error: {e}")

In [0]:
def _basename(p: str) -> str:
    return p.split("/")[-1]

def _join_path(dir_path: str, filename: str) -> str:
    return f"{dir_path.rstrip('/')}/{filename}"

def _unique_dest(base_dir: str, filename: str) -> str:
    """Build a destination path under ``base_dir`` that is unlikely to collide.

    The result has the form ``<base_dir>/<name>__<timestamp>__<hex8><ext>``,
    where ``<timestamp>`` is the current UTC time down to milliseconds and
    ``<hex8>`` is the first 8 hex characters of a random UUID.
    """
    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC "now"
    # formats to the same string. [:-3] trims microseconds to milliseconds.
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%S%f")[:-3]
    name, ext = os.path.splitext(filename)
    return _join_path(base_dir, f"{name}__{ts}__{uuid.uuid4().hex[:8]}{ext}")

def _safe_mv(src: str, dest_dir: str):
    """Move ``src`` into ``dest_dir``, retrying under a unique name on failure.

    Returns a ``(status, dest_path, error)`` tuple where ``status`` is one of:

    * ``"no_op"``         - ``src`` already is the target path; nothing is moved.
    * ``"moved"``         - moved under its original file name.
    * ``"moved_renamed"`` - the first move raised, the retry with a name from
      ``_unique_dest`` succeeded; ``error`` carries the first failure text.
    * ``"failed"``        - both attempts raised; ``dest_path`` is ``None`` and
      ``error`` holds both messages joined by ``" || "``.
    """
    file_name = _basename(src)
    target = _join_path(dest_dir, file_name)

    # Guard: moving a file onto itself is pointless.
    if target == src:
        return ("no_op", target, "src_equals_dest")

    # First attempt: keep the original file name.
    try:
        dbutils.fs.mv(src, target)
    except Exception as first_err:
        first_msg = str(first_err)
    else:
        return ("moved", target, None)

    # Second attempt: any failure above is treated as a possible name clash.
    try:
        renamed_target = _unique_dest(dest_dir, file_name)
        dbutils.fs.mv(src, renamed_target)
    except Exception as second_err:
        return ("failed", None, f"{first_msg} || {str(second_err)}")
    return ("moved_renamed", renamed_target, f"existing_name_or_other: {first_msg}")

# Print a summary of what is about to be moved before touching any file.
total_matched = sum(map(len, v_paths_by_schema_dict.values()))
print(f"About to move: matched={total_matched}, unmatched={len(v_unmatched_paths_list)}, failed_header_reads={len(v_failed_paths_list)}")
for schema_key, schema_paths in v_paths_by_schema_dict.items():
    if not schema_paths:
        continue
    print(f"  {schema_key}: {len(schema_paths)} files (example: {schema_paths[0]})")
if v_unmatched_paths_list:
    print(f"  unmatched example: {v_unmatched_paths_list[0]}")
if v_failed_paths_list:
    print(f"  failed header read example: {v_failed_paths_list[0]}")

# One result record per input path, in processing order.
v_move_rows_list = []

# Paths grouped under a schema key are moved to v_archive_path.
for schema_key, schema_paths in v_paths_by_schema_dict.items():
    for src_path in schema_paths:
        move_status, moved_to, move_error = _safe_mv(src_path, v_archive_path)
        v_move_rows_list.append(
            dict(path=src_path, category="matched", schema_key=schema_key, dest=moved_to, status=move_status, error=move_error)
        )

# Paths in v_unmatched_paths_list are moved to v_error_path.
for src_path in v_unmatched_paths_list:
    move_status, moved_to, move_error = _safe_mv(src_path, v_error_path)
    v_move_rows_list.append(
        dict(path=src_path, category="unmatched", schema_key=None, dest=moved_to, status=move_status, error=move_error)
    )

# Paths in v_failed_paths_list are not moved; they are only recorded.
v_move_rows_list.extend(
    dict(path=src_path, category="read_failed", schema_key=None, dest=None, status="left_in_place", error="header_read_failed")
    for src_path in v_failed_paths_list
)

# Every column of the result table is a nullable string.
v_move_schema = T.StructType([
    T.StructField(field_name, T.StringType(), True)
    for field_name in ("path", "category", "schema_key", "dest", "status", "error")
])
v_move_results_df = spark.createDataFrame(v_move_rows_list, v_move_schema)

In [0]:
# End the notebook run, passing the string 'Execution Complete' to dbutils.notebook.exit.
# NOTE(review): the upsert cells above catch their own exceptions and only print
# them, so this value is emitted even when an upsert was skipped — confirm that
# callers do not treat it as proof that every step succeeded.
dbutils.notebook.exit('Execution Complete')