In [0]:
# Notebook parameters: read both widgets and trim surrounding whitespace.
v_clientId, v_environment_name = (
    dbutils.widgets.get(widget_name).strip()
    for widget_name in ("p_client_id", "p_environment_name")
)
# Lower-cased environment name, used by later cells to resolve locations.
sub_environment = v_environment_name.lower()

In [0]:
%run "../common/DataFabricCommonFunctions"

In [0]:
# Resolve environment-specific settings. `get_locations`, `keyVault` and
# `environment` are not defined in this notebook; presumably they are published
# by the notebook pulled in via %run -- confirm against DataFabricCommonFunctions.
get_locations(sub_environment)
key_vault_location = keyVault
v_environment = environment

# The SQL host name below uses 'qae' where the widget value is 'qa'.
if sub_environment == 'qa':
    sub_environment = 'qae'

# JDBC connection settings for the configuration database; the password is
# read from the secret scope rather than being stored in the notebook.
jdbc_username = "AnalyticsOwner"
jdbc_password = dbutils.secrets.get(scope = key_vault_location, key = "AnalyticsOwner")
jdbc_hostname = f"{v_environment}-analytics-{sub_environment}-01-sql.database.windows.net"
jdbc_port = 1433
jdbc_database = "db_Configuration"

jdbc_url = f"jdbc:sqlserver://{jdbc_hostname}:{jdbc_port};databaseName={jdbc_database};encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.database.windows.net;loginTimeout=30;"
connection_properties = {
    "user": jdbc_username,
    "password": jdbc_password,
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}

# Base query: every process-list entry that has a second source server name.
query = "select * from cfg.vw_TableProcessList where SourceServerName2 is not null "
if v_clientId:
    # The widget value is free text that gets spliced into the SQL statement.
    # Coerce it to an integer first so that nothing but a numeric literal can
    # reach the query (the filter was already written as an unquoted number).
    try:
        client_id_filter = int(v_clientId)
    except ValueError as exc:
        raise ValueError(
            f"p_client_id must be an integer client id, got {v_clientId!r}"
        ) from exc
    query = query + f"and InternalClientId = {client_id_filter}"

print(query)

list_df = execute_sql_query(query, jdbc_url, connection_properties)
list_df.display()

In [0]:
from pyspark.sql.functions import *
import pyspark

from functools import *

def _tagged_count_df(count_query, row_tags):
    """Run one count query and tag its result with identifying columns.

    Args:
        count_query: SQL text passed to ``spark.sql``.
        row_tags: ordered ``(column_name, value)`` pairs added as literal columns.

    Returns:
        The count DataFrame with the tag columns followed by a ``SourceQuery``
        column holding ``count_query``, or ``None`` when the query fails.
    """
    try:
        count_df = spark.sql(count_query)
        for column_name, value in row_tags:
            count_df = count_df.withColumn(column_name, lit(value))
        return count_df.withColumn("SourceQuery", lit(count_query))
    except Exception as exc:
        # Best effort: one failing query must not stop the comparison, but the
        # failure is reported instead of being swallowed silently.
        print(f"WARNING: count query skipped ({type(exc).__name__}): {exc}")
        return None


uc_source_list = []
client_database_list = []

rows = list_df.collect()

for row in rows:
    source_facility_id = row["SourceFacilityId"]
    source_servername2 = row["SourceServerName2"]
    source_databasename1 = row["SourceDatabaseName1"]
    table_name = row["Stepname"]
    client_id = row["InternalClientId"]
    short_name = row["DataSourceShortName"]
    historical_extract_query = row["HistoricalExtractQuery"]

    if historical_extract_query is None:
        # Without a query template there is nothing to count on either side.
        print(f"WARNING: no HistoricalExtractQuery for table {table_name} (client {client_id}); row skipped")
        continue

    # Identifying columns shared by both count results so they can be joined later.
    row_tags = [
        ("UC_CDC_Source", source_servername2),
        ("DatabaseName", source_databasename1),
        ("TableName", table_name),
        ("ClientID", client_id),
        ("SourceFacilityId", source_facility_id),
    ]
    site_id = str(source_facility_id)

    # Count against the source schema: fill the template placeholders and turn
    # the row-returning SELECT into a COUNT.
    uc_source_query = (
        historical_extract_query
        .replace("{UC_SchemaName}", source_servername2)
        .replace("{TableName}", table_name)
        .replace("{SiteId}", site_id)
        .replace("SELECT *", "SELECT COUNT(1) as CDCSource_Count")
    )
    print(uc_source_query)
    uc_count_df = _tagged_count_df(uc_source_query, row_tags)
    if uc_count_df is not None:
        uc_source_list.append(uc_count_df)

    # Same template pointed at the client database (db_<client id>) and its
    # <short name>_ods_<table> table.
    client_database_query = (
        historical_extract_query
        .replace("{UC_SchemaName}", "db_" + str(client_id))
        .replace("{TableName}", short_name + "_ods_" + table_name)
        .replace("{SiteId}", site_id)
        .replace("SELECT *", "SELECT COUNT(1) as ClientDatabase_Count")
    )
    print(client_database_query)
    client_count_df = _tagged_count_df(client_database_query, row_tags)
    if client_count_df is not None:
        client_database_list.append(client_count_df)

# Preview the first result of each side; either list is empty when every query failed.
if uc_source_list:
    display(uc_source_list[0])
else:
    print("No source count query succeeded.")

if client_database_list:
    display(client_database_list[0])
else:
    print("No client database count query succeeded.")

    


In [0]:

# Count queries that fail are skipped when the lists are built, so either list
# can be empty. reduce() without an initial value raises an opaque TypeError on
# an empty list; fail with an explicit message instead.
if not uc_source_list:
    raise ValueError("No source count results available: uc_source_list is empty.")
if not client_database_list:
    raise ValueError("No client database count results available: client_database_list is empty.")

# Stack the per-row count DataFrames (union is positional; every element was
# built with the same column order).
uc_list = reduce(lambda df1, df2: df1.union(df2), uc_source_list)
client_list = reduce(lambda df1, df2: df1.union(df2), client_database_list)

# Pair each source count with the client count produced from the same
# process-list row. UC_CDC_Source is part of the key so that rows which share
# table/client/database/facility but come from different sources do not
# cross-match.
final_detail = uc_list.join(
    client_list,
    (uc_list.UC_CDC_Source == client_list.UC_CDC_Source)
    & (uc_list.TableName == client_list.TableName)
    & (uc_list.ClientID == client_list.ClientID)
    & (uc_list.DatabaseName == client_list.DatabaseName)
    & (uc_list.SourceFacilityId == client_list.SourceFacilityId),
    'left_outer'
).select(
    uc_list.UC_CDC_Source,
    uc_list.ClientID,
    uc_list.SourceFacilityId,
    uc_list.DatabaseName,
    uc_list.TableName,
    uc_list.CDCSource_Count,
    client_list.ClientDatabase_Count,
    # NOTE(review): both query columns are named SourceQuery, so the result has
    # a duplicate column name; left as-is to keep the output schema unchanged.
    uc_list.SourceQuery,
    client_list.SourceQuery
)

final_detail = final_detail.orderBy("ClientID", "SourceFacilityId", "DatabaseName", "TableName")
display(final_detail)

# Totals across facilities. `sum` is the pyspark.sql.functions aggregate
# brought in by the star import, not the Python builtin.
uc_list_summary_df = uc_list.groupBy("UC_CDC_Source","ClientID", "TableName", "DatabaseName").agg(sum("CDCSource_Count").alias("TotalCDCSourceCount"))
display(uc_list_summary_df)
client_list_summary_df = client_list.groupBy("UC_CDC_Source", "ClientID", "TableName", "DatabaseName").agg(sum("ClientDatabase_Count").alias("TotalClientCount"))
display(client_list_summary_df)




In [0]:

# Line up the per-table totals from both sides. Both summaries are grouped by
# UC_CDC_Source, ClientID, TableName and DatabaseName, so the join uses all four
# keys; leaving UC_CDC_Source out would cross-match totals whenever one
# table/client/database is fed by more than one source.
# The left join keeps source tables that have no client-side total.
final_results_df = uc_list_summary_df.join(
    client_list_summary_df,
    (uc_list_summary_df.UC_CDC_Source == client_list_summary_df.UC_CDC_Source)
    & (uc_list_summary_df.TableName == client_list_summary_df.TableName)
    & (uc_list_summary_df.ClientID == client_list_summary_df.ClientID)
    & (uc_list_summary_df.DatabaseName == client_list_summary_df.DatabaseName),
    'left_outer'
).select(
    uc_list_summary_df.UC_CDC_Source,
    uc_list_summary_df.ClientID,
    uc_list_summary_df.DatabaseName,
    uc_list_summary_df.TableName,
    uc_list_summary_df.TotalCDCSourceCount,
    client_list_summary_df.TotalClientCount
)


display(final_results_df)



In [0]:

from pyspark.sql.functions import col, when

# Column handles reused by the expressions below.
cdc_total = col("TotalCDCSourceCount")
client_total = col("TotalClientCount")
record_diff = col("RecordDifference")

# Status rules, evaluated in order:
#   1. no client total and an empty source   -> table not created, nothing to load
#   2. non-zero difference                   -> counts disagree
#   3. difference is null but source has rows -> client total missing
#   4. anything else                         -> counts agree
status_expr = (
    when(client_total.isNull() & (cdc_total == 0), "NOT CREATED IN CLIENT DB - NO RECORDS")
    .when(record_diff != 0, "NOT EQUAL")
    .when(record_diff.isNull() & (cdc_total > 0), "NOT EQUAL")
    .otherwise("EQUAL")
)

# RecordDifference is added first because status_expr reads it.
final_results_df = (
    final_results_df
    .withColumn("RecordDifference", cdc_total - client_total)
    .withColumn("Status", status_expr)
)

display(final_results_df)
