# Silver - Schema enforcement, data cleaning and validation
# Purpose: Establish functions that clean bronze owners table
# Source: unity catalog table path
# Output: functions
## - transform_owners_bronze_silver("unity_catalog path")
## - owners_run_uc_tests("unity_catalog path") 

## CONFIG/PARAMETERS

In [0]:
from pyspark.sql import types as T
from pyspark.sql import functions as F
import time 

In [0]:
# Declare a free-text notebook widget named "year" (default: empty string)
# and read its current value; `year` is used below to build the output table name.
dbutils.widgets.text("year", "")
year = dbutils.widgets.get("year")

In [0]:
# Identifiers passed to the logging / metric helpers called by the functions below.
notebook_name = "owners_silver"
stage = "silver"
input_table = "bronze_owners"
# The output table name is suffixed with the value of the "year" widget.
# NOTE(review): if the widget is left at its empty default the name ends in a bare "_".
output_table = f"validation_silver_owners_{year}"
# Data-quality test results are logged against the same table name as the output.
test_table = output_table

Imports logging and metric functions

In [0]:
%run ../log_notebook

## TRANSFORMATIONS


In [0]:
def transform_owners_bronze_silver(data_frame):
    """Transform the bronze owners DataFrame into the validated silver shape.

    Checks the input assumptions, normalises account numbers and names,
    groups owners per account, keeps only accounts whose summed ownership
    share lies in [0.9, 1.1], then uploads run metrics and logs the run.

    Args:
        data_frame: Spark DataFrame holding the bronze owners data. Must
            contain the columns ``acct`` and ``name``; ``pct_own`` is also
            read but is not part of the up-front column check.

    Returns:
        A DataFrame with one row per ``dim_account_number`` and the set of
        owner names for that account in ``dim_name_list``.

    Raises:
        ValueError: If ``acct`` or ``name`` is missing, or if no row has
            both of them non-null.
        Exception: Any other error is logged with status "FAILURE" and
            re-raised unchanged.
    """
    # log_pipeline_runs / metric_table_upload are not defined in this notebook;
    # presumably they come from the %run'd log_notebook - verify there.
    log_pipeline_runs(
        notebook=notebook_name, stage=stage, input_table=input_table, output_table=output_table, status="START"
        )
    start_time = time.monotonic()

    try:
        print("start transformation owners from bronze to silver")

        print("start checking assumptions")

        ## ASSUMPTION CHECKING

        # NECESSARY COLUMNS acct and name
        necessary_columns = ["acct", "name"]
        available_columns = set(data_frame.columns)
        missing_columns = [
            column for column in necessary_columns if column not in available_columns
        ]

        if missing_columns:
            missing_string = ", ".join(missing_columns)
            print(f"ERROR: assumption failed - missing columns {missing_string}")
            raise ValueError(f"{missing_string} columns missing")

        # At least one row must have both acct and name populated;
        # limit(1) lets Spark stop as soon as a single matching row is found.
        has_usable_row = (
            data_frame
            .where((F.col("acct").isNotNull()) & (F.col("name").isNotNull()))
            .limit(1)
            .count()
            > 0
        )
        if not has_usable_row:
            print("ERROR: assumption failed - no rows where accounts and name are both not null")
            raise ValueError("no rows where account and name are both not null")

        print("end checking assumptions")
        print("start transformations")

        # CALCULATION amount of ROWS in bronze table
        bronze_rows = data_frame.count()
        print(f"amount of rows in bronze table: {bronze_rows}")

        ## TRANSFORMATIONS

        # RENAME, TRIM, CAST rows
        # try_cast yields NULL instead of raising when pct_own is not a valid decimal.
        owners_renamed = data_frame.select(
            F.lower(F.trim("acct")).cast("string").alias("dim_account_number")
            , F.lower(F.trim("name")).cast("string").alias("dim_name")
            , F.expr("try_cast(pct_own AS decimal(10,2))").alias("m_pct_own")
        )

        # SELECT NON NULL in dim_account
        owners_null_account_removed = owners_renamed.where(F.col("dim_account_number").isNotNull())

        # AGGREGATE names into LIST (collect_set de-duplicates names per account)
        owners_grouped = owners_null_account_removed\
            .groupBy("dim_account_number")\
            .agg(
                F.collect_set("dim_name").alias("dim_name_list")
                , F.round(F.sum("m_pct_own"), 2).cast("decimal(10,2)").alias("m_total_pct")
            )

        # SELECT VALID m_total_pct (between 1.1 and 0.9 ownership to account for rounding)
        owners_valid = owners_grouped\
            .where((F.col("m_total_pct") <= 1.1) & (F.col("m_total_pct") >= 0.9))\
            .select(
                "dim_account_number"
                , "dim_name_list"
            )

        print("end tranfsormations")

        ## METRIC CALCULATIONS
        # DURATION of TRANSFORMATION
        end_time = time.monotonic()

        # Elapsed seconds. The previous chained assignment
        # (`run_time = end_time = start_time`) reported the raw clock reading
        # instead of a duration.
        run_time = end_time - start_time

        # CALCULATION of ROWS post TRANSFORMATION
        silver_rows = owners_valid.count()
        print(f"amount of rows after dropping null accounts, and total ownership between 0.9 and 1.1: {silver_rows}")

        # EMPTY LIST dim_name_list
        empty_name_list_count = owners_valid\
            .where(
                F.size("dim_name_list") == 0
            ).count()

        ## METRIC AGGREGATION
        # The two lists are positional: metric_value_list[i] belongs to metric_name_list[i].
        metric_name_list = ["rows_in", "rows_out", "empty_name_list", "run_time"]
        metric_value_list = [
            bronze_rows
            , silver_rows
            , empty_name_list_count
            , run_time
            ]

        metric_table_upload(notebook=notebook_name, stage=stage, metric_name_list=metric_name_list, metric_value_list=metric_value_list)

        log_pipeline_runs(
            notebook=notebook_name, stage=stage, input_table=input_table, output_table=output_table, status="SUCCESS", run_time=run_time
            )

        return owners_valid

    except Exception as e:
        ## FAILURE
        # Record how long the run lasted before failing, then re-raise so the
        # caller still sees the original exception.
        end_time = time.monotonic()
        run_time = end_time - start_time

        log_pipeline_runs(
            notebook=notebook_name, stage=stage, input_table=input_table, output_table=output_table, status="FAILURE", error_message=str(e), run_time=run_time
            )
        raise

# transform_owners_bronze_silver(raw_owners_table).where(F.col("dim_account_number") == "1135550000026").show()

## VALIDATION

Ensures data quality
- at least 1,000,000 rows where dim_account_number is not null and dim_name_list is not empty
- no duplicate dim_account_number
- no null names

In [0]:
# we want at least 1,000,000 data points
# we do not want any duplicates at this point
def owners_run_uc_tests(table_name: str) -> None:
    """Run the data-quality checks against a silver owners table.

    Every check is a SQL query written so that returning at least one row
    means the check failed. Each result is logged through
    ``log_data_quality_tests`` (defined outside this notebook) with status
    "SUCCESS" or "FAILURE".

    Args:
        table_name: Name of the table to test; it is interpolated directly
            into the SQL text, so it must be a trusted identifier.

    Raises:
        RuntimeError: If one or more checks fail; the message lists the
            names of all failed checks.
    """
    failures = []
    ## TESTS DEFINITIONS (shorts descriptions above)
    # tests made so if a data frame has more than 0 rows, then there is a data quality issue
    # The first test relies on HAVING: the single aggregate row is only
    # returned when the valid-row count is below the minimum.
    # NOTE: the key spelling "nimimum" is kept because it is the test name written to the logs.
    tests = {
        "row_count_nimimum_failed": f"""
            SELECT 
                COUNT(*) as valid_rows 
            FROM {table_name} 
            WHERE dim_account_number IS NOT NULL 
                AND ARRAY_SIZE(dim_name_list) > 0
            HAVING valid_rows < 1000000
            """
        , "duplicate_dim_account_number" : f"""
        SELECT dim_account_number as duplicate 
        FROM {table_name} 
        GROUP BY dim_account_number 
        HAVING COUNT(*) > 1 
        LIMIT 1
        """
        # Previously this test filtered on dim_account_number IS NULL, so the
        # name list was never inspected. An account whose names are all null
        # ends up with an empty list, hence the ARRAY_SIZE check.
        , "null_dim_name_list": f"""
        SELECT dim_account_number 
        FROM {table_name} 
        WHERE dim_name_list IS NULL
            OR ARRAY_SIZE(dim_name_list) = 0
        LIMIT 1
        """
        # The original null-account predicate, kept under an accurate name.
        , "null_dim_account_number": f"""
        SELECT dim_account_number 
        FROM {table_name} 
        WHERE dim_account_number IS NULL
        LIMIT 1
        """
        # maybe change limit 1 for logs
    }

    ## RUN TESTS
    for name, sql_test in tests.items():
        if spark.sql(sql_test).count() > 0:
            failures.append(name)
            log_data_quality_tests(notebook=notebook_name, test_table=test_table, test_name=name, status="FAILURE")
        else:
            print(f"test {name} passed")
            log_data_quality_tests(notebook=notebook_name, test_table=test_table, test_name=name, status="SUCCESS")

    # Raise only after every test has run, so all failures are logged and reported together.
    if failures:
        raise RuntimeError(
            f"UC tests failed for {table_name}: {', '.join(failures)}"
        )

## DEBUGGING

In [0]:
%skip
%sql
USE CATALOG harris_county_catalog

In [0]:
%skip
# Debug helper: load a bronze owners table for ad-hoc runs of the transform.
# NOTE(review): the f-string interpolates the literal 2025, not the `year` widget value.
input_path = f"bronze.bronze_owners_{2025}"#should not have table in name
raw_owners_table = spark.read.table(input_path)

In [0]:
%skip
# Debug helper: load the validation table into `df` for manual inspection.
df = spark.read.table("validation.owners_validate")

In [0]:
%skip
# Debug helper: run the data-quality tests against the validation table;
# raises RuntimeError if any test fails.
owners_run_uc_tests('validation.owners_validate')

In [0]:
# transform_owners_bronze_silver(raw_owners_table)