# Silver - Schema enforcement, data cleaning and validation
# Purpose: Establish functions that clean and validate the bronze zip code table
# Source: bronze zip code table (Unity Catalog)
# Output: transform_zip_codes_bronze_silver(data_frame), zip_run_uc_tests("unity_catalog path") [functions]

## CONFIG/PARAMETERS


In [0]:
import re 
from pyspark.sql import types as T
from pyspark.sql import functions as F
import time

In [0]:
# Run identifiers: notebook_name is passed to every logging/metric helper call
# below; stage is passed to log_pipeline_runs and metric_table_upload.
notebook_name = "zip_codes_silver"
stage = "silver"
# Table names recorded with each log_pipeline_runs entry.
input_table = "bronze_zip_code"
output_table = "validation_silver_zip_code"
# Table name recorded with each log_data_quality_tests entry.
test_table = output_table

Imports logging and metric functions


In [0]:
%run ../log_notebook

## TRANSFORMATION

Checks assumptions on the ingested table, renames and casts the columns, and drops rows whose zip code is not exactly five digits

In [0]:
# could pass the logging and the time to a decorator, also the metric outputs
def transform_zip_codes_bronze_silver(data_frame):
    """Clean the bronze zip code DataFrame into its silver shape.

    Verifies that the required columns exist and that at least one row has
    both a zip code and a city. It then trims, lower-cases, casts to string
    and renames the three columns, and keeps only rows whose zip code is
    exactly five digits. Row counts and the runtime are handed to
    ``metric_table_upload``; the run is recorded through ``log_pipeline_runs``
    with status START, then SUCCESS or FAILURE.

    Args:
        data_frame: Spark DataFrame holding the bronze zip code table. Must
            contain the columns ``ZIP_Code``, ``City`` and
            ``ClassificationClass``.

    Returns:
        DataFrame with the columns ``dim_zip_code``,
        ``dim_classification_class`` and ``dim_city``, restricted to rows with
        a five-digit zip code.

    Raises:
        ValueError: If a required column is missing, or if no row contains
            both a zip code and a city.
        Exception: Any other error raised inside the run is logged with
            status FAILURE and re-raised unchanged.
    """
    log_pipeline_runs(
        notebook=notebook_name,
        stage=stage,
        input_table=input_table,
        output_table=output_table,
        status="START",
    )
    start_time = time.monotonic()

    try:
        print("start transformation zip from bronze to silver")

        print("start checking assumptions")
        ## ASSUMPTION CHECKING

        # zip code, city, and classification class COLUMNS MUST EXIST
        necessary_columns = ["ZIP_Code", "City", "ClassificationClass"]
        # Read the column list once; the membership test stays case-sensitive.
        available_columns = set(data_frame.columns)
        missing_columns = [
            column for column in necessary_columns if column not in available_columns
        ]

        if missing_columns:
            missing_string = ", ".join(missing_columns)
            print(f"ERROR: assumptions failed - missing columns {missing_string}")
            raise ValueError(f"{missing_string} columns missing")

        # check that at least one row has BOTH a zip code and a city
        rows_with_zip_and_city = data_frame.where(
            F.col("ZIP_Code").isNotNull() & F.col("City").isNotNull()
        )
        # A single matching row is enough to pass, so cap the count at one row
        # instead of counting every match.
        if rows_with_zip_and_city.limit(1).count() == 0:
            print("ERROR: assumption failed - data frame has no rows containing both zip and city")
            raise ValueError("No rows contain both zip and city")

        print("end checking assumptions")
        print("start transformations")

        # CALCULATE amount of ROWS in bronze table
        bronze_rows = data_frame.count()
        print(f"amount of rows in bronze table: {bronze_rows}")

        ## TRANSFORMATION

        # RENAME the columns and CAST values in COLUMNS
        zip_code_renamed = (
            data_frame
            .select(
                F.lower(F.trim("ZIP_Code")).cast("string").alias("dim_zip_code"),
                F.lower(F.trim("ClassificationClass")).cast("string").alias("dim_classification_class"),
                F.lower(F.trim("City")).cast("string").alias("dim_city"),
            )
            # whole-value replacement, applied to every selected column
            .replace({"p.o. box": "p.o.box"})
        )

        # keep only rows whose ZIP CODE is exactly five digits
        zip_code_valid = zip_code_renamed.where(F.col("dim_zip_code").rlike("^[0-9]{5}$"))
        print("end tranfsormations")

        # CALCULATE amount of ROWS in post TRANSFORMATION table
        silver_rows = zip_code_valid.count()
        print(f"amount of rows after dropping invalid zip: {silver_rows}")

        ## METRIC CALCULATIONS

        # DURATION of TRANSFORMATION (includes the row counts above)
        end_transformation = time.monotonic()
        run_time = end_transformation - start_time

        ## METRIC UPLOADS

        # metric aggregation: names and values are matched by position
        metric_name_list = ["rows_in", "rows_out", "transformation_and_metric_runtime"]
        metric_value_list = [bronze_rows, silver_rows, run_time]
        metric_table_upload(
            notebook=notebook_name,
            stage=stage,
            metric_name_list=metric_name_list,
            metric_value_list=metric_value_list,
        )

        log_pipeline_runs(
            notebook=notebook_name,
            stage=stage,
            input_table=input_table,
            output_table=output_table,
            status="SUCCESS",
            run_time=run_time,
        )

        return zip_code_valid

    except Exception as e:
        ## FAILURE

        end_time = time.monotonic()
        run_time = end_time - start_time

        log_pipeline_runs(
            notebook=notebook_name,
            stage=stage,
            input_table=input_table,
            output_table=output_table,
            status="FAILURE",
            error_message=str(e),
            run_time=run_time,
        )

        # re-raise so the caller still sees the original error
        raise

## VALIDATION

Ensures data quality:
- there are no null zip codes
- all zip codes are of the correct length
- there are no null cities

In [0]:

def zip_run_uc_tests(table_name):
    """Run the data quality checks against ``table_name`` and log each result.

    Every check is a query that returns a row only when the table violates
    the rule, so a non-empty result marks that check as failed. Each outcome
    is recorded through ``log_data_quality_tests``.

    Args:
        table_name: Table identifier interpolated into the FROM clause of
            every check query.

    Raises:
        RuntimeError: After all checks have run, if at least one failed. The
            message lists the names of the failed checks.
    """
    # check name -> WHERE predicate that matches offending rows
    violation_predicates = {
        "zip_code_null": "dim_zip_code IS NULL",
        "zip_code_length": "length(dim_zip_code) != 5",
        "city_null": "dim_city IS NULL",
    }

    failed_checks = []

    for check_name, predicate in violation_predicates.items():
        # LIMIT 1: one offending row is enough to fail the check
        query = f"""
            SELECT 1 
            FROM {table_name} 
            WHERE {predicate} 
            LIMIT 1
            """
        violated = spark.sql(query).count() > 0

        if violated:
            failed_checks.append(check_name)
            outcome = "FAILURE"
        else:
            print(f"test {check_name} passed")
            outcome = "SUCCESS"

        log_data_quality_tests(
            notebook=notebook_name,
            test_table=test_table,
            test_name=check_name,
            status=outcome,
        )

    # fail only after every check has been run and logged
    if failed_checks:
        raise RuntimeError(f"UC tests failed for {table_name}: {', '.join(failed_checks)}")

## Debugging

In [0]:
# input_path = f"harris_county_catalog.bronze.bronze_zip_code"
# raw_zip_code_table = spark.read.table(input_path)

In [0]:
# silver_pre_validate = transform_zip_codes_bronze_silver(raw_zip_code_table)
# silver_pre_validate.show(5)

In [0]:
# validated = zip_run_uc_tests("harris_county_catalog.validation.zip_validate")

In [0]:
# renamed_zip_code.write\
#     .mode("overwrite")\
#     .saveAsTable(f"harris_county_catalog.silver.zip_code.bronze_zip_code_{year}")