# Silver - Execute silver notebooks, join tables, calculate metrics, merge into cumulative table
# Purpose: Execute and join silver tables and merge them into a cumulative table
# Source: Delta tables (bronze.bronze_property_{year}, bronze.bronze_zip_code, bronze.bronze_owners_{year})
# Output: silver.cummulative_property_owners (DELTA TABLE)

## CONFIG/PARAMETERS

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql import Window as Window

In [0]:
# Notebook parameter: the year suffix used to select the yearly bronze tables
# (bronze.bronze_owners_{year} and bronze.bronze_property_{year}).
dbutils.widgets.text("year", "")
# Surrounding whitespace would end up inside the table name, so trim it.
year = dbutils.widgets.get("year").strip()

# Fail fast: with an empty year the notebook would first overwrite the ZIP
# validation table and only then fail on a malformed bronze table name.
if not year:
    raise ValueError(
        "The 'year' widget is empty; set it to the year of the bronze tables to process."
    )

In [0]:
%sql
-- Make harris_county_catalog the current catalog so the schema-qualified names
-- used in this notebook (bronze.*, validation.*, silver.*) resolve inside it.
SET CATALOG harris_county_catalog

## READING SOURCES

### RUNNING ZIP CODES SILVER NOTEBOOK

In [0]:
%run ./zip_codes_silver

### READING, TRANSFORMING, AND VALIDATING ZIP CODES

In [0]:
# Stage the silver ZIP-code data: read the bronze table, apply the bronze->silver
# transform (presumably defined by the %run of ./zip_codes_silver above), persist
# the result to the validation schema, then run the ZIP-code tests against it.
input_path_zip_code = "bronze.bronze_zip_code"
bronze_zip_code_table = spark.read.table(input_path_zip_code)
silver_zip_pre_validate = transform_zip_codes_bronze_silver(bronze_zip_code_table)
(
    silver_zip_pre_validate.write
    .mode("overwrite")
    .saveAsTable("validation.zip_validate")
)
# The persisted table is read back as silver_zip in the READING VALIDATED DATA cell.
zip_run_uc_tests("validation.zip_validate")

In [0]:
%skip
# Debug-only preview of the silver ZIP-code frame; the %skip magic above keeps
# this cell from running.
# NOTE(review): silver_zip is not assigned in this notebook until the
# READING VALIDATED DATA cell further down, so un-skipping this cell at its
# current position would presumably fail unless a %run notebook defines it.
silver_zip.show()

### RUNNING OWNERS SILVER NOTEBOOK

In [0]:
%run ./owners_silver

### READING, TRANSFORMING, AND VALIDATING OWNERS

In [0]:
# Stage the silver owners data for the selected year: read the yearly bronze
# table, apply the bronze->silver transform (presumably defined by the %run of
# ./owners_silver above), persist it to the validation schema, then run the
# owners tests against the persisted table.
input_path_owners = f"bronze.bronze_owners_{year}"
bronze_owners_table = spark.read.table(input_path_owners)
silver_owners_pre_validate = transform_owners_bronze_silver(bronze_owners_table)
(
    silver_owners_pre_validate.write
    .mode("overwrite")
    .saveAsTable("validation.owners_validate")
)
# The persisted table is read back as silver_owners in the READING VALIDATED DATA cell.
owners_run_uc_tests("validation.owners_validate")

### RUNNING PROPERTY SILVER NOTEBOOK

In [0]:
%run ./property_silver

### READING, TRANSFORMING, AND VALIDATING PROPERTY

In [0]:
# Stage the silver property data for the selected year: read the yearly bronze
# table, apply the bronze->silver transform (presumably defined by the %run of
# ./property_silver above), persist it to the validation schema, then run the
# property tests against the persisted table.
input_path_property = f"bronze.bronze_property_{year}"
bronze_property_table = spark.read.table(input_path_property)
silver_property_pre_validate = transform_property_bronze_silver(bronze_property_table)
(
    silver_property_pre_validate.write
    .mode("overwrite")
    .saveAsTable("validation.property_validate")
)
# The persisted table is read back as silver_property in the READING VALIDATED DATA cell.
property_run_uc_tests("validation.property_validate")

### READING VALIDATED DATA

In [0]:
# Load the three staged validation tables back as the silver frames used by
# the joins and referential-integrity checks below.
silver_zip, silver_owners, silver_property = (
    spark.read.table(validated_table)
    for validated_table in (
        "validation.zip_validate",
        "validation.owners_validate",
        "validation.property_validate",
    )
)

## JOINS

### JOINING PROPERTY WITH ZIP CODE (LEFT JOIN)

This join ensures that each zip code listed in the property table is linked to the correct city (using the silver zip codes as the truth table).
Any property zip code that has no match in the truth table is replaced with "00000", which marks it as invalid; these are measured later.

In [0]:
# Attach the ZIP-code lookup to every property row (left join keeps all properties).
# The lookup key is renamed before joining so the result carries exactly one
# dim_zip_code column: withColumn() rewrites every column matching the given
# name, so joining two frames that both expose dim_zip_code leaves two
# same-named columns and makes later unqualified references to it ambiguous.
zip_lookup = silver_zip.withColumnRenamed("dim_zip_code", "zip_lookup_key")

property_zip = (
    silver_property.alias("main")
    .join(
        zip_lookup.alias("zip"),
        on=F.col("main.dim_zip_code") == F.col("zip.zip_lookup_key"),
        how="left",
    )
    # Keep the ZIP code only when it exists in the lookup; any unmatched value
    # collapses to the "00000" sentinel.
    .withColumn(
        "dim_zip_code",
        F.coalesce(F.col("zip.zip_lookup_key"), F.lit("00000")),
    )
    # The helper key has served its purpose once dim_zip_code is rewritten.
    .drop("zip_lookup_key")
)

### REFERENTIAL INTEGRITY TESTS

Count how many zip codes are in the property table that have a zip code outside harris county or have an invalid zip code ("00000") [1]

Count how many zip codes do not have a property in the selected state class [2]


In [0]:
# Referential-integrity counts between the silver property and ZIP-code frames.
# Both anti-joins compare the same key, only the direction differs.
zip_code_match = F.col("kept.dim_zip_code") == F.col("eliminator.dim_zip_code")

# query [1] as specified above: property rows whose zip code has no match in
# the ZIP-code frame
unknown_or_out_of_county_zip_codes = (
    silver_property.alias("kept")
    .join(silver_zip.alias("eliminator"), on=zip_code_match, how="left_anti")
    .count()
)

# query [2] as specified above: ZIP-code rows that no property row refers to
zip_codes_without_properties_in_designated_state_class = (
    silver_zip.alias("kept")
    .join(silver_property.alias("eliminator"), on=zip_code_match, how="left_anti")
    .count()
)

# extract the selected state classes from the property table (at most ten
# distinct values are fetched); only the debug prints below use this list
state_classes = silver_property.select("dim_state_class").distinct().take(10)
state_class_list = [row[0] for row in state_classes]

# Debug output, enable when inspecting a run:
# print(state_class_list)
# print(f"number of null/empty zipcodes and out of county zip codes: {unknown_or_out_of_county_zip_codes}")
# print(f"number of zip codes without a property in {state_class_list} : {zip_codes_without_properties_in_designated_state_class}")

5	owners_silver	silver	rows_in	1874220

### JOINING PREVIOUS TABLE WITH OWNERS TABLE

This join appends the owners' name list to each property, using the account number as the key

In [0]:
# Add the owners' name list to each property row (left join keeps properties
# without an owners record) and narrow the frame to the columns carried
# forward to the cumulative table merge.
# NOTE(review): assumes the owners frame has at most one row per
# dim_account_number; otherwise property rows would be duplicated here - confirm.
property_zip_owners = (
    property_zip.alias("main")
    .join(
        silver_owners.alias("owners"),
        on=F.col("main.dim_account_number") == F.col("owners.dim_account_number"),
        how="left",
    )
    .select(
        "main.dim_account_number",
        "dim_zip_code",
        "dim_city",
        "dim_street",
        "m_building_area",
        "m_land_area",
        "m_total_market_value",
        "dim_name_list",
        "dim_state_class",
        "dim_year_date",
    )
)

### REFERENTIAL INTEGRITY

Count how many accounts are in the silver property table but not in the owners table [1] (`accounts_not_in_owners`)
Count how many accounts are in the owners table but not in the silver property table [2] (`accounts_not_in_property`)


In [0]:
# Referential-integrity counts between the silver property and owners frames.
# Both anti-joins compare the same key, only the direction differs.
account_match = (
    F.col("kept.dim_account_number") == F.col("eliminator.dim_account_number")
)

# Property rows whose account number has no match in the owners frame.
accounts_not_in_owners = (
    silver_property.alias("kept")
    .join(silver_owners.alias("eliminator"), on=account_match, how="left_anti")
    .count()
)

# Owners rows whose account number has no match in the property frame.
accounts_not_in_property = (
    silver_owners.alias("kept")
    .join(silver_property.alias("eliminator"), on=account_match, how="left_anti")
    .count()
)


# print(accounts_not_in_owners, accounts_not_in_property)

### UPLOAD OF REFERENTIAL INTEGRITY METRICS

In [0]:
# Publish the four referential-integrity counts computed above through
# metric_table_upload (not defined in this notebook; presumably provided by
# one of the %run notebooks).
notebook_name = "joined_silver"
stage = "silver"

# Keep each metric name next to its value so the two lists cannot drift apart.
referential_integrity_metrics = {
    "unknown_or_out_of_county_zip_codes": unknown_or_out_of_county_zip_codes,
    "zip_codes_without_properties_in_designated_state_class": zip_codes_without_properties_in_designated_state_class,
    "accounts_not_in_owners": accounts_not_in_owners,
    "accounts_not_in_property": accounts_not_in_property,
}
metric_name_list = list(referential_integrity_metrics.keys())
metric_value_list = list(referential_integrity_metrics.values())

metric_table_upload(
    notebook=notebook_name,
    stage=stage,
    metric_name_list=metric_name_list,
    metric_value_list=metric_value_list,
)

## CUMULATIVE TABLE CREATION AND BACKFILL


In [0]:
%sql
-- Create the cumulative silver table on first run; later runs leave an
-- existing table untouched (IF NOT EXISTS).
-- NOTE(review): the yearly source frame also selects m_land_area, which is not
-- declared here, so that column is presumably not persisted - confirm intent.
CREATE TABLE IF NOT EXISTS silver.cummulative_property_owners(
  -- NOTE(review): the MERGE into this table matches on
  -- (dim_account_number, dim_year_date); a key on dim_account_number alone
  -- looks too narrow if the table holds several years - confirm.
  dim_account_number STRING PRIMARY KEY
  , dim_zip_code STRING
  , dim_city STRING
  , dim_street STRING
  , m_building_area FLOAT
  -- NOTE(review): FLOAT is single precision; confirm it is precise enough
  -- for the market values stored here.
  , m_total_market_value FLOAT
  , dim_name_list ARRAY<STRING>
  , dim_state_class STRING
  , dim_year_date DATE
)

In [0]:
# Expose the joined yearly frame to SQL under the name the MERGE cell below reads from.
property_zip_owners.createOrReplaceTempView("property_zip_owners_yearly")

In [0]:
%sql
-- Upsert the yearly property/owner rows into the cumulative table.
-- A row is identified by (dim_account_number, dim_year_date): re-running a year
-- refreshes its existing rows, a new year (or new account) is inserted.
MERGE INTO silver.cummulative_property_owners as parent
USING property_zip_owners_yearly as child
    ON parent.dim_account_number = child.dim_account_number
    AND parent.dim_year_date = child.dim_year_date
-- Refresh every non-key column the target declares; m_total_market_value was
-- previously left out, so a re-run kept the stale market value.
WHEN MATCHED THEN UPDATE SET
    dim_zip_code = child.dim_zip_code
    , dim_city = child.dim_city
    , dim_street = child.dim_street
    , m_building_area = child.m_building_area
    , m_total_market_value = child.m_total_market_value
    , dim_name_list = child.dim_name_list
    , dim_state_class = child.dim_state_class
-- NOTE(review): the source view also carries m_land_area, which the target DDL
-- in this notebook does not declare; confirm whether it should be persisted.
WHEN NOT MATCHED THEN INSERT *