# Bronze - Property Ingestion
# Purpose: Ingest raw property data per year into Bronze layer
# Source: Tab-delimited text files (raw_property_{year}.txt), read with the CSV reader
# Output: bronze.bronze_property_{year} (Delta table)

## CONFIG/PARAMETERS

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [0]:
%sql
-- Make harris_county_catalog the current catalog so the two-part names used in
-- later cells (bronze.bronze_property_<year>) resolve inside it.
USE CATALOG harris_county_catalog

In [0]:
# Expose "year" as a notebook widget so the same notebook can be run per year.
dbutils.widgets.text("year", "")
year = dbutils.widgets.get("year").strip()

# `year` is interpolated into a volume path and into SQL table identifiers in
# later cells, so reject anything that is not exactly four ASCII digits
# (including the empty widget default) before it reaches the filesystem or SQL.
if not (len(year) == 4 and year.isascii() and year.isdigit()):
    raise ValueError(f"Widget 'year' must be a four-digit year, got {year!r}")

print(f"Processing year: {year}")

## READ SOURCE

In [0]:
# Location of the raw extract for the requested year.
input_path = f"/Volumes/harris_county_catalog/raw_data/property/raw_property_{year}.txt"

# Fail fast when the source cannot be listed. Merely printing a message would
# let the following cells run, creating an empty bronze table before the load
# itself fails. The original error is chained so that causes other than a
# missing file (e.g. permissions) remain visible in the traceback.
try:
    dbutils.fs.ls(input_path)
except Exception as exc:
    raise FileNotFoundError(
        f"Source file not found or not accessible: {input_path}"
    ) from exc

print("Path exists")

## WRITE DELTA TABLE

Load with `COPY INTO`: re-ingesting the same source file is a concern, and `COPY INTO` skips files it has already loaded into the target table.


In [0]:
# DDL for the per-year bronze table. Every column is declared STRING, so the
# raw file's values are stored as text without any type conversion.
# IF NOT EXISTS makes re-running this cell for the same year a no-op.
create_bronze_table_sql = f"""
    CREATE TABLE IF NOT EXISTS bronze.bronze_property_{year} (
        acct STRING
        , yr STRING
        , mailto STRING
        , mail_addr_1 STRING
        , mail_addr_2 STRING
        , mail_city STRING
        , mail_state STRING
        , mail_zip STRING
        , mail_country STRING
        , undeliverable STRING
        , str_pfx STRING
        , str_num STRING
        , str_num_sfx STRING
        , str STRING
        , str_sfx STRING
        , str_sfx_dir STRING
        , str_unit STRING
        , site_addr_1 STRING
        , site_addr_2 STRING
        , site_addr_3 STRING
        , state_class STRING
        , school_dist STRING
        , map_facet STRING
        , key_map STRING
        , Neighborhood_Code STRING
        , Neighborhood_Grp STRING
        , Market_Area_1 STRING
        , Market_Area_1_Dscr STRING
        , Market_Area_2 STRING
        , Market_Area_2_Dscr STRING
        , econ_area STRING
        , econ_bld_class STRING
        , center_code STRING
        , yr_impr STRING
        , yr_annexed STRING
        , splt_dt STRING
        , dsc_cd STRING
        , nxt_bld STRING
        , bld_ar STRING
        , land_ar STRING
        , acreage STRING
        , Cap_acct STRING
        , shared_cad STRING
        , land_val STRING
        , bld_val STRING
        , x_features_val STRING
        , ag_val STRING
        , assessed_val STRING
        , tot_appr_val STRING
        , tot_mkt_val STRING
        , prior_land_val STRING
        , prior_bld_val STRING
        , prior_x_features_val STRING
        , prior_ag_val STRING
        , prior_tot_appr_val STRING
        , prior_tot_mkt_val STRING
        , new_construction_val STRING
        , tot_rcn_val STRING
        , value_status STRING
        , noticed STRING
        , notice_dt STRING
        , protested STRING
        , certified_date STRING
        , rev_dt STRING
        , rev_by STRING
        , new_own_dt STRING
        , lgl_1 STRING
        , lgl_2 STRING
        , lgl_3 STRING
        , lgl_4 STRING 
        , jurs STRING

    )
"""

spark.sql(create_bronze_table_sql)

In [0]:
# Load the year's raw file into its bronze table.
# The file is read with the CSV reader using a header row and a tab delimiter.
# NOTE: this is a non-raw f-string, so Python turns `\t` into a literal tab
# character before the statement text is handed to Spark.
copy_into_sql = f"""
    COPY INTO bronze.bronze_property_{year}
    FROM '{input_path}'
    FILEFORMAT = CSV
    FORMAT_OPTIONS ('header' = 'true', 'delimiter' = '\t')
    COPY_OPTIONS ('mergeSchema' = 'false')
"""

spark.sql(copy_into_sql)

In [0]:
%skip
# Preview the first 10 rows of the table for the year being processed
# (previously hard-coded to 2025), one field per line and untruncated.
spark.read.table(f"harris_county_catalog.bronze.bronze_property_{year}").show(10, truncate=False, vertical=True)

In [0]:
%skip
# Print the schema of the table for the year being processed
# (previously hard-coded to 2025).
spark.read.table(f"harris_county_catalog.bronze.bronze_property_{year}").printSchema()