# Bronze - Owners Ingestion
# Purpose: Ingest raw owners data per year into Bronze layer
# Source: CSV files
# Output: bronze.bronze_owners_{year} (Delta table)


## CONFIG/PARAMETERS

In [0]:
from pyspark.sql import SparkSession 
from pyspark.sql import functions as F
from pyspark.sql import types as T


In [0]:
%sql
-- Make harris_county_catalog the current catalog so that the schema-qualified
-- table names used later in this notebook (bronze.<table>) resolve inside it.
USE CATALOG harris_county_catalog

In [0]:
# Notebook parameter: the year whose owners file should be ingested.
# The widget defaults to an empty string, so a value must be supplied by the caller.
dbutils.widgets.text("year", "")
year = dbutils.widgets.get("year").strip()

# `year` is interpolated into the source file path and into SQL table names
# further down, so reject anything that is not exactly four ASCII digits
# before it can produce a malformed path or identifier.
if not (len(year) == 4 and all(ch in "0123456789" for ch in year)):
    raise ValueError(f"Widget 'year' must be a four-digit year, got {year!r}")

print(f"Processing year: {year}")

Processing year: 2025


## READ SOURCE

In [0]:
# Raw owners file for the requested year (read later as tab-delimited CSV).
input_path = f"/Volumes/harris_county_catalog/raw_data/owners/raw_owners_{year}.txt"

# Fail fast when the source file cannot be listed. Merely printing a message
# would let the notebook continue, create an empty Bronze table for this year
# and only fail later inside COPY INTO with a less obvious error.
try:
    dbutils.fs.ls(input_path)
except Exception as exc:
    # Chain the original error so permission or connectivity problems stay visible.
    raise FileNotFoundError(
        f"Source file not found or not readable: {input_path}"
    ) from exc

print("path exists")


path exists


## WRITE DELTA TABLE

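We enforce the schema by creating the target table with an explicit column list before loading; every column is kept as STRING in the Bronze layer.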


In [0]:
# Target Bronze table for this year; created only if it does not exist yet.
# All five columns are declared as STRING.
owners_table = f"bronze.bronze_owners_{year}"

create_table_sql = f"""
  CREATE TABLE IF NOT EXISTS {owners_table}(
      acct STRING
      , ln_num STRING
      , `name` STRING
      , aka STRING
      , pct_own STRING
  )
"""

spark.sql(create_table_sql)

DataFrame[]

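We load with COPY INTO because ingesting the same source file more than once is a concern: COPY INTO keeps track of the files it has already loaded and skips them on re-runs.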


In [0]:
# Load the year's source file into its Bronze table.
# The file is read as CSV with a header row and a tab delimiter; mergeSchema is
# disabled so the load does not alter the schema declared for the table.
target_table = f"bronze.bronze_owners_{year}"

copy_into_sql = f"""
    COPY INTO {target_table}
    FROM '{input_path}'
    FILEFORMAT = CSV
    FORMAT_OPTIONS ('header' = 'true', 'delimiter' = '\t')
    COPY_OPTIONS('mergeSchema' = 'false')
"""

spark.sql(copy_into_sql)

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint, num_skipped_corrupt_files: int]