In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import col, sha2, concat_ws

source_path = "/Volumes/bronze_dev/bronze_dev/raw_data/*.csv"
bronze_table = "bronze_dev.bronze_dev.stg_bronze_superstore"

# 1. Read CSV
# Every file matching the glob is read with its first line used as the header.
# No schema is supplied and schema inference is not enabled.
df_raw = (
    spark.read.option("header", True)
              .csv(source_path)
)

# 2. Sanitize column names (Delta requirement)
# Delta rejects column names containing any of " ,;{}()\n\t=" (unless column
# mapping is enabled), so each of those characters becomes "_" -- not only
# spaces. Names are also lower-cased.
delta_invalid_chars = str.maketrans(dict.fromkeys(" ,;{}()\n\t=", "_"))
df_raw = df_raw.toDF(
    *[c.translate(delta_invalid_chars).lower() for c in df_raw.columns]
)

# 3. Add Bronze metadata (use _metadata.file_path instead of input_file_name)
# Both columns start with "_" so that step 4 leaves them out of the hash.
df_raw = (
    df_raw
    .withColumn("_ingest_file", col("_metadata.file_path"))
    .withColumn("_ingest_ts", current_timestamp())
)

# 4. Create record hash (BUSINESS columns ONLY)
# Any column whose name starts with "_" is treated as ingestion metadata and
# is excluded, so re-ingesting the same row from another file or at another
# time produces the same hash.
business_cols = [c for c in df_raw.columns if not c.startswith("_")]

# NOTE(review): concat_ws skips NULL inputs, so two rows that differ only in
# WHICH column is NULL (e.g. ("a", NULL, "b") vs ("a", "b", NULL)) get the
# same hash and the later one would be filtered out in step 5. Deliberately
# not changed here: altering the hash input would stop matching the hashes
# already stored in the table. Coalescing NULLs to a sentinel needs a planned
# re-hash of existing rows.
df_hashed = df_raw.withColumn(
    "_record_hash",
    sha2(concat_ws("||", *[col(c).cast("string") for c in business_cols]), 256)
)

# 5. Idempotent load
# Keep only rows whose hash is not already present in the target table. On the
# first run the table does not exist yet and everything is new.
# The anti-join does not remove duplicates that occur within the incoming
# batch itself.
if spark.catalog.tableExists(bronze_table):
    df_existing = spark.table(bronze_table)

    df_new = df_hashed.join(
        df_existing.select("_record_hash"),
        on="_record_hash",
        how="left_anti"
    )
else:
    df_new = df_hashed

# 6. Write new rows only
# Count exactly once, before the write. DataFrames are lazy, so a second
# count() would re-run the whole read/hash/anti-join pipeline, and running it
# after the append risks reporting a number that differs from what was
# actually inserted.
new_row_count = df_new.count()

if new_row_count > 0:
    df_new.write.format("delta").mode("append").saveAsTable(bronze_table)
    print(f"Inserted {new_row_count} new rows.")
else:
    print("No new rows to insert.")

In [0]:
# Preview up to 10 rows of the Bronze table in the notebook output.
# Plain string literal: the query has no placeholders, so no f-prefix is needed.
preview_sql = ' select * from bronze_dev.bronze_dev.stg_bronze_superstore limit 10'
spark.sql(preview_sql).display()