In [0]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

# Paths
# `input_path` is the glob of raw CSV files to ingest; `table_name` is the
# target staging table (defined here for reference, not used in this cell).
input_path = "/Volumes/bronze_dev/bronze_dev/raw_data/*.csv"
table_name = "bronze_dev.bronze_dev.stg_bronze_superstore"

# 1. Read raw CSV
# Load through `input_path` instead of repeating the literal, so changing the
# variable above actually changes what is read. The frame is built in a single
# expression so re-running the cell always starts from the files on disk.
df = (
    spark.read.format("csv")
    .option("header", True)  # first line of each file holds the column names
    .load(input_path)
    # Keep the originating file path next to every row for lineage.
    .withColumn("source_file", col("_metadata.file_path"))
)

display(df)


In [0]:
from pyspark.sql.functions import col, sha2, concat_ws

# Source glob for the raw CSV files and the Delta staging table they are appended to.
input_path = "/Volumes/bronze_dev/bronze_dev/raw_data/*.csv"
table_name = "bronze_dev.bronze_dev.stg_bronze_superstore"

# 1. Read CSV and sanitize columns
# Header names are lower-cased and spaces are replaced with underscores so the
# column names are stable identifiers; the source file path is then attached.
df_raw = spark.read.format("csv").option("header", True).load(input_path)
df_raw = df_raw.toDF(*[c.replace(" ", "_").lower() for c in df_raw.columns])
df_raw = df_raw.withColumn("source_file", col("_metadata.file_path"))

# 2. Create _record_hash
# SHA-256 over every column of df_raw (this includes source_file) joined with "||".
# NOTE(review): concat_ws skips NULL inputs, so rows that differ only in *which*
# column is NULL can produce the same hash. The expression is left unchanged
# because altering it would invalidate hashes already stored in the table.
df_hashed = df_raw.withColumn("_record_hash", sha2(concat_ws("||", *df_raw.columns), 256))

# 3. Load existing table if exists
if spark.catalog.tableExists(table_name):
    df_existing = spark.table(table_name)

    # Sanitize columns to match
    df_existing = df_existing.toDF(*[c.replace(" ", "_").lower() for c in df_existing.columns])

    # Filter out rows that already exist (anti-join keeps only unseen hashes)
    df_new = df_hashed.join(df_existing.select("_record_hash"), on="_record_hash", how="leftanti")
else:
    df_new = df_hashed  # first-time load

# 4. Write new rows to Delta
# Count exactly once, before writing. df_new is lazy: calling count() again
# after the append would re-read the CSVs and re-run the anti-join against a
# table that may already contain the rows just written, so the logged number
# could be wrong (and the extra job is wasted work).
new_row_count = df_new.count()
if new_row_count > 0:
    df_new.write.format("delta").mode("append").saveAsTable(table_name)
    print(f"Inserted {new_row_count} new rows.")
else:
    print("No new rows to insert.")


In [0]:
# Show the hashed frame (sanitized columns plus `_record_hash`) for inspection.
display(df_hashed)