In [0]:
# Standard-library import. `dbutils` and `spark` are injected by the Databricks notebook runtime and need no import.
import uuid

# --- 1. Define all project paths (Unity Catalog Volume layout) ---
# NOTE: Replace with your actual Volume details
UC_ROOT = "/Volumes/ecommerce_audit/audit_schema/audit_volume/APAF_Capstone_Project"

# Bronze layer - raw JSON files land here
dbfs_raw_path = f"{UC_ROOT}/0_bronze_raw/price_events"

# Bronze layer - three-level Unity Catalog table name (catalog.schema.table).
# Despite the variable name this is a table identifier, not a filesystem path.
dbfs_bronze_delta_path = "ecommerce_audit.audit_schema.bronze_price_requests"

# Checkpoint location for the Bronze stream (must live in a writable location,
# here under the Volume).
# The path is deliberately STABLE across runs: the checkpoint is where the
# stream records which input files it has already processed. A fresh random
# path per run would make every run start from scratch, re-read every file in
# dbfs_raw_path and append duplicate rows to the Bronze table.
# To intentionally reprocess everything, delete this directory (and the Bronze
# table) instead of changing the path.
dbfs_checkpoint_bronze = f"{UC_ROOT}/_checkpoints/bronze/price_events"

# --- 2. Create directory structure inside the Volume ---
# mkdirs is safe to call repeatedly; existing directories are left untouched.
dbutils.fs.mkdirs(dbfs_raw_path)
dbutils.fs.mkdirs(dbfs_checkpoint_bronze)

print(f"Project structure created under: {UC_ROOT}")
print(f"Bronze Delta Table Name (Unity Catalog): {dbfs_bronze_delta_path}")

### PySpark Ingestion with Auto Loader (Bronze Stream)
#### This code starts an Auto Loader stream over the raw path created above (dbfs_raw_path), ingests the JSON files that have landed there into the Bronze Delta table, and stops once the available files have been processed. Re-run it to pick up newly arrived files.



# Stop any still-running instance of the Bronze ingestion query so that the
# query name is free before the stream is started again further down.
for active_query in spark.streams.active:
    if active_query.name != "Bronze_Ingestion_Stream":
        continue
    active_query.stop()
    print(f"🛑 Stopped the failed stream: {active_query.name}")
from pyspark.sql.types import *
from pyspark.sql.functions import col, current_timestamp
# REMOVED: from pyspark.sql.streaming import Trigger # Abandoning this import

# --- Paths (Assume these variables are defined) ---
# UC_ROOT, dbfs_raw_path, dbfs_bronze_delta_path, dbfs_checkpoint_bronze 

# --- 1. Configure the Auto Loader streaming read ---
# Explicit schema for the incoming price-event JSON records. Every field is
# declared nullable except request_id.
raw_schema = (
    StructType()
    .add("event_time", StringType(), True)
    .add("event_type", StringType(), True)
    .add("user_id", StringType(), True)
    .add("product_id", StringType(), True)
    .add("request_price", DoubleType(), True)
    .add("device_type", StringType(), True)
    .add("geo_cluster", StringType(), True)
    .add("request_id", StringType(), False)
)

# Streaming source: Auto Loader ("cloudFiles") reading JSON from the raw path
# with the schema above.
_bronze_source = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", f"{UC_ROOT}/_checkpoints/bronze_schema")
    .schema(raw_schema)
    .load(dbfs_raw_path)
)

# Tag every record with the time it was ingested.
raw_df = _bronze_source.withColumn("ingestion_time", current_timestamp())

# --- 2. Bronze Delta Lake write (incremental batch via availableNow trigger) ---
# Writes the Auto Loader stream into the Unity Catalog Delta table named by
# dbfs_bronze_delta_path, tracking progress in dbfs_checkpoint_bronze.
#
# trigger(availableNow=True) processes everything that is available when the
# query starts and then stops on its own. It replaces the deprecated
# trigger(once=True): the semantics are the same "drain and stop", but the
# backlog may be split across several micro-batches (so rate-limit options are
# honoured) instead of being forced into one potentially huge batch.
bronze_query = (
    raw_df.writeStream
    .format("delta")
    .option("checkpointLocation", dbfs_checkpoint_bronze)
    .trigger(availableNow=True)
    .queryName("Bronze_Ingestion_Stream")
    .toTable(dbfs_bronze_delta_path)
)

# Block until the query has drained the available input; if the stream fails,
# the error is raised here rather than being lost in the background.
bronze_query.awaitTermination()

print(f"✅ Bronze Stream: '{bronze_query.name}' finished processing all available files.")

###Verification (Bronze Layer Complete)
###Run the following cell to confirm that data has been ingested into your Bronze table.

# --- Bronze verification: row count plus a small sample ---
# Table identifier (must match your Unity Catalog setup).
dbfs_bronze_delta_path = "ecommerce_audit.audit_schema.bronze_price_requests"

# Batch (non-streaming) read of the Bronze table, used only for inspection.
bronze_table = spark.read.table(dbfs_bronze_delta_path)
count_result = bronze_table.count()
print(f"✅ Success! Total records ingested in Bronze: {count_result}")

if count_result <= 0:
    # Nothing was ingested - most likely no JSON file was present in the raw path.
    print("Data count is zero. Ensure you uploaded a JSON file before running the ingestion step.")
else:
    print("Sample Bronze Data (Raw Price Events):")
    sample_columns = ["product_id", "geo_cluster", "request_price", "ingestion_time"]
    bronze_table.select(*sample_columns).limit(10).display()

Project structure created under: /Volumes/ecommerce_audit/audit_schema/audit_volume/APAF_Capstone_Project
Bronze Delta Table Name (Unity Catalog): ecommerce_audit.audit_schema.bronze_price_requests
✅ Bronze Stream: 'Bronze_Ingestion_Stream' finished processing all available files.
✅ Success! Total records ingested in Bronze: 5000
Sample Bronze Data (Raw Price Events):


product_id,geo_cluster,request_price,ingestion_time
PROD_0005,Standard_Region_C,120.59,2025-10-20T06:54:06.990Z
PROD_0005,Standard_Region_C,129.5,2025-10-20T06:54:06.990Z
PROD_0017,VIP_City_A,175.93,2025-10-20T06:54:06.990Z
PROD_0016,Standard_Region_B,340.19,2025-10-20T06:54:06.990Z
PROD_0016,Standard_Region_B,388.12,2025-10-20T06:54:06.990Z
PROD_0012,VIP_City_A,194.26,2025-10-20T06:54:06.990Z
PROD_0015,Standard_Region_B,423.91,2025-10-20T06:54:06.990Z
PROD_0011,Standard_Region_C,148.62,2025-10-20T06:54:06.990Z
PROD_0017,Standard_Region_C,145.92,2025-10-20T06:54:06.990Z
PROD_0016,VIP_City_A,241.75,2025-10-20T06:54:06.990Z


Files in raw path (/Volumes/ecommerce_audit/audit_schema/audit_volume/APAF_Capstone_Project/0_bronze_raw/price_events): []
No files found in raw path. Upload JSON files to start ingestion.
