In [0]:
# 01_Ingest_Bronze 

from pyspark.sql.functions import col, current_timestamp

# 1. Define Paths
# Landing container that the Auto Loader reader below scans for incoming files.
source_path = "abfss://landing-zone@sacryptotradesdata.dfs.core.windows.net/"

# Stream bookkeeping, both kept in the bronze container: the schema location
# is handed to the reader, the checkpoint location to the writer.
schema_path = "abfss://bronze@sacryptotradesdata.dfs.core.windows.net/_schemas/sales_raw"
checkpoint_path = "abfss://bronze@sacryptotradesdata.dfs.core.windows.net/_checkpoints/sales_raw"

# 2. Read Stream (Auto Loader)
# All reader settings are collected in one mapping so they can be read at a
# glance and passed to the stream reader in a single call.
autoloader_options = {
    "cloudFiles.format": "csv",                # landing files are CSV
    "cloudFiles.inferColumnTypes": "true",     # request typed columns from schema inference
    "cloudFiles.schemaLocation": schema_path,  # where the tracked schema is kept
    "header": "true",                          # first line of each file holds column names
    "recursiveFileLookup": "true",
}

bronze_reader = spark.readStream.format("cloudFiles").options(**autoloader_options)
df_raw = bronze_reader.load(source_path)

# 3. Add Audit Columns
# ingestion_time: value of current_timestamp() when the row is processed.
# source_file:    path of the originating file, taken from the
#                 `_metadata.file_path` column (this replaced an earlier
#                 input_file_name() call).
ingested_at = current_timestamp()
origin_file = col("_metadata.file_path")

df_enriched = (
    df_raw
    .withColumn("ingestion_time", ingested_at)
    .withColumn("source_file", origin_file)
)

# 4. Write to Bronze Table
# The availableNow trigger processes the files that are currently pending and
# then stops, but toTable() only *starts* the query and returns its handle
# right away. Keep the handle and block on it so that:
#   * later cells (e.g. the row count) run against a finished load instead of
#     racing the stream under "Run All";
#   * a failure inside the stream is raised in this cell rather than being
#     silently left on a background query.
print("Starting Stream... please wait for 'Stream Initialized'...")
bronze_query = (df_enriched.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .toTable("crypto_cat.bronze.sales_raw")
)

# Blocks until the availableNow run has finished; re-raises the stream's
# exception if the query terminated with an error.
bronze_query.awaitTermination()

Starting Stream... please wait for 'Stream Initialized'...


<pyspark.sql.connect.streaming.query.StreamingQuery at 0x7f09e21e66c0>

In [0]:
%sql
-- Row count of the bronze table. The number reflects whatever the ingest
-- stream has committed at the moment this cell runs.
SELECT count(*) FROM crypto_cat.bronze.sales_raw;

count(*)
76007534


In [0]:
%sql
-- List the bronze table's columns with their data types and comments.
DESCRIBE TABLE crypto_cat.bronze.sales_raw;

col_name,data_type,comment
timestamp,timestamp,
exchange,string,
qty,double,
quoteQty,double,
deal,string,
ccy,string,
quoteCcy,string,
_rescued_data,string,
ingestion_time,timestamp,
source_file,string,
