In [0]:
-- Session defaults: unqualified table names in later SQL cells resolve to data.bronze.
USE CATALOG data;
USE SCHEMA bronze;


In [0]:
-- Bronze landing table for the A16 XML feed, filled by the Auto Loader cell in this notebook.
-- IF NOT EXISTS keeps this cell safe to re-run.
CREATE TABLE IF NOT EXISTS data.bronze.bronze_A16 (
  -- Ingestion time. Backtick-quoted because CURRENT_TIMESTAMP is also a built-in
  -- function and an ANSI-reserved keyword; unquoted it fails to parse when
  -- reserved keywords are enforced. The resulting column name is unchanged.
  `current_timestamp` TIMESTAMP,
  -- Path of the source file the row was read from.
  filename STRING,
  -- Raw file content stored as text.
  xml_string STRING
)
USING DELTA;

In [0]:
%python
from pyspark.sql.functions import current_timestamp, col

# Define the source path
source_path = "/Volumes/source/source_schema/source_volume/A16"

# Read XML files as binary/text using Auto Loader
df = (spark.readStream
  .format("cloudFiles")
  .option("cloudFiles.format", "binaryFile")  # Read files as binary
  .option("pathGlobFilter", "*.xml")  # Filter for XML files only
  .option("cloudFiles.useNotifications", "false")  # Use directory listing for volumes
  .load(source_path)
)

# Transform: extract filename and convert binary content to string
df_transformed = df.select(
  current_timestamp().alias("current_timestamp"),
  col("_metadata.file_path").alias("filename"),
  col("content").cast("string").alias("xml_string")
)

# Write to bronze table using Auto Loader streaming
query = (df_transformed.writeStream
  .format("delta")
  .outputMode("append")
  .option("checkpointLocation", "/Volumes/source/source_schema/source_volume/checkpoints/bronze_A16")
  .trigger(availableNow=True)  # Process all available files then stop
  .toTable("data.bronze.bronze_A16")
)

# Wait for the stream to finish processing
query.awaitTermination()

print("Auto Loader ingestion completed successfully!")

In [0]:
%python
# # Delete the checkpoint directory to reset Auto Loader
# # This will cause Auto Loader to reprocess all files on the next run
# def delete_checkpoint():
#     checkpoint_path = "/Volumes/source/source_schema/source_volume/checkpoints/bronze_A16"

#     try:
#         dbutils.fs.rm(checkpoint_path, recurse=True)
#         print(f"Checkpoint deleted successfully: {checkpoint_path}")
#         print("Next Auto Loader run will reprocess all files from scratch.")
#     except Exception as e:
#         print(f"Error deleting checkpoint: {e}")
# delete_checkpoint()

In [0]:
-- select count(*) from bronze_A16