In [None]:
from pyspark.sql import functions as F
from datetime import date
from databricks.sdk import WorkspaceClient

In [None]:
def add_audit_columns(batch_df, batch_id, job_id=None, full_table_name=None):
    """Add audit columns to one micro-batch and append it to the target table.

    Despite its name this function also performs the write; it is the
    ``foreachBatch`` sink used by ``process_bronze``.

    Args:
        batch_df: Micro-batch DataFrame handed over by ``foreachBatch``.
        batch_id: Micro-batch id supplied by ``foreachBatch``; not used.
        job_id: Value stored in the ``dp_job_id`` column. When omitted, the
            notebook-level ``job_id`` variable is used, which keeps the
            original two-argument call working.
        full_table_name: Table to append to. When omitted, the notebook-level
            ``full_table_name`` variable is used.

    Raises:
        ValueError: If no job id or table name is available, either as an
            argument or as a notebook-level variable.
    """
    # Explicit arguments win; fall back to notebook globals only for callers
    # that still use the two-argument (batch_df, batch_id) form.
    if job_id is None:
        job_id = globals().get("job_id")
    if full_table_name is None:
        full_table_name = globals().get("full_table_name")
    if job_id is None or not full_table_name:
        raise ValueError(
            "add_audit_columns needs a job_id and a full_table_name, either "
            "as arguments or as notebook-level variables."
        )

    audited_df = (
        batch_df.withColumn("dp_job_id", F.lit(job_id))
        .withColumn("dp_ingestion_time", F.current_timestamp())
    )
    if "dp_source_metadata" not in audited_df.columns:
        # The caller did not project the file metadata on the streaming read,
        # so try to resolve it here exactly as the original code did.
        audited_df = audited_df.selectExpr("*", "_metadata as dp_source_metadata")

    # mergeSchema has to be set on this batch write: options placed on the
    # stream writer are not forwarded to writes done inside foreachBatch.
    (
        audited_df.write.mode("append")
        .option("mergeSchema", "true")
        .saveAsTable(full_table_name)
    )


def process_bronze(checkpoint_path, file_path, full_table_name, job_id):
    """Incrementally load parquet files into a table with audit columns.

    Reads ``file_path`` as a ``cloudFiles`` stream, hands every micro-batch to
    ``add_audit_columns`` (which appends it to ``full_table_name``), and
    blocks until the stream has terminated.

    Args:
        checkpoint_path: Location used both as the ``cloudFiles`` schema
            location and as the streaming checkpoint.
        file_path: Source path containing the parquet files.
        full_table_name: Table the batches are appended to.
        job_id: Identifier written to the ``dp_job_id`` audit column.
    """
    raw_df = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "parquet")
        .option("cloudFiles.schemaHints", "airport_fee DOUBLE")
        .option("cloudFiles.schemaLocation", checkpoint_path)
        .load(file_path)
        # Reference the file metadata column on the streaming read itself so
        # that it is part of every micro-batch handed to foreachBatch.
        .selectExpr("*", "_metadata as dp_source_metadata")
    )

    def _write_batch(batch_df, batch_id):
        # Bind this call's job id and target table instead of relying on
        # notebook-level variables that happen to share the same names.
        add_audit_columns(
            batch_df,
            batch_id,
            job_id=job_id,
            full_table_name=full_table_name,
        )

    # write to table incrementally
    query = (
        raw_df.writeStream
        .foreachBatch(_write_batch)
        .option("checkpointLocation", checkpoint_path)
        .outputMode("append")
        .trigger(availableNow=True)  # process what is available now, then stop
        .start()
    )
    query.awaitTermination()

In [None]:
# Read every notebook widget (job parameter) as a name -> value mapping.
var = dbutils.widgets.getAll()

# Fail early with a readable message when a parameter this notebook relies on
# is missing, instead of hitting a NameError further down.
required_params = ("checkpoint_path", "file_path", "full_table_name", "job_id")
missing_params = [name for name in required_params if name not in var]
if missing_params:
    raise ValueError(
        f"Missing required notebook parameter(s): {', '.join(missing_params)}"
    )

# Expose each widget as a notebook-level variable of the same name
# - we could use the dict directly but this eases the cells below -
globals().update(var)

print(f"{checkpoint_path = }\n{file_path = }\n{full_table_name = }\n{job_id = }")

In [None]:
# Run the bronze load with the parameters read from the notebook widgets,
# then report which table was loaded.
process_bronze(
    checkpoint_path=checkpoint_path,
    file_path=file_path,
    full_table_name=full_table_name,
    job_id=job_id,
)
print(f"Data loaded successfully to {full_table_name}.")