In [0]:
from pyspark.sql.functions import lit, current_timestamp,col
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType,StringType


In [0]:
# Function to process and write batches of data into a Delta table

def process_batch(df, batch_id, table_name):
    # Add a batch ID column to identify the batch being processed
    df = df.withColumn("batch_id", lit(batch_id))
    # Add a timestamp column to record the data ingestion time
    df = df.withColumn("ingest_datetime", current_timestamp())
    # Write the DataFrame to a Delta table in append mode
    df.write.format("delta").mode("append").saveAsTable(table_name)

order_items_schema = StructType([
    StructField("ORDER_ID", IntegerType(), False),                  # Unique identifier for each order
    StructField("LINE_ITEM_ID", IntegerType(), False),              # Unique identifier for each line item in the order
    StructField("PRODUCT_ID", IntegerType(), False),                # Identifier for the product in the line item
    StructField("UNIT_PRICE", DoubleType(), True),                  # Unit price of the product
    StructField("QUANTITY", IntegerType(), True)                    # Quantity of the product ordered
])

# Specify table name, data source location, and checkpoint directory for "order items" data
order_items_table_name = 'stoyan.bronze_order_items'
order_items_load_location = 's3://data-engineering-upskill-final-exam/stoyan/input_data/order_items'
order_items_checkpoint_location = 's3://data-engineering-upskill-final-exam/stoyan/bronze_order_items/checkpoint'

# Define a streaming DataFrame for reading "order items" data
order_items_stream = (
spark.readStream
.format("cloudFiles")                                               # Enable Auto Loader for cloud file sources
.option("cloudFiles.format", "csv")                                 # Specify the input file format as CSV
.option("header", "true")                                           # Indicate that the first row contains column headers
.schema(order_items_schema)                                         # Apply the defined schema for order items
.load(order_items_load_location)                                    # Load data from the specified S3 location 
)


In [0]:
# Write the stream to a Delta table with batch processing

order_items_stream.writeStream \
.foreachBatch(lambda df, batch_id: process_batch(df, batch_id, order_items_table_name)) \
.option("checkpointLocation", order_items_checkpoint_location) \
.trigger(availableNow=True) \
.outputMode("append") \
.start() 

<pyspark.sql.connect.streaming.query.StreamingQuery at 0x7fcf44dc30d0>