In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType,StringType,DateType
from pyspark.sql.functions import lit, current_timestamp,col


In [0]:
# spark.sql('drop table stoyan.bronze_orders')

DataFrame[]

In [0]:
# Function to process and write batches of data into a Delta table

def process_batch(df, batch_id, table_name):
    # Add a batch ID column to identify the batch being processed
    df = df.withColumn("batch_id", lit(batch_id))
    # Add a timestamp column to record the data ingestion time
    df = df.withColumn("ingest_datetime", current_timestamp())
    # Write the DataFrame to a Delta table in append mode
    df.write.format("delta").mode("append").saveAsTable(table_name)

# Define the schema for the "orders" data

orders_schema = StructType([
    StructField("ORDER_ID", IntegerType(), False),                      # Unique identifier for each order
    StructField("ORDER_DATE", StringType(), True),                      # Date of the order
    StructField("ORDER_MODE", StringType(), True),                      # Mode of the order
    StructField("CUSTOMER_ID", IntegerType(), True),                    # Identifier for the customer placing the order
    StructField("ORDER_STATUS", IntegerType(), True),                   # Status of the order
    StructField("ORDER_TOTAL", DoubleType(), True),                     # Total value of the order
    StructField("SALES_REP_ID", IntegerType(), True),                   # Identifier for the sales representative
    StructField("PROMOTION_ID", StringType(), True)                     # Identifier for any applied promotions
])

# Specify table name, data source location, and checkpoint directory for "orders" data

orders_table_name = 'stoyan.bronze_orders'
orders_load_location = 's3://data-engineering-upskill-final-exam/stoyan/input_data/orders'
orders_checkpoint_location = 's3://data-engineering-upskill-final-exam/stoyan/bronze_orders/checkpoint'


# Define a streaming DataFrame for reading "orders" data

orders_stream = (
spark.readStream
.format("cloudFiles")                                                # Enable Auto Loader for cloud file sources
.option("cloudFiles.format", "csv")                                  # Specify the input file format as CSV
.option("header", "true")                                            # Indicate that the first row contains column headers
.schema(orders_schema)                                               # Apply the defined schema for orders
.load(orders_load_location)                                          # Load data from the specified S3 location
)




In [0]:
orders_stream.writeStream \
.foreachBatch(lambda df, batch_id: process_batch(df, batch_id, orders_table_name)) \
.option("checkpointLocation", orders_checkpoint_location) \
.trigger(availableNow=True) \
.outputMode("append") \
.start()



<pyspark.sql.connect.streaming.query.StreamingQuery at 0x7fc7901d3c50>