In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType,StringType
from pyspark.sql.functions import lit, current_timestamp,col


In [0]:
# Function to process and write batches of data into a Delta table

def process_batch(df, batch_id, table_name):
    # Add a batch ID column to identify the batch being processed
    df = df.withColumn("batch_id", lit(batch_id))
    # Add a timestamp column to record the data ingestion time
    df = df.withColumn("ingest_datetime", current_timestamp())
    # Write the DataFrame to a Delta table in append mode
    df.write.format("delta").mode("append").saveAsTable(table_name)


# Define the S3 location for the "products" data and checkpoint directory
products_load_location = 's3://data-engineering-upskill-final-exam/stoyan/input_data/products'
products_checkpoint_location = 's3://data-engineering-upskill-final-exam/stoyan/bronze_products/checkpoint'
products_table_name = 'stoyan.bronze_products'


# Define the schema for the "products" data
products_schema = StructType([
    StructField("PRODUCT_ID", IntegerType(), False),                # Unique identifier for each product
    StructField("PRODUCT_NAME", StringType(), True),                # Name of the product
    StructField("CATEGORY_NAME", StringType(), True),               # Category to which the product belongs
    StructField("WEIGHT_CLASS", IntegerType(), True),               # Weight classification of the product
    StructField("PRODUCT_STATUS", StringType(), True),              # Status of the product
    StructField("LIST_PRICE", DoubleType(), True),                  # Listed price of the product
    StructField("MIN_PRICE", DoubleType(), True)                    # Minimum allowable price for the product
])


# Define a streaming DataFrame for reading "products" data

products_stream = (
spark.readStream
.format("cloudFiles")                                               # Enable Auto Loader for cloud file sources
.option("cloudFiles.format", "csv")                                 # Specify the input file format as CSV
.option("header", "true")                                           # Indicate that the first row contains column headers
.schema(products_schema)                                            # Apply the defined schema for products
.load(products_load_location)                                       # Load data from the specified S3 location
)

 
 

In [0]:

products_stream.writeStream \
.foreachBatch(lambda df, batch_id: process_batch(df, batch_id, products_table_name)) \
.option("checkpointLocation", products_checkpoint_location) \
.trigger(availableNow=True) \
.outputMode("append") \
.start() 

<pyspark.sql.connect.streaming.query.StreamingQuery at 0x7f3894754350>