Before we start, we need to make sure that we have a Kafka cluster running and a topic that produces some streaming data. For simplicity, we will use a single-node Kafka cluster and a topic named `events`. Open the `4.0 events-gen-kafka.ipynb` notebook and execute the cell. That notebook produces one event record every second and puts it on the Kafka topic `events`.

In [14]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, window, count, to_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [15]:
# Configure the session builder one setting at a time: application name,
# cluster master URL, and executor memory.
builder = SparkSession.builder.appName("apply-window-aggregations")
builder = builder.master("spark://spark-master:7077")
builder = builder.config("spark.executor.memory", "512m")

# Register the Delta Lake SQL extension and make Delta the session catalog.
builder = builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# Let the Delta helper attach its own packages plus the Kafka source
# connector, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1'],
).getOrCreate()

# Keep the notebook output readable: only log errors.
spark.sparkContext.setLogLevel("ERROR")

In [16]:
# Open a streaming read on the `events` Kafka topic, starting from the
# earliest available offsets so already-published records are included.
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka:9092",
        "subscribe": "events",
        "startingOffsets": "earliest",
    })
    .load()
)

In [17]:
# Layout of the JSON document carried in each Kafka message value.
# Both time fields arrive as strings and are parsed into timestamps later.
schema = (
    StructType()
    .add('user_id', IntegerType(), True)
    .add('event_type', StringType(), True)
    .add('event_time', StringType(), True)
    .add('processing_time', StringType(), True)
)

# Replace the raw `value` column with the struct obtained by casting it to
# a string and parsing that string as JSON against `schema`.
df = df.withColumn(
    'value',
    from_json(col('value').cast("STRING"), schema),
)

In [18]:
# Flatten the parsed `value` struct into top-level columns. The two time
# fields are converted from "MM/dd/yyyy, HH:mm:ss" strings to timestamps in
# the same projection, keeping the original column names and order.
df = df.select(
    col('value.user_id').alias('user_id'),
    col('value.event_type').alias('event_type'),
    to_timestamp(
        col('value.event_time'), "MM/dd/yyyy, HH:mm:ss"
    ).alias('event_time'),
    to_timestamp(
        col('value.processing_time'), "MM/dd/yyyy, HH:mm:ss"
    ).alias('processing_time'),
)

In [19]:
# Bucket events into 60-minute windows on event_time (slide equals the
# window duration, so the windows do not overlap) and count rows per
# (window, event_type).
# NOTE(review): the result is labelled "NumberOfUsers" but it is a plain
# count of user_id values, not a distinct-user count; confirm this is intended.
df = (
    df.groupBy(
        window(
            timeColumn=col("event_time"),
            windowDuration="60 minute",
            slideDuration="60 minute",
        ),
        col("event_type"),
    )
    .agg(count("user_id").alias("NumberOfUsers"))
)

In [20]:
# Start the streaming query in "complete" mode: every micro-batch prints the
# full aggregated result table to the console, without truncating cells.
query = df.writeStream.start(
    format='console',
    outputMode='complete',
    truncate=False,
)

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+----------+-------------+
|window                                    |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|view      |4            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|click     |5            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|purchase  |4            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|like      |8            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|share     |2            |
+------------------------------------------+----------+-------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+----------+-------------+
|window                                    |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|view      |4            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|click     |5            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|purchase  |5            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|like      |8            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|share     |3            |
+------------------------------------------+----------+-------------+



[Stage 4:>                                                          (0 + 1) / 1]

In [21]:
# Stop the complete-mode streaming query before starting another one.
query.stop()

24/02/04 18:15:30 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 2, writer: ConsoleWriter[numRows=20, truncate=false]] is aborting.
24/02/04 18:15:30 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 2, writer: ConsoleWriter[numRows=20, truncate=false]] aborted.


In [22]:
# Start the streaming query in "update" mode. In the captured output below,
# the first batch prints every group and the following batch prints only the
# group whose count changed.
query = (
    df.writeStream
    .format("console")
    .option("truncate", False)
    .outputMode("update")
    .start()
)

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+----------+-------------+
|window                                    |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|view      |4            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|click     |5            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|purchase  |5            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|like      |8            |
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|share     |4            |
+------------------------------------------+----------+-------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+----------+-------------+
|window                                    |event_type|NumberOfUsers|
+------------------------------------------+----------+-------------+
|{2024-02-04 18:00:00, 2024-02-04 19:00:00}|like      |9            |
+------------------------------------------+----------+-------------+





In [25]:
# Append output mode (left disabled).
# NOTE(review): presumably disabled because append mode on a streaming
# aggregation needs a watermark on event_time, which this notebook never
# sets; confirm before enabling.
# query = (df.writeStream.outputMode("append") 
#     .format("console") 
#     .option("truncate", False) 
#     .start())

In [26]:
# Stop the update-mode streaming query started above.
query.stop()

In [27]:
# Shut down the Spark session created at the top of the notebook.
spark.stop() 