In [28]:
from delta import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [29]:
builder = (SparkSession.builder
           .appName("handle-late-and-out-of-order-data")
           .master("spark://spark-master:7077")
           .config("spark.executor.memory", "512m")
           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"))

spark = configure_spark_with_delta_pip(builder,['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1']).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [30]:
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "kafka:9092")
      .option("subscribe", "events")
      .option("startingOffsets", "latest")
      .load())

In [31]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
schema = StructType([
    StructField('user_id', IntegerType(), True),
    StructField('event_type', StringType(), True),
    StructField('event_time', StringType(), True),
    StructField('processing_time', StringType(), True)])

df = df.withColumn('value', from_json(col('value').cast("STRING"), schema))

In [32]:
from pyspark.sql.functions import col

df = (df.select(
          col('value.user_id').alias('user_id'),
          col('value.event_type').alias('event_type'),
          col('value.event_time').alias('event_time'),
          col('value.processing_time').alias('processing_time'))
      .withColumn("event_time", to_timestamp(col("event_time"), "MM/dd/yyyy, HH:mm:ss" ))
      .withColumn("processing_time", to_timestamp(col("processing_time"), "MM/dd/yyyy, HH:mm:ss"))     )

In [33]:
# Define the watermark logic for the streaming DataFrame
df = df.withWatermark("event_time", "10 seconds")

In [34]:
df = (df
      .groupBy(window(col("event_time"), "1 minute", "1 minute"), col("user_id"))
      .count().alias("NumberOfEvents"))

In [35]:
query = (df.writeStream
    .outputMode('update')
    .format('console')
    .option("truncate", False)
    .start())

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+-------+-----+
|window|user_id|count|
+------+-------+-----+
+------+-------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:25:00, 2023-08-07 14:26:00}|10     |2    |
|{2023-08-07 14:25:00, 2023-08-07 14:26:00}|5      |1    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:25:00, 2023-08-07 14:26:00}|4      |1    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:25:00, 2023-08-07 14:26:00}|4      |2    |
|{2023-08-07 14:25:00, 2023-08-07 14:26:00}|5      |2    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:25:00, 2023-08-07 14:26:00}|3      |1    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:26:00, 2023-08-07 14:27:00}|6      |1    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:25:00, 2023-08-07 14:26:00}|6      |1    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:26:00, 2023-08-07 14:27:00}|1      |1    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:26:00, 2023-08-07 14:27:00}|1      |2    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:26:00, 2023-08-07 14:27:00}|1      |3    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:27:00, 2023-08-07 14:28:00}|6      |1    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+------+-------+-----+
|window|user_id|count|
+------+-------+-----+
+------+-------+-----+



                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:27:00, 2023-08-07 14:28:00}|9      |1    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:27:00, 2023-08-07 14:28:00}|9      |2    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 14
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:27:00, 2023-08-07 14:28:00}|3      |1    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 15
-------------------------------------------
+------+-------+-----+
|window|user_id|count|
+------+-------+-----+
+------+-------+-----+



                                                                                

-------------------------------------------
Batch: 16
-------------------------------------------
+------------------------------------------+-------+-----+
|window                                    |user_id|count|
+------------------------------------------+-------+-----+
|{2023-08-07 14:27:00, 2023-08-07 14:28:00}|4      |1    |
+------------------------------------------+-------+-----+



                                                                                

-------------------------------------------
Batch: 17
-------------------------------------------
+------+-------+-----+
|window|user_id|count|
+------+-------+-----+
+------+-------+-----+



In [36]:
query.stop()

In [37]:
spark.stop() 