# Handling Late Data and Watermarking

Experience how late arrivals and output modes interact in windowed aggregation queries.

**NOTE**: Run the other notebook first.

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import io
from pyspark.sql.functions import *
import time
import json
import struct
import requests 

# Extra JVM packages for the Kafka source. PYSPARK_SUBMIT_ARGS is set here,
# before the SparkSession below is created.
# Every Spark artifact uses the same Scala build and Spark release: the original
# list mixed a Scala 2.11 / Spark 2.4.5 jar with the Scala 2.12 / Spark 3.0.1 one.
# NOTE(review): assumes the installed PySpark is 3.0.1 built for Scala 2.12, as
# implied by the spark-sql-kafka coordinate - confirm against the environment.
SCALA_VERSION = "2.12"
SPARK_VERSION = "3.0.1"
KAFKA_PACKAGES = [
    "org.apache.spark:spark-sql-kafka-0-10_" + SCALA_VERSION + ":" + SPARK_VERSION,
    "org.apache.spark:spark-streaming-kafka-0-10_" + SCALA_VERSION + ":" + SPARK_VERSION,
    "org.apache.kafka:kafka-clients:2.6.0",
]
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages " + ",".join(KAFKA_PACKAGES) + " pyspark-shell"

# Local session using all available cores; reuses an existing session if any.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

spark

Set up the Kafka connection variables (bootstrap servers and topic).

In [2]:
# Kafka connection settings used by the streaming reader.
topic = "words"          # topic the reader subscribes to
servers = "kafka:9092"   # bootstrap server, host:port

In [3]:
from pyspark.sql.types import *

# Schema of the JSON payload: a word and its event-time timestamp, both nullable.
word_field = StructField("word", StringType(), True)
ts_field = StructField("ts", TimestampType(), True)
schema = StructType([word_field, ts_field])

In [4]:
# Streaming DataFrame over the Kafka topic, starting from the "latest" offsets.
kafka_options = {
    "kafka.bootstrap.servers": servers,
    "startingOffsets": "latest",
    "subscribe": topic,
}
raw_sdf = spark.readStream.format("kafka").options(**kafka_options).load()

In [5]:
# Sanity check: raw_sdf should be a streaming DataFrame (expected output: True).
raw_sdf.isStreaming

True

In [6]:
# Cast the Kafka `value` column to a string, parse it as JSON using `schema`,
# then expand the parsed struct so `word` and `ts` become top-level columns.
parsed_value = from_json(col("value").cast("string"), schema).alias("value")
words = raw_sdf.select(parsed_value).select("value.*")

In [7]:
# Print the column names and types of the parsed stream.
words.printSchema()

root
 |-- word: string (nullable = true)
 |-- ts: timestamp (nullable = true)



In [8]:
# Update-mode query: word counts per sliding event-time window
# (10-minute windows sliding every 5 minutes) with a 10-minute watermark on `ts`.
# Results are written to the in-memory table `sinkTable_update`.
update_counts = (
    words
    .withWatermark("ts", "10 minutes")
    .groupBy(window(words.ts, "10 minutes", "5 minutes"), words.word)
    .count()
)
q_update = (
    update_counts.writeStream
    .format("memory")
    .outputMode("update")
    .queryName("sinkTable_update")
    .start()
)

In [9]:
# Append-mode query: the same windowed word count (10-minute windows sliding
# every 5 minutes, 10-minute watermark on `ts`), written to the in-memory table
# `sinkTable_append`.
append_counts = (
    words
    .withWatermark("ts", "10 minutes")
    .groupBy(window("ts", "10 minutes", "5 minutes"), words.word)
    .count()
)
q_append = (
    append_counts.writeStream
    .format("memory")
    .outputMode("append")
    .queryName("sinkTable_append")
    .start()
)

Go to the other notebook and run the cells in **Section 1)**

In [14]:
# Current status of the update-mode query (message, isDataAvailable, isTriggerActive).
q_update.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [15]:
# Current status of the append-mode query (message, isDataAvailable, isTriggerActive).
q_append.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [16]:
# Inspect the update-mode sink after Section 1.
# The table can hold several counts for the same (window, word): the later outputs
# in this notebook show e.g. dog with count 1 and 2 in one window. Ordering by
# count as well, like the later update queries, keeps those versions in a stable order.
spark.sql("SELECT * FROM sinkTable_update ORDER BY window,word,count").show(15, False)

+------------------------------------------+----+-----+
|window                                    |word|count|
+------------------------------------------+----+-----+
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|dog |1    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|owl |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|dog |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|owl |1    |
+------------------------------------------+----+-----+



In [17]:
# Inspect the append-mode sink after Section 1.
# Append mode writes a (window, word) count only once it is final, i.e. after the
# watermark has passed the end of the window, so the table can still be empty here.
append_rows = spark.sql("SELECT * FROM sinkTable_append ORDER BY window,word")
append_rows.show(n=10, truncate=False)

+------+----+-----+
|window|word|count|
+------+----+-----+
+------+----+-----+



Go to the other notebook and run the cells in **Section 2)**

In [24]:
# Status of the update-mode query after Section 2 of the other notebook.
q_update.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [21]:
# Status of the append-mode query after Section 2 of the other notebook.
q_append.status

{'message': 'No new data but cleaning up state',
 'isDataAvailable': False,
 'isTriggerActive': True}

In [25]:
# Inspect the update-mode sink after Section 2.
# The table keeps every emitted version of a (window, word) count, so the same key
# can appear with increasing counts; the highest count per key is the current one.
update_rows = spark.sql("SELECT * FROM sinkTable_update ORDER BY window,word,count")
update_rows.show(n=15, truncate=False)

+------------------------------------------+----+-----+
|window                                    |word|count|
+------------------------------------------+----+-----+
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|cat |1    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|dog |1    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|owl |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|cat |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|dog |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|dog |2    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|owl |1    |
|[2024-03-24 12:10:00, 2024-03-24 12:20:00]|dog |1    |
+------------------------------------------+----+-----+



In [27]:
# Inspect the append-mode sink after Section 2.
# Rows appear only for windows the watermark has already closed, each exactly once.
append_rows = spark.sql("SELECT * FROM sinkTable_append ORDER BY window,word")
append_rows.show(n=10, truncate=False)

+------+----+-----+
|window|word|count|
+------+----+-----+
+------+----+-----+



Go to the other notebook and run the cells in **Section 3)**

In [30]:
# Status of the update-mode query after Section 3 of the other notebook.
q_update.status

{'message': 'Getting offsets from KafkaV2[Subscribe[words]]',
 'isDataAvailable': False,
 'isTriggerActive': True}

In [31]:
# Status of the append-mode query after Section 3 of the other notebook.
q_append.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [32]:
# Inspect the update-mode sink after Section 3.
# Every emitted version of a (window, word) count stays in the table, so keys
# updated by the new events show up with several, increasing counts.
update_rows = spark.sql("SELECT * FROM sinkTable_update ORDER BY window,word,count")
update_rows.show(n=15, truncate=False)

+------------------------------------------+----+-----+
|window                                    |word|count|
+------------------------------------------+----+-----+
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|cat |1    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|dog |1    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|dog |2    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|owl |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|cat |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|dog |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|dog |2    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|dog |3    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|owl |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|owl |2    |
|[2024-03-24 12:10:00, 2024-03-24 12:20:00]|cat |1    |
|[2024-03-24 12:10:00, 2024-03-24 12:20:00]|dog |1    |
|[2024-03-24 12:10:00, 2024-03-24 12:20:00]|owl |1    |
|[2024-03-24 12:15:00, 2024-03-24 12:25:00]|cat |1    |
+------------------------------------------+----+-----+

In [33]:
# Inspect the append-mode sink after Section 3.
# A window's counts are appended only after the watermark passes the window end.
append_rows = spark.sql("SELECT * FROM sinkTable_append ORDER BY window,word")
append_rows.show(n=10, truncate=False)

+------+----+-----+
|window|word|count|
+------+----+-----+
+------+----+-----+



Go to the other notebook and run the cells in **Section 4)**

In [37]:
# Status of the update-mode query after Section 4 of the other notebook.
q_update.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [38]:
# Status of the append-mode query after Section 4 of the other notebook.
q_append.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [39]:
# Inspect the update-mode sink after Section 4.
# The table keeps every emitted version of a (window, word) count, so it only
# grows. It held 14 rows after Section 3 and Section 4 adds more, which exceeds
# the previous 15-row limit; a larger limit keeps the last windows visible.
spark.sql("SELECT * FROM sinkTable_update ORDER BY window,word,count").show(50, False)

+------------------------------------------+------+-----+
|window                                    |word  |count|
+------------------------------------------+------+-----+
|[2024-03-24 11:55:00, 2024-03-24 12:05:00]|donkey|1    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|cat   |1    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|dog   |1    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|dog   |2    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|donkey|1    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|owl   |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|cat   |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|dog   |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|dog   |2    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|dog   |3    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|owl   |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|owl   |2    |
|[2024-03-24 12:10:00, 2024-03-24 12:20:00]|cat   |1    |
|[2024-03-24 12:10:00, 2024-03-24 12:20:00]|dog   |1    |
|[2024-03-24 1

In [40]:
# Inspect the append-mode sink after Section 4.
# Only windows already closed by the watermark are present, each (window, word)
# exactly once with its final count.
append_rows = spark.sql("SELECT * FROM sinkTable_append ORDER BY window,word")
append_rows.show(n=10, truncate=False)

+------------------------------------------+----+-----+
|window                                    |word|count|
+------------------------------------------+----+-----+
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|cat |1    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|dog |2    |
|[2024-03-24 12:00:00, 2024-03-24 12:10:00]|owl |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|cat |1    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|dog |3    |
|[2024-03-24 12:05:00, 2024-03-24 12:15:00]|owl |2    |
+------------------------------------------+----+-----+



Too late, or not?

https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#semantic-guarantees-of-aggregation-with-watermarking