In [22]:
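# Theory / further reading:

# Session windows in Spark Structured Streaming (background reading only - the
# aggregation in this notebook uses a sliding time window via window(), not a session window)
# https://www.databricks.com/blog/2021/10/12/native-support-of-session-window-in-spark-structured-streaming.html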


In [1]:
# Create the Spark Session
from pyspark.sql import SparkSession

# Collect the session-level settings in one place:
#  - graceful stop on shutdown
#  - the Kafka SQL connector package to pull in
#  - number of shuffle partitions
session_conf = {
    "spark.streaming.stopGracefullyOnShutdown": True,
    "spark.jars.packages": "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2",
    "spark.sql.shuffle.partitions": 8,
}

# Name the application and run against the local[*] master
session_builder = SparkSession.builder.appName("Spark Window Operation").master("local[*]")
for conf_key, conf_value in session_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

# Build the session, or reuse one that already exists
spark = session_builder.getOrCreate()

# Echo the session so the notebook renders it as the cell output
spark

In [None]:
# Sample Topic payload 
# {"event_time": "2024-04-09 12:00:00.000000", "data": "owl dog owl"}
# {"event_time": "2024-04-09 12:03:00.000000", "data": "owl"}
# {"event_time": "2024-04-09 12:05:00.000000", "data": "owl"}

In [12]:
# Create the kafka_df to read from kafka

# Kafka source settings: broker address, topic to subscribe to, and the
# offset position the stream starts reading from.
kafka_source_options = {
    "kafka.bootstrap.servers": "ed-kafka:29092",
    "subscribe": "wildlife-1",
    "startingOffsets": "earliest",
}

# Streaming DataFrame backed by the Kafka topic
kafka_df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [13]:
# Convert binary to string value column
from pyspark.sql.functions import expr

# Overwrite the `value` column with its string form so it can be parsed as JSON later
value_as_text = expr("cast(value as string)")
kafka_json_df = kafka_df.withColumn("value", value_as_text)

In [14]:
from pyspark.sql.functions import from_json, col, split, explode

# Schema of the JSON payload, written as a DDL string: two string fields
json_schema = "event_time string, data string"

# Parse the JSON text held in `value` and attach the result
# as a new column named `values_json`
parsed_value = from_json(col("value"), json_schema)
json_df = kafka_json_df.withColumn("values_json", parsed_value)

In [15]:
# Select the required columns

# Pull the payload fields out of the parsed `values_json` column into top-level columns
payload_fields = ["event_time", "data"]
flattened_df = json_df.select(*[f"values_json.{field_name}" for field_name in payload_fields])

In [16]:
# Split the data in words

# 1) Break the `data` text on single spaces into an array column `words`
tokenized_df = flattened_df.withColumn("words", split("data", " "))

# 2) Emit one row per array element in a new column `word`
exploded_df = tokenized_df.withColumn("word", explode("words"))

# 3) Cast `event_time` from string to timestamp
words_df = exploded_df.withColumn("event_time", col("event_time").cast("timestamp"))

In [17]:
# Print the schema of words_df to check the column types after the split / explode / cast steps
words_df.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- data: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- word: string (nullable = false)



In [18]:
# Aggregate the words to generate count
from pyspark.sql.functions import count, lit, window

# Sliding event-time window: 10 minutes long, sliding every 5 minutes
sliding_window_col = window("event_time", "10 minutes", "5 minutes")

# Row count per group, exposed as `cnt`
word_count_col = count(lit(1)).alias("cnt")

# Apply a 10-minute watermark on event_time, then count per (window, word)
df_agg = (
    words_df
    .withWatermark("event_time", "10 minutes")
    .groupBy(sliding_window_col, "word")
    .agg(word_count_col)
)

In [19]:
# Flatten the `window` struct into explicit start/end columns,
# keeping the word and its count alongside
final_projection = [
    "window.start as start_time",
    "window.end as end_time",
    "word",
    "cnt",
]
df_final = df_agg.selectExpr(*final_projection)

# Show the resulting output schema
df_final.printSchema()

root
 |-- start_time: timestamp (nullable = true)
 |-- end_time: timestamp (nullable = true)
 |-- word: string (nullable = false)
 |-- cnt: long (nullable = false)



In [20]:
# Limitations of "complete" output mode -
# Complete mode re-emits the whole result table on every trigger, so Spark has to
# keep all aggregation state. The watermark defined upstream is therefore NOT used
# to drop old state here: the query is accepted, but withWatermark gives no state cleanup.
# Avoid 'complete' mode in production environment: the state keeps growing and
# this may lead to OOM issue.

# Start the console sink in complete mode, triggering every 30 seconds.
# The handle is kept in a variable so the query can be inspected or stopped
# later (e.g. complete_mode_query.stop()) instead of being left running with no reference.
complete_mode_query = (
    df_final
    .writeStream
    .format("console")
    .outputMode("complete")
    .trigger(processingTime='30 seconds')
    .option("checkpointLocation", "checkpoint_dir_kafka_2")
    .start()
)

# Echo the handle so the cell still displays the StreamingQuery object
complete_mode_query

<pyspark.sql.streaming.StreamingQuery at 0x7ffff8c4dba0>

In [21]:
# Start the console sink in update mode, triggering every 30 seconds.
# The handle is kept in a variable so the query can be inspected or stopped
# later (e.g. update_mode_query.stop()) instead of being left running with no reference.
update_mode_query = (
    df_final
    .writeStream
    .format("console")
    .outputMode("update")
    .trigger(processingTime='30 seconds')
    .option("checkpointLocation", "checkpoint_dir_kafka_3")
    .start()
)

# Echo the handle so the cell still displays the StreamingQuery object
update_mode_query

<pyspark.sql.streaming.StreamingQuery at 0x7ffff82a91b0>

In [None]:
25/03/09 12:08:06 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
2025-03-09 17:38:10 -------------------------------------------
2025-03-09 17:38:10 Batch: 0
2025-03-09 17:38:10 -------------------------------------------
2025-03-09 17:38:11 +-------------------+-------------------+----+---+
2025-03-09 17:38:11 |         start_time|           end_time|word|cnt|
2025-03-09 17:38:11 +-------------------+-------------------+----+---+
2025-03-09 17:38:11 |2024-04-09 11:55:00|2024-04-09 12:05:00| dog|  1|
2025-03-09 17:38:11 |2024-04-09 11:55:00|2024-04-09 12:05:00| owl|  2|
2025-03-09 17:38:11 |2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  2|
2025-03-09 17:38:11 |2024-04-09 12:00:00|2024-04-09 12:10:00| dog|  1|
2025-03-09 17:38:11 +-------------------+-------------------+----+---+
2025-03-09 17:38:11 
2025-03-09 17:38:15 
[I 2025-03-09 12:08:15.954 ServerApp] Saving file at /spark-streaming/08_window_operations_and_watermarks.ipynb
2025-03-09 17:38:18 -------------------------------------------
2025-03-09 17:38:18 Batch: 0
2025-03-09 17:38:18 -------------------------------------------
2025-03-09 17:38:19 +-------------------+-------------------+----+---+
2025-03-09 17:38:19 |         start_time|           end_time|word|cnt|
2025-03-09 17:38:19 +-------------------+-------------------+----+---+
2025-03-09 17:38:19 |2024-04-09 11:55:00|2024-04-09 12:05:00| dog|  1|
2025-03-09 17:38:19 |2024-04-09 11:55:00|2024-04-09 12:05:00| owl|  2|
2025-03-09 17:38:19 |2024-04-09 12:00:00|2024-04-09 12:10:00| owl|  2|
2025-03-09 17:38:19 |2024-04-09 12:00:00|2024-04-09 12:10:00| dog|  1|
2025-03-09 17:38:19 +-------------------+-------------------+----+---+
2025-03-09 17:38:19 
