# Spark Structured Streaming - Demo
## Fire alarm

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
import io
from pyspark.sql.functions import *
import time
import json
import struct
import requests 

# Connector jars that spark-submit resolves when the JVM is launched.
# Every Spark artifact listed here must target the same Scala binary version
# (_2.12) and the same Spark release (3.0.1): the previous list also contained
# a _2.11 / 2.4.5 artifact, which puts binary-incompatible classes on the
# classpath next to the 2.12 ones.
# This variable must be set before the SparkSession below is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1,'
    'org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.1,'
    'org.apache.kafka:kafka-clients:2.6.0 '
    'pyspark-shell'
)

# Local session using all available cores; getOrCreate() returns the already
# running session if this cell is executed again.
spark = (SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
        )

spark

Set up the Kafka topic names and the bootstrap server address used throughout the notebook.

In [3]:
# Kafka coordinates shared by the cells below: the address passed as
# "kafka.bootstrap.servers" and the two topics the readers subscribe to.
servers = "kafka:9092"
smoke_topic, temperature_topic = 'SmokeSensorEvent', 'TemperatureSensorEvent'

## Understanding spark-kafka integration
Let's first treat Kafka as a batch (bounded) source, using `spark.read` instead of `spark.readStream` (see below).

In [4]:
# Bounded (batch) read of the smoke topic: everything between the earliest
# and the latest offsets available when the read is executed.
smoke_df = (
    spark.read.format("kafka")
    .options(**{
        "kafka.bootstrap.servers": servers,
        "subscribe": smoke_topic,
        "startingOffsets": "earliest",
        "endingOffsets": "latest",
    })
    .load()
)

In [5]:
smoke_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [6]:
smoke_df.show(5)

+-------+--------------------+----------------+---------+------+--------------------+-------------+
|    key|               value|           topic|partition|offset|           timestamp|timestampType|
+-------+--------------------+----------------+---------+------+--------------------+-------------+
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     0|2024-10-19 05:17:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     1|2024-10-19 05:17:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     2|2024-10-19 05:17:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     3|2024-10-19 05:18:...|            0|
|[53 31]|[7B 22 73 65 6E 7...|SmokeSensorEvent|        0|     4|2024-10-19 05:18:...|            0|
+-------+--------------------+----------------+---------+------+--------------------+-------------+
only showing top 5 rows



In [7]:
# Kafka delivers key and value as raw bytes: cast both to strings so the JSON
# payload becomes readable, and keep the topic/partition/offset metadata.
stringified_smoke_df = smoke_df.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
    "topic",
    "partition",
    "offset",
)
stringified_smoke_df.show(n=5, truncate=False)

+---+--------------------------------------------------+----------------+---------+------+
|key|value                                             |topic           |partition|offset|
+---+--------------------------------------------------+----------------+---------+------+
|S1 |{"sensor": "S1", "smoke": false, "ts": 1729315058}|SmokeSensorEvent|0        |0     |
|S1 |{"sensor": "S1", "smoke": false, "ts": 1729315069}|SmokeSensorEvent|0        |1     |
|S1 |{"sensor": "S1", "smoke": false, "ts": 1729315079}|SmokeSensorEvent|0        |2     |
|S1 |{"sensor": "S1", "smoke": false, "ts": 1729315089}|SmokeSensorEvent|0        |3     |
|S1 |{"sensor": "S1", "smoke": false, "ts": 1729315099}|SmokeSensorEvent|0        |4     |
+---+--------------------------------------------------+----------------+---------+------+
only showing top 5 rows



In [8]:
stringified_smoke_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)



In [9]:
from pyspark.sql.types import *

# Layout of the JSON document carried in the value of a smoke event:
# sensor id, smoke flag and event timestamp. All three fields are nullable.
smoke_schema = (StructType()
    .add("sensor", StringType(), True)
    .add("smoke", BooleanType(), True)
    .add("ts", TimestampType(), True))

In [10]:
# Parse the JSON text in `value` into a typed struct using smoke_schema; `key`
# is passed through as-is because it is already a string in stringified_smoke_df.
# NOTE: this rebinds smoke_df, which until now was the raw Kafka batch frame.
smoke_df = stringified_smoke_df.select(
    "key",
    from_json("value", smoke_schema).alias("value"),
)

In [11]:
smoke_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- sensor: string (nullable = true)
 |    |-- smoke: boolean (nullable = true)
 |    |-- ts: timestamp (nullable = true)



In [12]:
smoke_df.select("value.*").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2024-10-19 05:17:38|
|    S1|false|2024-10-19 05:17:49|
|    S1|false|2024-10-19 05:17:59|
|    S1|false|2024-10-19 05:18:09|
|    S1|false|2024-10-19 05:18:19|
+------+-----+-------------------+
only showing top 5 rows



## Let's explore Spark Structured Streaming by example
Please refer to [EPL fire alarm](https://github.com/emanueledellavalle/streaming-data-analytics/tree/main/codes/epl_firealarm) for the EPL version of the following queries.

### Let's create the streaming Data Frames using the data in the kafka smoke topic

This time we use the streaming reader, `spark.readStream`.

In [13]:
# Unbounded (streaming) read of the smoke topic, starting from the earliest
# offset; no endingOffsets is given because the stream has no end.
raw_streaming_smoke_df = (
    spark.readStream.format("kafka")
    .options(**{
        "kafka.bootstrap.servers": servers,
        "startingOffsets": "earliest",
        "subscribe": smoke_topic,
    })
    .load()
)

In [14]:
raw_streaming_smoke_df.isStreaming

True

In [15]:
raw_streaming_smoke_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [16]:
# Decode the binary Kafka value as a string, parse it as JSON with smoke_schema
# and flatten the resulting struct into top-level columns (sensor, smoke, ts).
smoke_sdf = (
    raw_streaming_smoke_df
    .select(
        from_json(raw_streaming_smoke_df["value"].cast("string"), smoke_schema).alias("value")
    )
    .select("value.*")
)

In [17]:
smoke_sdf.printSchema()

root
 |-- sensor: string (nullable = true)
 |-- smoke: boolean (nullable = true)
 |-- ts: timestamp (nullable = true)



**NOTE**: `smoke_sdf` is a *streaming* DataFrame, not a static one: you cannot directly execute an action on it. 

**The following cell *intentionally* triggers an error**.

In [18]:
from pyspark.sql.utils import AnalysisException

# Demonstration: an action such as count() cannot be run on a streaming
# DataFrame. The expected AnalysisException is caught and printed so that the
# message is still shown while "Run All" can continue past this cell.
try:
    smoke_sdf.count()
except AnalysisException as error:
    print("AnalysisException:", error)

AnalysisException: Queries with streaming sources must be executed with writeStream.start();;
kafka

Queries with streaming sources must be registered and started with `writeStream.start()`

### Let's register and start a simple query

In [19]:
# Register and start a query that appends every parsed smoke event to the
# in-memory table "sinkTable".
# The memory sink is for debug purpose only! DO NOT USE IN PRODUCTION
basic_query = smoke_sdf.writeStream.start(
    format="memory",
    queryName="sinkTable",
)

In [20]:
basic_query.lastProgress

{'id': 'ba768a9b-666d-40ab-8a29-53257df35a20',
 'runId': '78a34ac4-3219-4b18-8d6f-510f1d24ed79',
 'name': 'sinkTable',
 'timestamp': '2024-10-19T05:19:00.001Z',
 'batchId': 1,
 'numInputRows': 1,
 'inputRowsPerSecond': 71.42857142857143,
 'processedRowsPerSecond': 7.874015748031496,
 'durationMs': {'addBatch': 44,
  'getBatch': 0,
  'latestOffset': 3,
  'queryPlanning': 20,
  'triggerExecution': 127,
  'walCommit': 42},
 'stateOperators': [],
 'sources': [{'description': 'KafkaV2[Subscribe[SmokeSensorEvent]]',
   'startOffset': {'SmokeSensorEvent': {'0': 8}},
   'endOffset': {'SmokeSensorEvent': {'0': 9}},
   'numInputRows': 1,
   'inputRowsPerSecond': 71.42857142857143,
   'processedRowsPerSecond': 7.874015748031496}],
 'sink': {'description': 'MemorySink', 'numOutputRows': 1}}

run the following cell to see the most recent content of the sinkTable

In [21]:
spark.sql("SELECT * FROM sinkTable ORDER BY TS DESC").show(5)

+------+-----+-------------------+
|sensor|smoke|                 ts|
+------+-----+-------------------+
|    S1|false|2024-10-19 05:19:10|
|    S1|false|2024-10-19 05:18:59|
|    S1|false|2024-10-19 05:18:49|
|    S1|false|2024-10-19 05:18:39|
|    S1|false|2024-10-19 05:18:29|
+------+-----+-------------------+
only showing top 5 rows



do not forget to stop queries that you are not using

In [22]:
basic_query.stop()

### Let's create the streaming Data Frames for the kafka temperature topic

In [23]:
# Layout of the JSON document carried in the value of a temperature event.
# (The identifier spelling is kept as-is: the parsing step below uses it.)
temperarture_schema = (StructType()
    .add("sensor", StringType(), True)
    .add("temperature", DoubleType(), True)
    .add("ts", TimestampType(), True))

# Unbounded read of the temperature topic, starting from the earliest offset.
raw_streaming_temperature_df = (
    spark.readStream.format("kafka")
    .options(**{
        "kafka.bootstrap.servers": servers,
        "startingOffsets": "earliest",
        "subscribe": temperature_topic,
    })
    .load()
)

# Decode the binary value, parse the JSON and flatten it into
# (sensor, temperature, ts) columns.
temperature_sdf = (
    raw_streaming_temperature_df
    .select(
        from_json(raw_streaming_temperature_df["value"].cast("string"), temperarture_schema).alias("value")
    )
    .select("value.*")
)

In [24]:
temperature_sdf.printSchema()

root
 |-- sensor: string (nullable = true)
 |-- temperature: double (nullable = true)
 |-- ts: timestamp (nullable = true)



## Q0 - Filter

The temperature events whose temperature is greater than 50 °C 

### the SQL style

In [25]:
# create (or replace) a logic table on top of the streaming data frame;
# createOrReplaceTempView keeps this cell re-runnable, whereas createTempView
# fails when a view with the same name is already registered
temperature_sdf.createOrReplaceTempView("TemperatureSensorEvent")

# write your query in SQL, register it and start it; the matching rows are
# appended to the in-memory table "sinkTable"
q0 = (spark.sql("select * from TemperatureSensorEvent where temperature > 50")
                     .writeStream
                     .format("memory")
                     .queryName("sinkTable")
                     .start())

Let's ask for the execution plan; a few cells below we will compare it with the plan of the same query written in DataFrame style.

In [26]:
q0.explain()

== Physical Plan ==
WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@1ec12c77
+- Project [from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)).sensor AS sensor#277, from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)).temperature AS temperature#278, from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)).ts AS ts#279]
   +- Filter (from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)).temperature > 50.0)
      +- *(1) Project [key#261, value#262, topic#263, partition#264, offset#265L, timestamp#266, timesta

In [31]:
# look up the most recent results
spark.sql("SELECT * FROM sinkTable ORDER BY TS DESC").show(5)

+------+-----------+---+
|sensor|temperature| ts|
+------+-----------+---+
+------+-----------+---+



If you are carefully following the instructions, it should be empty because we are sending temperatures around 20 °C.

Go back to the `temperature_sensor_simulator` notebook, stop the cell that is sending temperature around 20°C and run the one that sends temperature around 55°C

In [32]:
# look up the most recent results
spark.sql("SELECT * FROM sinkTable ORDER BY TS DESC").show(5)

+------+------------------+-------------------+
|sensor|       temperature|                 ts|
+------+------------------+-------------------+
|    S1|53.662382263870484|2024-10-19 05:20:36|
+------+------------------+-------------------+



Now you should see results.

In [33]:
# clean up
q0.stop()
spark.catalog.dropTempView("TemperatureSensorEvent")

### The DataFrame style

In [34]:
# DataFrame-style version of Q0: keep the events hotter than 50 and append
# them to the in-memory table "sinkTable1".
q0bis = (
    temperature_sdf
    .filter("temperature > 50")  # any predicate that fits in a SQL WHERE clause can go here
    .writeStream
    .start(format="memory", queryName="sinkTable1")
)

let's ask for the explanation of the plan. Comparing with the one of the SQL style, you can see that there is no difference. This is expected because the [catalyst optimizer](https://databricks.com/glossary/catalyst-optimizer) created it out of our declarations (which are semantically equivalent)

In [35]:
q0bis.explain()

== Physical Plan ==
WriteToDataSourceV2 org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@23d36e91
+- Project [from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)).sensor AS sensor#277, from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)).temperature AS temperature#278, from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)).ts AS ts#279]
   +- Filter (from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)).temperature > 50.0)
      +- *(1) Project [key#261, value#262, topic#263, partition#264, offset#265L, timestamp#266, timesta

In [36]:
# q0bis was registered with queryName "sinkTable1", so that is the table to
# inspect; "sinkTable" only holds the leftover results of the stopped q0.
spark.sql("SELECT * FROM sinkTable1 ORDER BY TS DESC").show(5)

+------+------------------+-------------------+
|sensor|       temperature|                 ts|
+------+------------------+-------------------+
|    S1|54.617697571948646|2024-10-19 05:20:46|
|    S1|53.662382263870484|2024-10-19 05:20:36|
+------+------------------+-------------------+



In [37]:
q0bis.stop()

> NOTE: there was no need to
> * create a logic table on top of the streaming data frame with `temperature_sdf.createTempView("TemperatureSensorEvent")`
> * drop such a logic table with `spark.catalog.dropTempView("TemperatureSensorEvent")`

## Q1 - Avg

the average of all the temperature observation for each sensor up to the last event received

### the SQL style

In [38]:
# create (or replace) a logic table on top of the streaming data frame;
# createOrReplaceTempView lets this cell be re-run without failing because the
# view already exists
temperature_sdf.createOrReplaceTempView("TemperatureSensorEvent") # this time we will not clean it up, because we use it in the next queries

**NOTE**: the following query gives *intentionally* an error

In [39]:
from pyspark.sql.utils import AnalysisException

query_string = """
SELECT SENSOR, AVG(temperature) 
FROM TemperatureSensorEvent
GROUP BY SENSOR
"""

# write your query in SQL, register it and start it.
# Demonstration: no output mode is set, and starting a streaming aggregation
# without a watermark this way is rejected. The expected AnalysisException is
# caught and printed so the message is shown without aborting "Run All".
try:
    q1 = (spark.sql(query_string)
                         .writeStream
                         .format("memory")
                         .queryName("sinkTable")
                         .start())
except AnalysisException as error:
    print("AnalysisException:", error)

AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;;
Aggregate [SENSOR#277], [SENSOR#277, avg(temperature#278) AS avg(temperature)#537]
+- SubqueryAlias temperaturesensorevent
   +- Project [value#275.sensor AS sensor#277, value#275.temperature AS temperature#278, value#275.ts AS ts#279]
      +- Project [from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)) AS value#275]
         +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@746b7573, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@4cf4f932, org.apache.spark.sql.util.CaseInsensitiveStringMap@24f684c4, [key#261, value#262, topic#263, partition#264, offset#265L, timestamp#266, timestampType#267], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@69bd9244,kafka,List(),None,List(),None,Map(startingOffsets -> earliest, subscribe -> TemperatureSensorEvent, kafka.bootstrap.servers -> kafka:9092),None), kafka, [key#254, value#255, topic#256, partition#257, offset#258L, timestamp#259, timestampType#260]


The **append output mode** (i.e., the default one) is not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark, we need to use the **complete output mode**.

In [40]:
query_string = """
SELECT SENSOR, AVG(temperature) 
FROM TemperatureSensorEvent
GROUP BY SENSOR
"""

# Same SQL aggregation as in the failing cell, now started in complete output
# mode: the whole result table is rewritten into "sinkTable" at every trigger.
q1 = (
    spark.sql(query_string)
    .writeStream
    .start(
        format="memory",
        outputMode="complete",  # <-- CHANGE HERE
        queryName="sinkTable",
    )
)

In [45]:
q1.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [46]:
# look up the most recent results
spark.sql("SELECT * FROM sinkTable").show() # without ORDER BY TS DESC because the result in the table is already only the most recent

+------+-----------------+
|SENSOR| avg(temperature)|
+------+-----------------+
|    S1|40.67552232364142|
+------+-----------------+



**NOTE**: if the cell above gives an empty result, wait 10 seconds and run it again. The very first execution may take time, especially if you have already ingested many temperature events in Kafka. Here we are querying the sink table, and it may be empty because the first execution is still running.

In [47]:
# clean up
q1.stop()

### The DataFrame style

In [48]:
# DataFrame style: build the same per-sensor average with the DataFrame API,
# register it and start it (complete mode, results in the memory table "sinkTable")
q1bis = (temperature_sdf 
                     .groupBy("sensor")
                     .avg()  # no column list given: the result contains avg(temperature) only
                     .writeStream
                     .format("memory")
                     .outputMode("complete") 
                     .queryName("sinkTable")
                     .start())

In [51]:
q1bis.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [52]:
# look up the most recent results
spark.sql("SELECT * FROM sinkTable").show() # without ORDER BY TS DESC because the result in the table is already only the most recent

+------+-----------------+
|sensor| avg(temperature)|
+------+-----------------+
|    S1|44.00893301636756|
+------+-----------------+



In [53]:
# clean up
q1bis.stop()

## Q2 - Logical Sliding Window

The average temperature observed by each sensor in the last 4 seconds

MEMO: the average should change as soon as a new event is received

**Not supported**

## Q3 - Logical Tumbling Window

The average temperature of the last 30 seconds every 30 seconds (was 4 seconds in EPL)

NOTE: this query is not possible in the SQL style

In [54]:
# Tumbling window: per-sensor average temperature over consecutive,
# non-overlapping 30-second event-time windows, written in complete mode to
# the in-memory table "sinkTable".
q3 = (
    temperature_sdf
    .groupBy(window("TS", windowDuration="30 seconds"), "SENSOR")
    .avg("TEMPERATURE")
    .writeStream
    .queryName("sinkTable")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [58]:
q3.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [59]:
spark.sql("SELECT * FROM sinkTable ORDER BY window DESC").show(5,False) # NOTE: here we order by window instead of ordering by timestamp, because the result has a window column and no ts column

+------------------------------------------+------+-----------------+
|window                                    |SENSOR|avg(TEMPERATURE) |
+------------------------------------------+------+-----------------+
|[2024-10-19 05:22:00, 2024-10-19 05:22:30]|S1    |54.99742509749736|
|[2024-10-19 05:21:30, 2024-10-19 05:22:00]|S1    |55.12030199212136|
|[2024-10-19 05:21:00, 2024-10-19 05:21:30]|S1    |54.49941117079944|
|[2024-10-19 05:20:30, 2024-10-19 05:21:00]|S1    |54.19547244233558|
|[2024-10-19 05:20:00, 2024-10-19 05:20:30]|S1    |20.07691564946969|
+------------------------------------------+------+-----------------+
only showing top 5 rows



In [60]:
q3.stop()

## Q4 - Physical Sliding Window

The moving average of the last 4 temperature events

**Not supported**

## Q5 - Physical Tumbling Window

The moving average of the last 4 temperature events every 4 events 

**Not supported**

## Q6 - Logical Hopping Window

The average temperature of the last 1 minute (was 4 seconds in EPL) every 30 seconds (was 2 seconds in EPL)

In [61]:
# Hopping window: per-sensor average temperature over 1-minute event-time
# windows that advance every 30 seconds, so consecutive windows overlap.
# Results go, in complete mode, to the in-memory table "sinkTable".
q6 = (
    temperature_sdf
    .groupBy(
        window("TS", windowDuration="1 minutes", slideDuration="30 seconds"),
        "SENSOR",
    )
    .avg("TEMPERATURE")
    .writeStream
    .queryName("sinkTable")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [63]:
q6.status

{'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

In [64]:
spark.sql("SELECT * FROM sinkTable ORDER BY window DESC").show(6,False) # NOTE: here we order by window instead of ordering by timestamp

+------------------------------------------+------+------------------+
|window                                    |SENSOR|avg(TEMPERATURE)  |
+------------------------------------------+------+------------------+
|[2024-10-19 05:22:30, 2024-10-19 05:23:30]|S1    |54.037300434331954|
|[2024-10-19 05:22:00, 2024-10-19 05:23:00]|S1    |54.61337523223119 |
|[2024-10-19 05:21:30, 2024-10-19 05:22:30]|S1    |55.058863544809356|
|[2024-10-19 05:21:00, 2024-10-19 05:22:00]|S1    |54.809856581460394|
|[2024-10-19 05:20:30, 2024-10-19 05:21:30]|S1    |54.34744180656751 |
|[2024-10-19 05:20:00, 2024-10-19 05:21:00]|S1    |37.136194045902634|
+------------------------------------------+------+------------------+
only showing top 6 rows



In [65]:
q6.stop()

## Q7 - Stream-to-Stream Join

In EPL, at this point we moved on to the pattern matching part required to satisfy the information need, i.e., "find every smoke event followed by a temperature event whose temperature is above 50 °C within 2 minutes."

Spark Structured Streaming does not support EPL's operator `->` (which reads as *followed by*). We need to use a stream-to-stream join.

In [66]:
# Left side of the join: only the "smoke detected" events. sensor/ts get a
# Smoke suffix so they cannot clash with the temperature columns.
smoke_events = (
    smoke_sdf
    .filter("smoke = True")
    .select(
        col("sensor").alias("sensorSmoke"),
        "smoke",
        col("ts").alias("tsSmoke"),
    )
)

# Right side of the join: only the readings above 50, with sensor/ts renamed
# using a Temp suffix.
high_temperature_events = (
    temperature_sdf
    .filter("temperature > 50")
    .select(
        col("sensor").alias("sensorTemp"),
        "temperature",
        col("ts").alias("tsTemp"),
    )
)

Join with event-time constraints (made easy by the renaming done above).

In [68]:
# Stream-to-stream join: pair each smoke event with every high-temperature
# event of the same sensor whose timestamp falls strictly after the smoke
# event and less than 2 minutes later. No `how` argument is passed, so the
# default join type applies.
# NOTE(review): neither input declares a watermark in the visible code, so the
# state kept for the join is presumably never purged; fine for a demo, confirm
# before reusing this on a long-running stream.
join_sdf = (smoke_events.join(
  high_temperature_events, expr("""
    (sensorTemp == sensorSmoke) AND
    (tsTemp > tsSmoke ) AND
    (tsTemp < tsSmoke + interval 2 minute )
    """
    )))

In [69]:
# Register and start the join query; matched (smoke, temperature) pairs are
# appended to the in-memory table "sinkTable".
q7 = join_sdf.writeStream.start(
    format="memory",
    queryName="sinkTable",
)

In [76]:
q7.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

**IMPORTANT** To detect fire, run the appropriate cells in the data generators.

In [None]:
spark.sql("SELECT * FROM sinkTable ORDER BY tsTemp DESC").show(20,False) # note, I change ts in tsTemp

If you are following carefully the instructions the answer should be empty because we are sending temperature around 55°C, but there is no `smoke==true`, yet.

Go to the `smoke_sensor_simulator` notebook and start sending `smoke==true`.

In [78]:
spark.sql("SELECT * FROM sinkTable ORDER BY tsTemp DESC").show(20,False) # note, I change ts in tsTemp

+-----------+-----+-------------------+----------+------------------+-------------------+
|sensorSmoke|smoke|tsSmoke            |sensorTemp|temperature       |tsTemp             |
+-----------+-----+-------------------+----------+------------------+-------------------+
|S1         |true |2024-10-19 05:24:39|S1        |53.92623421948255 |2024-10-19 05:25:26|
|S1         |true |2024-10-19 05:24:49|S1        |53.92623421948255 |2024-10-19 05:25:26|
|S1         |true |2024-10-19 05:25:00|S1        |53.92623421948255 |2024-10-19 05:25:26|
|S1         |true |2024-10-19 05:25:10|S1        |53.92623421948255 |2024-10-19 05:25:26|
|S1         |true |2024-10-19 05:25:20|S1        |53.92623421948255 |2024-10-19 05:25:26|
|S1         |true |2024-10-19 05:24:39|S1        |55.41546969937239 |2024-10-19 05:25:16|
|S1         |true |2024-10-19 05:25:00|S1        |55.41546969937239 |2024-10-19 05:25:16|
|S1         |true |2024-10-19 05:25:10|S1        |55.41546969937239 |2024-10-19 05:25:16|
|S1       

Let's have a look at the query progress.

In [79]:
from IPython.display import clear_output
import json

# Live monitor: once per second print the latest progress report and the
# status of q7, then clear the output so the next report replaces it.
# The loop runs until the kernel is interrupted; the interrupt is caught so
# the cell finishes cleanly instead of ending with a KeyboardInterrupt traceback.
try:
    while True:
        print(json.dumps(q7.lastProgress, indent=4))
        print(q7.status)
        time.sleep(1)
        clear_output(wait=True)  # wait=True defers clearing until new output arrives
except KeyboardInterrupt:
    print("Monitoring stopped.")
    

{
    "id": "5156d3a4-ac4a-467d-ba54-e27cd25d12ac",
    "runId": "c30fce59-3ef9-43ed-af86-1b8596fa2539",
    "name": "sinkTable",
    "timestamp": "2024-10-19T05:26:23.462Z",
    "batchId": 15,
    "numInputRows": 2,
    "inputRowsPerSecond": 0.27037988373665,
    "processedRowsPerSecond": 0.20540207456095305,
    "durationMs": {
        "addBatch": 9621,
        "getBatch": 0,
        "latestOffset": 1,
        "queryPlanning": 77,
        "triggerExecution": 9737,
        "walCommit": 19
    },
    "stateOperators": [
        {
            "numRowsTotal": 46,
            "numRowsUpdated": 2,
            "memoryUsedBytes": 333640,
            "customMetrics": {
                "loadedMapCacheHitCount": 6000,
                "loadedMapCacheMissCount": 0,
                "stateOnCurrentVersionSizeBytes": 46128
            }
        }
    ],
    "sources": [
        {
            "description": "KafkaV2[Subscribe[SmokeSensorEvent]]",
            "startOffset": {
                "SmokeSen

KeyboardInterrupt: 

To interrupt the execution of the cell, press the square icon in the toolbar or choose *Interrupt Kernel* from the *Kernel* drop-down menu.

#### Discussion

> This query is equivalent to the EPL pattern `every a = SmokeSensorEvent(smoke=true) -> every TemperatureSensorEvent(temperature > 50, sensor=a.sensor) where timer:within(1 min)`. 
>
> Do not expect the same performance! It is evaluated as a relational join. Spark Structured Streaming lacks the specialized data structures of Esper.
>
> **It does not tame the torrent effect**, but this is expected! 
>
> Spark Structured Streaming is a Data Stream Management System meant to tame *flow that you cannot stop*

Even if Q8 consumes Q7's results, we can stop Q7 because we only need the streaming DataFrame `join_sdf`. We do not need Q7 to keep writing its results to the in-memory table.

In [80]:
# Stop the Q7 streaming query; `join_sdf` itself stays defined and is reused by Q8 below.
q7.stop()

## Q8 - Count FireEvent

We are very close to the solution of the running example: we "just" need to count the number of events generated by the previous query over a hopping window of 1 minute that slides every 30 seconds (it was a sliding window of 10 seconds in EPL).

So let's count the results of Q7. 

**NOTE**: the following two queries *intentionally* raise errors.

In [81]:
from pyspark.sql.utils import AnalysisException

# Intentional failure #1: count the joined events per sensor over a hopping
# window and ask for Complete output mode. Spark rejects the plan when the
# query is started, because a stream-stream join only supports Append mode.
# The exception is caught and printed so that it is still shown to the reader
# without aborting a "Restart & Run All".
try:
    q8 = (join_sdf
          .groupBy(window("tsTemp", "1 minutes", "30 seconds"), "sensorTemp")
          .count()
          .writeStream
          .outputMode("complete")
          .format("memory")
          .queryName("sinkTable")
          .start())
except AnalysisException as err:
    print("AnalysisException:", err)

AnalysisException: Join between two streaming DataFrames/Datasets is not supported in Complete output mode, only in Append output mode;;
Join Inner, (((sensorTemp#18937 = sensorSmoke#18929) AND (tsTemp#18941 > tsSmoke#18933)) AND (tsTemp#18941 < cast(tsSmoke#18933 + 2 minutes as timestamp)))
:- Project [sensorSmoke#18929, smoke#182, ts#183 AS tsSmoke#18933]
:  +- Project [sensor#181 AS sensorSmoke#18929, smoke#182, ts#183]
:     +- Filter (smoke#182 = true)
:        +- Project [value#179.sensor AS sensor#181, value#179.smoke AS smoke#182, value#179.ts AS ts#183]
:           +- Project [from_json(StructField(sensor,StringType,true), StructField(smoke,BooleanType,true), StructField(ts,TimestampType,true), cast(value#166 as string), Some(Etc/UTC)) AS value#179]
:              +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@3e7077f8, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@240222e, org.apache.spark.sql.util.CaseInsensitiveStringMap@7098eff9, [key#165, value#166, topic#167, partition#168, offset#169L, timestamp#170, timestampType#171], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@69bd9244,kafka,List(),None,List(),None,Map(startingOffsets -> earliest, subscribe -> SmokeSensorEvent, kafka.bootstrap.servers -> kafka:9092),None), kafka, [key#158, value#159, topic#160, partition#161, offset#162L, timestamp#163, timestampType#164]
+- Project [sensorTemp#18937, temperature#278, ts#279 AS tsTemp#18941]
   +- Project [sensor#277 AS sensorTemp#18937, temperature#278, ts#279]
      +- Filter (temperature#278 > cast(50 as double))
         +- Project [value#275.sensor AS sensor#277, value#275.temperature AS temperature#278, value#275.ts AS ts#279]
            +- Project [from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)) AS value#275]
               +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@746b7573, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@4cf4f932, org.apache.spark.sql.util.CaseInsensitiveStringMap@24f684c4, [key#261, value#262, topic#263, partition#264, offset#265L, timestamp#266, timestampType#267], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@69bd9244,kafka,List(),None,List(),None,Map(startingOffsets -> earliest, subscribe -> TemperatureSensorEvent, kafka.bootstrap.servers -> kafka:9092),None), kafka, [key#254, value#255, topic#256, partition#257, offset#258L, timestamp#259, timestampType#260]


Indeed, join between two streaming DataFrames/Datasets is not supported in Complete output mode, only in Append output mode. So let's try to use append mode.

In [82]:
from pyspark.sql.utils import AnalysisException

# Intentional failure #2: same aggregation, now in Append output mode.
# Spark rejects it because a streaming aggregation in Append mode requires a
# watermark, and none has been declared yet. The exception is caught and
# printed so that it is still shown to the reader without aborting a
# "Restart & Run All".
try:
    q8 = (join_sdf
          .groupBy(window("tsTemp", "1 minutes", "30 seconds"), "sensorTemp")
          .count()
          .writeStream
          .outputMode("append")  # <-- CHANGE HERE
          .format("memory")
          .queryName("sinkTable")
          .start())
except AnalysisException as err:
    print("AnalysisException:", err)

AnalysisException: Append output mode not supported when there are streaming aggregations on streaming DataFrames/DataSets without watermark;;
Aggregate [window#106994, sensorTemp#18937], [window#106994 AS window#106985, sensorTemp#18937, count(1) AS count#106993L]
+- Filter ((tsTemp#18941 >= window#106994.start) AND (tsTemp#18941 < window#106994.end))
   +- Expand [ArrayBuffer(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) as double) = (cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) THEN (CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) + cast(1 as bigint)) ELSE CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) END + cast(0 as bigint)) - cast(2 as bigint)) * 30000000) + 0), LongType, TimestampType), end, precisetimestampconversion((((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) as double) = (cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) THEN (CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) + cast(1 as bigint)) ELSE CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) END + cast(0 as bigint)) - cast(2 as bigint)) * 30000000) + 0) + 60000000), LongType, TimestampType)), sensorSmoke#18929, smoke#182, tsSmoke#18933, sensorTemp#18937, temperature#278, tsTemp#18941), ArrayBuffer(named_struct(start, precisetimestampconversion(((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) as double) = (cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) THEN (CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) + cast(1 as bigint)) ELSE 
CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) END + cast(1 as bigint)) - cast(2 as bigint)) * 30000000) + 0), LongType, TimestampType), end, precisetimestampconversion((((((CASE WHEN (cast(CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) as double) = (cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) THEN (CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) + cast(1 as bigint)) ELSE CEIL((cast((precisetimestampconversion(tsTemp#18941, TimestampType, LongType) - 0) as double) / cast(30000000 as double))) END + cast(1 as bigint)) - cast(2 as bigint)) * 30000000) + 0) + 60000000), LongType, TimestampType)), sensorSmoke#18929, smoke#182, tsSmoke#18933, sensorTemp#18937, temperature#278, tsTemp#18941)], [window#106994, sensorSmoke#18929, smoke#182, tsSmoke#18933, sensorTemp#18937, temperature#278, tsTemp#18941]
      +- Join Inner, (((sensorTemp#18937 = sensorSmoke#18929) AND (tsTemp#18941 > tsSmoke#18933)) AND (tsTemp#18941 < cast(tsSmoke#18933 + 2 minutes as timestamp)))
         :- Project [sensorSmoke#18929, smoke#182, ts#183 AS tsSmoke#18933]
         :  +- Project [sensor#181 AS sensorSmoke#18929, smoke#182, ts#183]
         :     +- Filter (smoke#182 = true)
         :        +- Project [value#179.sensor AS sensor#181, value#179.smoke AS smoke#182, value#179.ts AS ts#183]
         :           +- Project [from_json(StructField(sensor,StringType,true), StructField(smoke,BooleanType,true), StructField(ts,TimestampType,true), cast(value#166 as string), Some(Etc/UTC)) AS value#179]
         :              +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@3e7077f8, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@240222e, org.apache.spark.sql.util.CaseInsensitiveStringMap@7098eff9, [key#165, value#166, topic#167, partition#168, offset#169L, timestamp#170, timestampType#171], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@69bd9244,kafka,List(),None,List(),None,Map(startingOffsets -> earliest, subscribe -> SmokeSensorEvent, kafka.bootstrap.servers -> kafka:9092),None), kafka, [key#158, value#159, topic#160, partition#161, offset#162L, timestamp#163, timestampType#164]
         +- Project [sensorTemp#18937, temperature#278, ts#279 AS tsTemp#18941]
            +- Project [sensor#277 AS sensorTemp#18937, temperature#278, ts#279]
               +- Filter (temperature#278 > cast(50 as double))
                  +- Project [value#275.sensor AS sensor#277, value#275.temperature AS temperature#278, value#275.ts AS ts#279]
                     +- Project [from_json(StructField(sensor,StringType,true), StructField(temperature,DoubleType,true), StructField(ts,TimestampType,true), cast(value#262 as string), Some(Etc/UTC)) AS value#275]
                        +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@746b7573, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@4cf4f932, org.apache.spark.sql.util.CaseInsensitiveStringMap@24f684c4, [key#261, value#262, topic#263, partition#264, offset#265L, timestamp#266, timestampType#267], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@69bd9244,kafka,List(),None,List(),None,Map(startingOffsets -> earliest, subscribe -> TemperatureSensorEvent, kafka.bootstrap.servers -> kafka:9092),None), kafka, [key#254, value#255, topic#256, partition#257, offset#258L, timestamp#259, timestampType#260]


Append output mode is not supported when there are streaming aggregations on streaming DataFrames/DataSets without a watermark. Indeed, **the streaming join can produce out-of-order results**, so Spark needs a watermark to know when a window is complete and can be emitted.

Let's add a watermark, then.

In [83]:
# Q8: count the Q7 matches per sensor over a 1-minute window that slides every
# 30 seconds. The watermark on `tsTemp` is what allows the streaming
# aggregation to be written in Append output mode.
q8 = (
    join_sdf
    .withWatermark("tsTemp", "2 minutes")  # <-- CHANGE HERE (eventTime, delayThreshold)
    .groupBy(window("tsTemp", "1 minutes", "30 seconds"), "sensorTemp")
    .count()
    .writeStream
    .outputMode("append")
    .format("memory")
    .queryName("sinkTable")
    .start()
)

NOTE: 2 minutes is the maximum delay that the join can introduce, given the way we declared it: a temperature event and the smoke event it matches cannot be more than 2 minutes apart.

**THUS, BE PATIENT! YOU ARE NOT GOING TO GET ANY RESULT BEFORE 2 MINUTES HAVE PASSED**

In [84]:
# Show the five most recent windows with their per-sensor counts, untruncated
spark.sql("SELECT * FROM sinkTable ORDER BY window DESC").show(n=5, truncate=False)

+------------------------------------------+----------+-----+
|window                                    |sensorTemp|count|
+------------------------------------------+----------+-----+
|[2024-10-19 05:29:00, 2024-10-19 05:30:00]|S1        |72   |
|[2024-10-19 05:28:30, 2024-10-19 05:29:30]|S1        |72   |
|[2024-10-19 05:28:00, 2024-10-19 05:29:00]|S1        |72   |
|[2024-10-19 05:27:30, 2024-10-19 05:28:30]|S1        |72   |
|[2024-10-19 05:27:00, 2024-10-19 05:28:00]|S1        |72   |
+------------------------------------------+----------+-----+
only showing top 5 rows



As expected, each 1-minute window counts 72 results: 6 `temperature>50` events per window × 6 `smoke=true` events per minute × the 2 minutes covered by the join condition.

In [85]:
# Stop the Q8 streaming query now that its results have been inspected.
q8.stop()