# Download the libraries

In [None]:
! rm -rf jars
!mkdir jars
!wget -q -P jars https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/3.4.1/spark-sql-kafka-0-10_2.12-3.4.1.jar
!wget -q -P jars https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/3.5.1/kafka-clients-3.5.1.jar
!wget -q -P jars https://repo1.maven.org/maven2/org/apache/spark/spark-token-provider-kafka-0-10_2.12/3.4.1/spark-token-provider-kafka-0-10_2.12-3.4.1.jar
!wget -q -P jars https://repo1.maven.org/maven2/org/scala-lang/scala-library/2.12.18/scala-library-2.12.18.jar
!wget -q -P jars https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.11.1/commons-pool2-2.11.1.jar

# Set up SparkSession

In [3]:
import os
from pyspark.sql import SparkSession

In [4]:
base_dir = os.getcwd() + '/jars'

spark = (SparkSession.builder
    .master('local[*]')
    .appName('Spark Structured Streaming example with Kafka')
    .config("spark.jars", 
            base_dir + '/kafka-clients-3.5.1.jar' + "," + 
            base_dir +'/spark-sql-kafka-0-10_2.12-3.4.1.jar' + "," + 
            base_dir + '/spark-token-provider-kafka-0-10_2.12-3.4.1.jar' + "," + 
            base_dir + '/scala-library-2.12.18.jar' + "," + 
            base_dir + '/commons-pool2-2.11.1.jar')
    .getOrCreate())

spark

# Define the schema for our data

In [5]:
from pyspark.sql.types import *

In [6]:
schema = StructType([
    StructField("VP", StructType([
      StructField("desi", StringType()),
      StructField("dir", StringType()),
      StructField("oper", IntegerType()),
      StructField("veh", IntegerType()),
      StructField("tst", TimestampType()),
      StructField("tsi", LongType()),
      StructField("spd", DoubleType()),
      StructField("hdg", IntegerType()),
      StructField("lat", DoubleType()),
      StructField("long", DoubleType()),
      StructField("acc", DoubleType()),
      StructField("dl", IntegerType()),
      StructField("odo", StringType()),
      StructField("drst", StringType()),
      StructField("oday", DateType()),
      StructField("jrn", IntegerType()),
      StructField("line", IntegerType()),
      StructField("start", StringType()),
      StructField("loc", StringType()),
      StructField("stop", LongType()),
      StructField("route", StringType()),
      StructField("occu", IntegerType())
    ]))
])

# Initialize the stream

In [7]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

We will read the data from the topic `vehicle-positions` in the Kafka cluster

In [8]:
kafka_source_df = (spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("subscribe", "vehicle-positions")
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)"))

In [9]:
from pyspark.sql.functions import *


In [10]:
vehicle_position_df = (kafka_source_df
    .select(from_json(col("value"), schema).alias("json")) 
    .select("json.VP.*"))

In [11]:
vehicle_position_df.printSchema()

root
 |-- desi: string (nullable = true)
 |-- dir: string (nullable = true)
 |-- oper: integer (nullable = true)
 |-- veh: integer (nullable = true)
 |-- tst: timestamp (nullable = true)
 |-- tsi: long (nullable = true)
 |-- spd: double (nullable = true)
 |-- hdg: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- acc: double (nullable = true)
 |-- dl: integer (nullable = true)
 |-- odo: string (nullable = true)
 |-- drst: string (nullable = true)
 |-- oday: date (nullable = true)
 |-- jrn: integer (nullable = true)
 |-- line: integer (nullable = true)
 |-- start: string (nullable = true)
 |-- loc: string (nullable = true)
 |-- stop: long (nullable = true)
 |-- route: string (nullable = true)
 |-- occu: integer (nullable = true)



<h3>Record Sample</h3> 

<code>
{
  "desi": "M1",
  "dir": "1",
  "oper": 50,
  "veh": 302,
  "tst": "2023-08-28T09:57:56Z",
  "tsi": 1693216676,
  "spd": 11.86,
  "hdg": 52,
  "lat": 60.1721918,
  "long": 24.94817722,
  "acc": null,
  "dl": null,
  "odo": null,
  "drst": null,
  "oday": "2023-08-28",
  "start": "12:26",
  "loc": "MAN",
  "stop": 1020603,
  "route": "31M1",
  "occu": 0,
  "seq": 1
}
</code>

### Perform streaming transformations

[window documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.window.html#pyspark.sql.functions.window)<br>
[withWatermark documentation](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.withWatermark.html#pyspark.sql.DataFrame.withWatermark)

In [None]:
vehicle_position_window_df = (vehicle_position_df
      .withWatermark("tst", "10 seconds")
      .groupBy(
        window(col("tst"), "1 minutes", "1 minutes"), #window(timeColumn, windowDuration, slideDuration) (tumbling windows: an input can belong only to one window)
        col("route")
      ).count())

In [None]:
vehicle_position_window_df.printSchema()

# Start the streaming query

## Append mode (no trigger)

In [None]:
query_append = (vehicle_position_window_df.writeStream
    .format("memory")
    .outputMode("update")
    .queryName("query_append")
    .start())

In [None]:
(spark.sql("select * from query_append")
    .filter(col('route') == '2113')
    .show(n = 100, truncate = False))

In [None]:
query_append.stop()

## Append mode (with trigger)

In [None]:
query_append_trigger = (vehicle_position_window_df.writeStream
    .format("memory")
    .trigger(processingTime = '1 minutes') 
    .outputMode("update")
    .queryName("query_append_trigger")
    .start())

In [None]:
(spark.sql("select * from query_append_trigger")
    .filter(col('route') == '2113')
    .show(n = 100, truncate = False))

<h3> Thoughts on Append mode with trigger and without trigger </h3>
If we don't specify the trigger, Structured Streaming will write multiple times the same window, updating its value each time. <br>
As the data continue to come the later count increase in value for the same window.
<br>
<br>
Instead by setting the trigger at 1 minute we can make sure that the rows are written to the output only one time. <br>
In fact Structured Streaming waits for the late data to arrive up until the watermark interval (10 seconds), and then when window < watermark, writes the row to the sink and drops the intermadiate state.
<br>
<br>
To conclude: the append mode without the trigger should not be used in fact the documentation states theat the <b>append mode</b> is supported for only those queries where rows added to the Result Table are never going to change.

## Complete mode

In [None]:
query_complete = (vehicle_position_window_df.writeStream
            .format("memory")
            .outputMode("complete")
            .queryName("query_complete")
            .start())

In [2]:
(spark.sql("select * from query_complete ")
    .filter(col('route') == '2113')
    .show(n = 100, truncate = False))

NameError: name 'spark' is not defined

In [None]:
query_complete.stop()

<h3> Thoughts on Complete mode </h3>
The complete mode never releases the intermediate state of the aggregation which means that the memory consumption will increase indefinitely for a query such this. 

### Thoughts on watermarking output mode and sink

- The watermarking can only be used in **update** and **append** mode. 
- In **complete** mode the old aggregation state is never dropped and we cannot use watermarking.
- Without the watermarking in the **append** mode the old aggregation state is never dropped.
- The sink format **memory** should be used only for debugging purposes and with low volumes of data since the output is entirely stored in the driver's memory
- The sink format **memory** does not support the **update** mode.

# Join Stream-Static

In [12]:
operators_df = spark.read.csv("operators.csv", header = True, inferSchema = True)

In [None]:
operators_df.show(truncate = False)

In [15]:
operators_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- operator_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- city: string (nullable = true)
 |-- address: string (nullable = true)



In [16]:
join_vehicle_operators_df = vehicle_position_df.join(operators_df, vehicle_position_df.oper == operators_df.id, "left_outer")

In [None]:
join_vehicle_operators_df.printSchema()

## Append mode

In [None]:
vehicle_position_operators_group_df = (join_vehicle_operators_df
      .withWatermark("tst", "10 seconds")
      .groupBy(
        window(col("tst"), "1 minutes", "1 minutes"),
        col("oper"),
        col("operator_name") 
      ).count())

In [None]:
vehicle_position_operators_query_append = (vehicle_position_operators_group_df.writeStream
        .format("memory")
        .trigger(processingTime = '1 minutes') 
        .outputMode("append")
        .queryName("vehicle_position_operators_append")
        .start())

In [None]:
(spark.sql("select * from vehicle_position_operators_append")
    .filter(col('oper') == '22')
    .show(truncate = False, n = 30))

In [None]:
vehicle_position_operators_query_append.stop()

## Complete mode

In [None]:
vehicle_position_operators_group_sort_df = (join_vehicle_operators_df
      .groupBy(
        window(col("tst"), "1 minutes", "1 minutes"),
        col("oper"),
        col("operator_name") 
      ).count()
      .orderBy(desc('window')))

In [None]:
vehicle_position_operators_query_complete = (vehicle_position_operators_group_sort_df.writeStream
    .format("memory")
    .trigger(processingTime = '1 minutes') 
    .outputMode("complete")
    .queryName("vehicle_position_operators_complete")
    .start())

In [None]:
(spark.sql("select * from vehicle_position_operators_complete")
    .filter(col('oper') == '22')
    .show(truncate = False, n = 100))

In [None]:
vehicle_position_operators_query_complete.stop()

<h3> Thoughts on sorting </h3>
<b>Sorting operations are supported on streaming Datasets only after an aggregation and in Complete Output Mode</b>. <br>
So we can only use the groupBy in Complete mode and not in Append mode.

<h1>Query mamagement</h1>

[documentation](https://spark.apache.org/docs/3.4.1/api/python/reference/pyspark.ss/api/pyspark.sql.streaming.StreamingQuery.html)

<h3>Get the unique identifier of the running query</h3>

In [None]:
vehicle_position_operators_query_complete.id

<h3>Get the run id of the query</h3>

In [None]:
vehicle_position_operators_query_complete.runId

<h3>Get the name of the auto-generated or user-specified name</h3>

In [None]:
vehicle_position_operators_query_complete.name

<h3>Print detailed explanations of the query</h3>

In [None]:
vehicle_position_operators_query_complete.explain()

<h3>Query recent progress</h3>

In [None]:
vehicle_position_operators_query_complete.recentProgress

<h3>Get the query last progress</h3>

In [None]:
vehicle_position_operators_query_complete.lastProgress

<h3>Get the list of currently active streaming queries</h3>

In [236]:
spark.streams.active

[<pyspark.sql.streaming.query.StreamingQuery at 0xffff71583510>]

<h3>Get the query object from the id</h3>

In [None]:
query = spark.streams.get(vehicle_position_operators_query_complete.id)

In [1]:
query.name

NameError: name 'query' is not defined

<h3>Query active</h3>

In [185]:
vehicle_position_operators_query_complete.isActive

False

<h3>Query status</h3>

In [183]:
vehicle_position_operators_query_complete.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

<h3>Query exception (useful if the query has terminated with an exception)</h3>

In [181]:
vehicle_position_operators_query_complete.exception()

<h3>Await query termination</h3>

In [None]:
#query.awaitTermination([timeout])

<h3>Stop the query</h3>

In [161]:
vehicle_position_operators_query_complete.stop()

<h1>Output Sinks</h1>

<h2>CSV</h2>

In [18]:
vehicle_position_operators_filter_df = (join_vehicle_operators_df
      .withWatermark("tst", "1 seconds")
      .filter(col('oper').isin('22','90','6','30','12','50'))                                  
      .groupBy(
        window(col("tst"), "1 minutes", "1 minutes"),
        col("oper"),
        col("operator_name") 
      ).count())

In [276]:
vehicle_position_operators_filter_mod_df = (vehicle_position_operators_filter_df 
        .withColumn("window",vehicle_position_operators_filter_df.window.cast('string'))
        .coalesce(1)) # writes 1 file csv for each trigger

In [277]:
vehicle_position_operators_filter_mod = (vehicle_position_operators_filter_mod_df.writeStream
        .format("csv")                               # can be "orc", "json", "parquet", etc.
        .option("path", "csv")
        .option("header",True)
        .trigger(processingTime = '1 minutes') 
        .outputMode("append")
        .option("checkpointLocation", "checkpoint")
        .start())

In [253]:
vehicle_position_operators_filter_mod.exception()

In [279]:
vehicle_position_operators_filter_mod.status

{'message': 'Waiting for next trigger',
 'isDataAvailable': True,
 'isTriggerActive': False}

In [255]:
vehicle_position_operators_filter_mod.isActive

True

In [274]:
vehicle_position_operators_filter_mod.stop()

In [265]:
# to delete folders
! rm -rf csv

<h3>Thoughts on CSV sink</h3>
<ul>
    <li> It is not allowed to use the Complete mode so we cannot perform any sorting on the data. </li>
    <li> It is necessary to cast the column window to string because timestamp columns are not allowed when the sink is of type CSV </li>
    <li> It will be generated one CSV file for each trigger </li>
</ul>

<h2>Kafka Topic</h2>

In [41]:
vehicle_position_operators_kafka_df = (join_vehicle_operators_df
      .withWatermark("tst", "1 seconds")
      #.filter(col('oper').isin('22','90','6','30','12','50'))                                  
      .groupBy(
        window(col("tst"), "1 minutes", "1 minutes"),
        col("oper"),
        col("operator_name") 
      ).count()
      .select(col("oper").alias("key"), to_json(struct("window", "count", "operator_name")).alias("value")))  

In [42]:
vehicle_position_operators_kafka_df.printSchema()

root
 |-- key: integer (nullable = true)
 |-- value: string (nullable = true)



In [43]:
# Just to check the output
vehicle_position_operators_kafka_query_memory = (vehicle_position_operators_kafka_df
    .writeStream
    .format("memory")
    .trigger(processingTime = '1 minutes') 
    .outputMode("append")
    .queryName("vehicle_position_operators_kafka_query_memory")
    .start())

In [45]:
(spark.sql("select * from vehicle_position_operators_kafka_query_memory")
    .filter(col('key') == 22)
    .show(truncate = False, n = 100))

+---+-----------------------------------------------------------------------------------------------------------------------------------------------+
|key|value                                                                                                                                          |
+---+-----------------------------------------------------------------------------------------------------------------------------------------------+
|22 |{"window":{"start":"2023-08-28T09:53:00.000Z","end":"2023-08-28T09:54:00.000Z"},"count":4734,"operator_name":"Mueller, Hoeger and Morissette"} |
|22 |{"window":{"start":"2023-08-29T14:45:00.000Z","end":"2023-08-29T14:46:00.000Z"},"count":8160,"operator_name":"Mueller, Hoeger and Morissette"} |
|22 |{"window":{"start":"2023-08-29T14:48:00.000Z","end":"2023-08-29T14:49:00.000Z"},"count":9497,"operator_name":"Mueller, Hoeger and Morissette"} |
|22 |{"window":{"start":"2023-08-28T09:57:00.000Z","end":"2023-08-28T09:58:00.000Z"},"count":5989,"o

In [46]:
vehicle_position_operators_kafka_query_memory.stop()

Before continuing create the topic operators-counts by executing this code on a terminal: 
<br>
<br>
`! docker exec -it broker kafka-topics --create --bootstrap-server broker:9092 --partitions 1 --replication-factor 1 --topic operators-counts`

In [48]:
vehicle_position_operators_kafka_query_kafka = (vehicle_position_operators_kafka_df 
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") 
      .writeStream 
      .format("kafka") 
      .option("kafka.bootstrap.servers", "broker:29092") 
      .option("topic", "topic1") 
      .start())

AnalysisException: checkpointLocation must be specified either through option("checkpointLocation", ...) or SparkSession.conf.set("spark.sql.streaming.checkpointLocation", ...).

In [None]:
vehicle_position_operators_kafka_query_kafka.stop()