## Download the libraries

In [None]:
! rm -rf jars
!mkdir jars
!wget -q -P jars https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/3.4.1/spark-sql-kafka-0-10_2.12-3.4.1.jar
!wget -q -P jars https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/3.5.1/kafka-clients-3.5.1.jar
!wget -q -P jars https://repo1.maven.org/maven2/org/apache/spark/spark-token-provider-kafka-0-10_2.12/3.4.1/spark-token-provider-kafka-0-10_2.12-3.4.1.jar
!wget -q -P jars https://repo1.maven.org/maven2/org/scala-lang/scala-library/2.12.18/scala-library-2.12.18.jar
!wget -q -P jars https://repo1.maven.org/maven2/org/apache/commons/commons-pool2/2.11.1/commons-pool2-2.11.1.jar

## Set up SparkSession

In [2]:
import os
from pyspark.sql import SparkSession

In [4]:
base_dir = os.getcwd() + '/jars'

spark = (SparkSession.builder
        .master('local[*]')
        .appName('Spark Structured Streaming example with Kafka')
        .config("spark.jars", 
                base_dir + '/kafka-clients-3.5.1.jar' + "," + 
                base_dir +'/spark-sql-kafka-0-10_2.12-3.4.1.jar' + "," + 
                base_dir + '/spark-token-provider-kafka-0-10_2.12-3.4.1.jar' + "," + 
                base_dir + '/scala-library-2.12.18.jar' + "," + 
                base_dir + '/commons-pool2-2.11.1.jar')
        .getOrCreate())

spark

## Define the schema for our data

In [7]:
from pyspark.sql.types import *

In [8]:
schema = StructType([
    StructField("VP", StructType([
      StructField("desi", StringType()),
      StructField("dir", StringType()),
      StructField("oper", IntegerType()),
      StructField("veh", IntegerType()),
      StructField("tst", TimestampType()),
      StructField("tsi", LongType()),
      StructField("spd", DoubleType()),
      StructField("hdg", IntegerType()),
      StructField("lat", DoubleType()),
      StructField("long", DoubleType()),
      StructField("acc", DoubleType()),
      StructField("dl", IntegerType()),
      StructField("odo", StringType()),
      StructField("drst", StringType()),
      StructField("oday", DateType()),
      StructField("jrn", IntegerType()),
      StructField("line", IntegerType()),
      StructField("start", StringType()),
      StructField("loc", StringType()),
      StructField("stop", LongType()),
      StructField("route", StringType()),
      StructField("occu", IntegerType())
    ]))
])

## Initialize the stream

In [9]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

In [10]:
lines = (spark.readStream
            .format("kafka")
            .option("kafka.bootstrap.servers", "broker:29092") # localhost:9092 local, broker:29092 docker
            .option("subscribe", "vehicle-positions")
            .option("startingOffsets", "earliest")
            .load()
            .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)"))

In [11]:
from pyspark.sql.functions import *


In [12]:
select = (lines
    .select(from_json(col("value"), schema).alias("json")) 
    .select("json.VP.*"))

In [13]:
select.printSchema()

root
 |-- desi: string (nullable = true)
 |-- dir: string (nullable = true)
 |-- oper: integer (nullable = true)
 |-- veh: integer (nullable = true)
 |-- tst: timestamp (nullable = true)
 |-- tsi: long (nullable = true)
 |-- spd: double (nullable = true)
 |-- hdg: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- acc: double (nullable = true)
 |-- dl: integer (nullable = true)
 |-- odo: string (nullable = true)
 |-- drst: string (nullable = true)
 |-- oday: date (nullable = true)
 |-- jrn: integer (nullable = true)
 |-- line: integer (nullable = true)
 |-- start: string (nullable = true)
 |-- loc: string (nullable = true)
 |-- stop: long (nullable = true)
 |-- route: string (nullable = true)
 |-- occu: integer (nullable = true)



## Perform streaming transformations

In [14]:
df = (select.groupBy(
        window(col("tst"), "10 minutes", "10 minutes"),
        col("route")
      ).count())

In [15]:
df.printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- route: string (nullable = true)
 |-- count: long (nullable = false)



## Start the streaming query

### Append mode 

In [16]:
query_append = (df.writeStream
            .format("memory")
            .outputMode("update")
            .queryName("query_append")
            .start())

In [36]:
(spark.sql("select * from query_append ")
    .filter(col('route') == '2113')
    .show(n = 100, truncate = False))

+------------------------------------------+-----+-----+
|window                                    |route|count|
+------------------------------------------+-----+-----+
|{2023-08-04 12:30:00, 2023-08-04 12:40:00}|2113 |16   |
|{2023-08-04 12:50:00, 2023-08-04 13:00:00}|2113 |533  |
|{2023-08-04 13:00:00, 2023-08-04 13:10:00}|2113 |104  |
|{2023-08-04 13:10:00, 2023-08-04 13:20:00}|2113 |35   |
|{2023-08-04 12:40:00, 2023-08-04 12:50:00}|2113 |169  |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |30   |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |64   |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |96   |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |126  |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |154  |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |181  |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |189  |
+------------------------------------------+-----+-----+



### Complete mode

In [17]:
query_complete = (df.writeStream
            .format("memory")
            .outputMode("complete")
            .queryName("query_complete")
            .start())

In [33]:
(spark.sql("select * from query_complete ")
    .filter(col('route') == '2113')
    .show(n = 100, truncate = False))

+------------------------------------------+-----+-----+
|window                                    |route|count|
+------------------------------------------+-----+-----+
|{2023-08-04 12:30:00, 2023-08-04 12:40:00}|2113 |16   |
|{2023-08-04 12:50:00, 2023-08-04 13:00:00}|2113 |533  |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |189  |
|{2023-08-04 13:00:00, 2023-08-04 13:10:00}|2113 |104  |
|{2023-08-04 13:10:00, 2023-08-04 13:20:00}|2113 |35   |
|{2023-08-04 12:40:00, 2023-08-04 12:50:00}|2113 |169  |
+------------------------------------------+-----+-----+



### Update mode

In [18]:
query_update = (df.writeStream
            .format("memory")
            .outputMode("update")
            .queryName("query_update")
            .start())

In [37]:
(spark.sql("select * from query_update ")
    .filter(col('route') == '2113')
    .show(n = 100, truncate = False))

+------------------------------------------+-----+-----+
|window                                    |route|count|
+------------------------------------------+-----+-----+
|{2023-08-04 12:30:00, 2023-08-04 12:40:00}|2113 |16   |
|{2023-08-04 12:50:00, 2023-08-04 13:00:00}|2113 |533  |
|{2023-08-04 13:00:00, 2023-08-04 13:10:00}|2113 |104  |
|{2023-08-04 13:10:00, 2023-08-04 13:20:00}|2113 |35   |
|{2023-08-04 12:40:00, 2023-08-04 12:50:00}|2113 |169  |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |18   |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |54   |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |84   |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |115  |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |144  |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |172  |
|{2023-08-04 13:40:00, 2023-08-04 13:50:00}|2113 |189  |
+------------------------------------------+-----+-----+



In [39]:
query_append.stop()
query_complete.stop()
query_update.stop()
