In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql.functions  import from_unixtime
from pyspark.sql.functions  import to_date
from pyspark.sql import Row
from pyspark.sql.functions import to_json, struct
from pyspark.sql import functions as F

In [0]:
# Schema of the vehicle JSON document carried in the Kafka message value.
# Five fields: id, timestamp, rpm, speed and kms.
jsonschema = StructType([
    StructField("id", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("rpm", IntegerType()),
    StructField("speed", IntegerType()),
    StructField("kms", IntegerType()),
])

In [0]:
# Connection settings for the Kafka endpoint of the Event Hubs namespace.
# We read from the event hub "kafkaenabledhub1"; change TOPIC if the producer
# Python script writes to a different Event Hub name.
TOPIC = "kafkaenabledhub1"
BOOTSTRAP_SERVERS = "kafkaenabledeventhubns.servicebus.windows.net:9093"
GROUP_ID = "$Default"

# The Event Hubs connection string contains a shared access key, so it must not
# live in the notebook source. It is fetched at run time from a Databricks
# secret scope instead. The scope/key names below are placeholders: create them
# in your workspace (or adjust the names) before running this cell.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
EH_SECRET_SCOPE = "eventhub"
EH_SECRET_KEY = "kafkaenabledhub1-connection-string"
EH_CONNECTION_STRING = dbutils.secrets.get(scope=EH_SECRET_SCOPE, key=EH_SECRET_KEY)

# JAAS configuration for SASL/PLAIN: the username is the literal string
# "$ConnectionString" and the password is the full connection string
# ("Endpoint=sb://...;SharedAccessKeyName=...;SharedAccessKey=...").
EH_SASL = (
    "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required "
    'username="$ConnectionString" '
    'password="' + EH_CONNECTION_STRING + '";'
)

# Read the stream with Spark Structured Streaming.
# Consider .option("startingOffsets", "earliest") to read from the earliest
# available offset during testing.
kafkaDF = (
    spark.readStream
    .format("kafka")
    .option("subscribe", TOPIC)
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", EH_SASL)
    .option("kafka.request.timeout.ms", "60000")
    .option("kafka.session.timeout.ms", "60000")
    .option("kafka.group.id", GROUP_ID)
    .option("failOnDataLoss", "false")
    .option("startingOffsets", "latest")
    .load()
)




In [0]:
# The Kafka source delivers key and value as binary. from_json expects a string
# column, so both are cast to STRING before any JSON parsing.
string_cast_exprs = ["CAST(key AS STRING)", "CAST(value AS STRING)"]
kafkaDF = kafkaDF.selectExpr(*string_cast_exprs)

In [0]:
# Parse the JSON text in `value` into a struct column named vehiclejson
# (fields id, timestamp, rpm, speed, kms as declared in jsonschema) ...
vehicle_struct = from_json(col("value"), schema=jsonschema)
newkafkaDF = kafkaDF.withColumn("vehiclejson", vehicle_struct)

# ... then flatten the struct so each field becomes a top-level column
# next to the original key and value.
kafkajsonDF = newkafkaDF.select("key", "value", "vehiclejson.*")

In [0]:
#you can uncomment and run the below command to view the column values
# display(kafkajsonDF)

### Tumbling (non-overlapping) event-time windows

In [0]:
# Count events per vehicle id in 1-minute windows over the event `timestamp`
# column, ordered by window.
tumbling_counts = (
    kafkajsonDF
    .groupBy(window("timestamp", "1 minutes"), "id")
    .count()
    .orderBy("window")
)

# Write the aggregate to a Delta location in complete output mode.
# NOTE(review): `truncate` is documented as a console-sink option; it presumably
# has no effect on the Delta sink - confirm before removing.
(
    tumbling_counts.writeStream
    .format("delta")
    .outputMode("complete")
    .option("truncate", "false")
    .option("checkpointLocation", "dbfs:/Vehiclechkpointkafkaeventhub_Agg_Chkpoint_Tumbling1/")
    .option("mergeSchema", "true")
    .start("dbfs:/VehiclechkpointKafkaEventHub_Delta_Agg_Tumbling1")
)

In [0]:
%sql
-- Re-register the table over the Delta location written by the tumbling-window stream.
-- NOTE(review): presumably the stream must have committed at least one batch to this path first - confirm.
drop table if exists VehiclechkpointKafkaEventHub_Delta_Agg_Tumbling;
create table if not exists VehiclechkpointKafkaEventHub_Delta_Agg_Tumbling
  using delta
  location "dbfs:/VehiclechkpointKafkaEventHub_Delta_Agg_Tumbling1/"

In [0]:
%sql
-- All per-id counts from the tumbling-window table, most recent window first.
select *
from VehiclechkpointKafkaEventHub_Delta_Agg_Tumbling
order by Window desc

window,id,count
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",a529fb2a-5f7c-4aa1-976c-e4b683b919f2,1
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",67f5187f-d1c0-431b-844a-2f99f49d59e3,1
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",afa56211-c75c-4d76-9f17-2e550cd51fa8,2
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",66ce2ce6-3b23-4f92-a3b7-a7dfd3fe7caf,1
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",45e71b4f-f46e-4e58-959d-e2afa7830875,2
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",19eeda6f-0489-4c68-937a-f91ba866643b,1
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",e6a2c890-16f7-4dc8-b062-33b3e44b5c3a,2
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",a1a4b4ba-7f9c-4058-aa15-b79f3fd5dd7d,2
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",c91ff0d5-bffc-458f-9a15-25fac0fc6cc8,1
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",c7bb7aa9-cc1c-453f-a86e-ca171c710e85,2


In [0]:
%sql
-- Per-window counts for a single vehicle id in the tumbling-window table.
select *
from VehiclechkpointKafkaEventHub_Delta_Agg_Tumbling
where id = 'c7bb7aa9-cc1c-453f-a86e-ca171c710e85'

window,id,count
"List(2021-07-30T01:54:00.000+0000, 2021-07-30T01:55:00.000+0000)",c7bb7aa9-cc1c-453f-a86e-ca171c710e85,10
"List(2021-07-30T01:55:00.000+0000, 2021-07-30T01:56:00.000+0000)",c7bb7aa9-cc1c-453f-a86e-ca171c710e85,2
"List(2021-07-30T01:53:00.000+0000, 2021-07-30T01:54:00.000+0000)",c7bb7aa9-cc1c-453f-a86e-ca171c710e85,9


### Overlapping (sliding) event-time windows

In [0]:
# Stop every streaming query that is currently active in this Spark session
# before starting the next aggregation.
for active_query in spark.streams.active:
    active_query.stop()

In [0]:
# Count events per vehicle id in 2-minute windows that slide every 1 minute
# over the event `timestamp` column, ordered by window. Consecutive windows
# overlap, so one event is counted in more than one window.
sliding_counts = (
    kafkajsonDF
    .groupBy(window("timestamp", "2 minutes", "1 minutes"), "id")
    .count()
    .orderBy("window")
)

# Write the aggregate to a Delta location in complete output mode.
# NOTE(review): `truncate` is documented as a console-sink option; it presumably
# has no effect on the Delta sink - confirm before removing.
(
    sliding_counts.writeStream
    .format("delta")
    .outputMode("complete")
    .option("truncate", "false")
    .option("checkpointLocation", "dbfs:/Vehiclechkpointkafkaeventhub_Agg_Chkpoint_Overlapping5/")
    .option("mergeSchema", "true")
    .start("dbfs:/VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping5")
)


In [0]:
%sql
-- Re-register the table over the Delta location written by the sliding-window stream.
-- NOTE(review): presumably the stream must have committed at least one batch to this path first - confirm.
drop table if exists VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping;
create table if not exists VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping
  using delta
  location "dbfs:/VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping5/"

In [0]:
%sql
-- All per-id counts from the sliding-window table, most recent window first.
SELECT *
FROM VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping
ORDER BY window DESC

window,id,count
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,6
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",fc806137-2796-426b-be35-b5e76db9af16,6
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",14b522d2-e33d-4dd5-a8fe-9a7e637f1cc9,7
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",c872c0b8-4880-48e3-a981-a9430ed2effb,6
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",96d491ff-11ef-4518-82b8-59c0ce521595,6
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",94055835-edea-45a6-95db-0a17d0d3dac6,6
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",544f5760-81bf-478c-b78a-f3cbba753d37,6
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",ff9bff56-b8be-4668-bb81-acc68da68c50,6
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",0efe5ed2-7fc8-4544-b6eb-3016b2faa436,6
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",469cc591-fdc7-4f46-a721-a1a80554c359,6


In [0]:
%sql
-- Same query run again later: the counts for the latest window have grown
-- as the stream processed more events.
SELECT *
FROM VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping
ORDER BY window DESC

window,id,count
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,12
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",fc806137-2796-426b-be35-b5e76db9af16,12
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",14b522d2-e33d-4dd5-a8fe-9a7e637f1cc9,14
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",c872c0b8-4880-48e3-a981-a9430ed2effb,12
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",96d491ff-11ef-4518-82b8-59c0ce521595,12
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",94055835-edea-45a6-95db-0a17d0d3dac6,12
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",544f5760-81bf-478c-b78a-f3cbba753d37,12
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",ff9bff56-b8be-4668-bb81-acc68da68c50,12
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",0efe5ed2-7fc8-4544-b6eb-3016b2faa436,12
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",469cc591-fdc7-4f46-a721-a1a80554c359,12


In [0]:
%sql
-- Per-window counts for a single vehicle id in the sliding-window table,
-- most recent window first.
SELECT *
FROM VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping
WHERE id = 'c2c7cb35-2f97-4fab-ab23-62fe24eca7af'
ORDER BY window DESC

window,id,count
"List(2021-07-30T03:08:00.000+0000, 2021-07-30T03:10:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,2
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,14
"List(2021-07-30T03:06:00.000+0000, 2021-07-30T03:08:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,34
"List(2021-07-30T03:05:00.000+0000, 2021-07-30T03:07:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,28
"List(2021-07-30T03:04:00.000+0000, 2021-07-30T03:06:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,6


##### After inserting records for the time frame 2021-07-30T03:08:30.000+0000

c2c7cb35-2f97-4fab-ab23-62fe24eca7af

In [0]:
%sql
-- Per-window counts for the same vehicle id, queried again after inserting
-- the additional records.
SELECT *
FROM VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping
WHERE id = 'c2c7cb35-2f97-4fab-ab23-62fe24eca7af'
ORDER BY window DESC

window,id,count
"List(2021-07-30T03:08:00.000+0000, 2021-07-30T03:10:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,2
"List(2021-07-30T03:07:00.000+0000, 2021-07-30T03:09:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,14
"List(2021-07-30T03:06:00.000+0000, 2021-07-30T03:08:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,34
"List(2021-07-30T03:05:00.000+0000, 2021-07-30T03:07:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,28
"List(2021-07-30T03:04:00.000+0000, 2021-07-30T03:06:00.000+0000)",c2c7cb35-2f97-4fab-ab23-62fe24eca7af,6
