In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql.functions  import from_unixtime
from pyspark.sql.functions  import to_date
from pyspark.sql import Row
from pyspark.sql.functions import to_json, struct
from pyspark.sql import functions as F

In [0]:
# Schema of the vehicle JSON payload: one nullable field per attribute.
jsonschema = StructType([
    StructField("id", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("rpm", IntegerType()),
    StructField("speed", IntegerType()),
    StructField("kms", IntegerType()),
])

In [0]:
# Connection settings for the Kafka-enabled Event Hub.
TOPIC = "kafkaenabledhub"
BOOTSTRAP_SERVERS = "kafkaenabledeventhubns.servicebus.windows.net:9093"
GROUP_ID = "$Default"

# SECURITY: the connection string embeds the SharedAccessKey, so it must never be
# committed with the notebook. It is read from a Databricks secret scope at run time.
# The scope/key names below are placeholders -- point them at the secret that stores
# the full "Endpoint=sb://...;SharedAccessKeyName=...;SharedAccessKey=..." string.
# Any key that was previously hardcoded in this cell should be rotated.
EH_CONNECTION_STRING = dbutils.secrets.get(
    scope="eventhub-secrets",
    key="kafkaenabledhub-connection-string",
)

# JAAS configuration for SASL/PLAIN: the username is the literal "$ConnectionString"
# and the password is the connection string itself.
EH_SASL = (
    "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required "
    'username="$ConnectionString" '
    'password="' + EH_CONNECTION_STRING + '";'
)

# Read the topic as a structured stream.
# While testing, consider switching "startingOffsets" to "earliest" to read from the
# earliest available offset instead of only new messages.
kafkaDF = (
    spark.readStream
    .format("kafka")
    .option("subscribe", TOPIC)
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", EH_SASL)
    .option("kafka.request.timeout.ms", "60000")
    .option("kafka.session.timeout.ms", "60000")
    .option("kafka.group.id", GROUP_ID)
    .option("failOnDataLoss", "false")
    .option("startingOffsets", "latest")
    .load()
)




In [0]:
# from_json expects a string column, but the Kafka source exposes key and value as
# binary, so both are cast to STRING before any JSON parsing.
binary_columns = ("key", "value")
kafkaDF = kafkaDF.selectExpr(
    *["CAST({} AS STRING)".format(name) for name in binary_columns]
)

In [0]:
# Parse the JSON text in `value` into a struct column named `vehiclejson`
# (fields: id, timestamp, rpm, speed, kms), then flatten that struct so each
# field becomes a top-level column next to the original key and value.
vehicle_struct = F.from_json(F.col("value"), schema=jsonschema)
newkafkaDF = kafkaDF.withColumn("vehiclejson", vehicle_struct)
kafkajsonDF = newkafkaDF.select("key", "value", "vehiclejson.*")

In [0]:
# Uncomment and run the command below to view the column values.
# display(kafkajsonDF)

### Tumbling (non-overlapping) event-time windows

In [0]:
# Count events per vehicle id in 1-minute tumbling windows keyed on the event timestamp.
tumbling_counts = (
    kafkajsonDF
    .groupBy(window("timestamp", "1 minutes"), "id")
    .count()
    .orderBy("window")
)

# Write the aggregate to a Delta location in "complete" output mode.
# The start() call is left as the cell's last expression so the streaming query is
# still the cell result.
(
    tumbling_counts.writeStream
    .format("delta")
    .outputMode("complete")
    .option("truncate", "false")
    .option("checkpointLocation", "dbfs:/Vehiclechkpointkafkaeventhub_Agg_Chkpoint_Tumbling/")
    .option("mergeSchema", "true")
    .start("dbfs:/VehiclechkpointKafkaEventHub_Delta_Agg_Tumbling")
)

In [0]:
%sql
-- Register a table over the Delta location written by the tumbling-window stream.
-- IF NOT EXISTS makes this a no-op when the table is already defined.
CREATE TABLE IF NOT EXISTS VehiclechkpointKafkaEventHub_Delta_Agg_Tumbling
  USING DELTA
  LOCATION "dbfs:/VehiclechkpointKafkaEventHub_Delta_Agg_Tumbling/"

In [0]:
%sql
-- Show the tumbling-window counts, most recent window first.
SELECT *
FROM VehiclechkpointKafkaEventHub_Delta_Agg_Tumbling
ORDER BY Window DESC

window,id,count
"List(2021-03-14T11:02:00.000+0000, 2021-03-14T11:03:00.000+0000)",4ba6e242-7282-4367-b09c-7568e4b12a58,1
"List(2021-03-14T11:02:00.000+0000, 2021-03-14T11:03:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,1
"List(2021-03-14T10:47:00.000+0000, 2021-03-14T10:48:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,1
"List(2021-03-14T10:46:00.000+0000, 2021-03-14T10:47:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,2
"List(2021-03-14T10:46:00.000+0000, 2021-03-14T10:47:00.000+0000)",4ba6e242-7282-4367-b09c-7568e4b12a58,1
"List(2021-03-14T10:45:00.000+0000, 2021-03-14T10:46:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,2
"List(2021-03-14T10:45:00.000+0000, 2021-03-14T10:46:00.000+0000)",4ba6e242-7282-4367-b09c-7568e4b12a58,1


### Sliding (overlapping) event-time windows

In [0]:
# Count events per vehicle id in 2-minute windows that slide every minute, so each
# event falls into two overlapping windows.
sliding_counts = (
    kafkajsonDF
    .groupBy(window("timestamp", "2 minutes", "1 minutes"), "id")
    .count()
    .orderBy("window")
)

# Write the aggregate to a Delta location in "complete" output mode.
# The start() call is left as the cell's last expression so the streaming query is
# still the cell result.
(
    sliding_counts.writeStream
    .format("delta")
    .outputMode("complete")
    .option("truncate", "false")
    .option("checkpointLocation", "dbfs:/Vehiclechkpointkafkaeventhub_Agg_Chkpoint_Overlapping4/")
    .option("mergeSchema", "true")
    .start("dbfs:/VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping4")
)


In [0]:
%sql
-- Register a table over the Delta location written by the sliding-window stream.
-- Note: the table name has no "4" suffix while the location does.
-- To recreate the table, first run:
-- drop  table VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping
CREATE TABLE IF NOT EXISTS VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping
  USING DELTA
  LOCATION "dbfs:/VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping4/"

In [0]:
%sql
-- Sliding-window counts for a single vehicle, most recent window first.
SELECT *
FROM VehiclechkpointKafkaEventHub_Delta_Agg_Overlapping
WHERE id = '7e0d39fd-7251-483c-92d6-7d6bb5cc164e'
ORDER BY window DESC

window,id,count
"List(2021-03-14T11:02:00.000+0000, 2021-03-14T11:04:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,1
"List(2021-03-14T11:01:00.000+0000, 2021-03-14T11:03:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,1
"List(2021-03-14T10:47:00.000+0000, 2021-03-14T10:49:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,2
"List(2021-03-14T10:46:00.000+0000, 2021-03-14T10:48:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,4
"List(2021-03-14T10:45:00.000+0000, 2021-03-14T10:47:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,4
"List(2021-03-14T10:44:00.000+0000, 2021-03-14T10:46:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,2
"List(2021-03-14T10:33:00.000+0000, 2021-03-14T10:35:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,6
"List(2021-03-14T10:32:00.000+0000, 2021-03-14T10:34:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,6
"List(2021-03-14T10:25:00.000+0000, 2021-03-14T10:27:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,8
"List(2021-03-14T10:24:00.000+0000, 2021-03-14T10:26:00.000+0000)",7e0d39fd-7251-483c-92d6-7d6bb5cc164e,8
