In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql.functions  import from_unixtime
from pyspark.sql.functions  import to_date
from pyspark.sql import Row
from pyspark.sql.functions import to_json, struct
from pyspark.sql import functions as F
import random

In [None]:
# Schema used to parse the vehicle JSON documents: one field per attribute,
# all nullable (the StructField default).
jsonschema = StructType([
    StructField("id", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("rpm", IntegerType()),
    StructField("speed", IntegerType()),
    StructField("kms", IntegerType()),
])

In [None]:
# Create the root directory that holds the streaming checkpoints.
dbutils.fs.mkdirs("/mnt/Gen2Source/Vehicle_Delta/Chkpnt")

In [None]:
def checkpoint_dir(name=None):
  """Return a checkpoint path under /mnt/Gen2Source/Vehicle_Delta/Chkpnt/.

  Args:
    name: Optional stable identifier for the query (for example the topic
      name). When given, the returned path is deterministic, so the same
      query gets the same checkpoint directory on every start and different
      names never share a directory. When omitted, a random integer in
      [0, 10000] is used as the suffix (the original behaviour); such a
      path is neither guaranteed unique nor reusable after a restart.

  Returns:
    The checkpoint path as a string.
  """
  suffix = random.randint(0, 10000) if name is None else name
  return "/mnt/Gen2Source/Vehicle_Delta/Chkpnt/%s" % str(suffix)
 

In [None]:
# Connection settings for the Kafka endpoint of the Event Hubs namespace.
# The host name is shared by the bootstrap address and the connection string.
_EH_HOST = "kafkaenabledeventhubns.servicebus.windows.net"

# NOTE(review): the SharedAccessKey is embedded in the notebook source;
# consider loading the connection string from a secret store instead.
_EH_CONNECTION_STRING = (
    "Endpoint=sb://" + _EH_HOST + "/"
    ";SharedAccessKeyName=sendreceivekafka"
    ";SharedAccessKey=zzzzzzzz"
)

BOOTSTRAP_SERVERS = _EH_HOST + ":9093"

# JAAS configuration string passed to the Kafka source as kafka.sasl.jaas.config.
EH_SASL = (
    "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required"
    ' username="$ConnectionString"'
    ' password="' + _EH_CONNECTION_STRING + '";'
)

GROUP_ID = "$Default"


In [None]:
# Function to read data from EventHub and write it in Delta format
def append_kafkadata_stream(topic="eventhubsource1", checkpoint_location=None):
  """Start a streaming query that appends one topic's vehicle events to Delta.

  Subscribes to `topic` through the Kafka source configured by
  BOOTSTRAP_SERVERS, EH_SASL and GROUP_ID, parses each message value as JSON
  with `jsonschema`, tags every row with the topic name in a `source` column
  and appends the rows to /mnt/Gen2Source/Vehicle_Delta/.

  Args:
    topic: Name of the topic to subscribe to; also written to `source`.
    checkpoint_location: Optional checkpoint path for the query. Defaults to
      a per-topic directory under /mnt/Gen2Source/Vehicle_Delta/Chkpnt/.

  Returns:
    The query handle returned by `writeStream...start()`.
  """
  # A checkpoint must belong to exactly one query and stay the same across
  # restarts. The previous random suffix (randint(0, 10000)) guaranteed
  # neither: two queries could draw the same directory, a restarted query
  # never found its earlier progress, and re-running the cell silently started
  # a second writer for the same topic.
  if checkpoint_location is None:
    checkpoint_location = "/mnt/Gen2Source/Vehicle_Delta/Chkpnt/%s" % topic

  raw_events = (
      spark.readStream
      .format("kafka")
      .option("subscribe", topic)
      .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)
      .option("kafka.sasl.mechanism", "PLAIN")
      .option("kafka.security.protocol", "SASL_SSL")
      .option("kafka.sasl.jaas.config", EH_SASL)
      .option("kafka.request.timeout.ms", "60000")
      .option("kafka.session.timeout.ms", "60000")
      .option("kafka.group.id", GROUP_ID)
      .option("failOnDataLoss", "false")
      # Only consulted when the query starts without an existing checkpoint.
      .option("startingOffsets", "latest")
      .load()
      .withColumn("source", lit(topic))
  )

  # The Kafka value column is binary; cast it to text before parsing the JSON.
  parsed_events = (
      raw_events
      .selectExpr("CAST(value AS STRING) AS value", "source")
      .withColumn("vehiclejson", from_json(col("value"), schema=jsonschema))
  )

  # Keep the parsed vehicle fields plus the originating topic, in the same
  # column order as before.
  vehicle_events = parsed_events.select(
      "vehiclejson.id",
      "vehiclejson.timestamp",
      "vehiclejson.rpm",
      "vehiclejson.speed",
      "vehiclejson.kms",
      "source",
  )

  query = (
      vehicle_events.writeStream
      .format("delta")
      .outputMode("append")
      .option("checkpointLocation", checkpoint_location)
      .start("/mnt/Gen2Source/Vehicle_Delta/")
  )

  return query

In [None]:
# Start the ingest stream for the first topic and keep the query handle.
query_source1 = append_kafkadata_stream("eventhubsource1")


In [None]:
# Start the ingest stream for the second topic and keep the query handle.
query_source2 = append_kafkadata_stream("eventhubsource2")

In [None]:
%sql
-- Expose the Delta files at the streaming sink location as a table
-- in the Vehicle database.
CREATE DATABASE IF NOT EXISTS Vehicle;

CREATE TABLE IF NOT EXISTS Vehicle.VehicleDetails_Delta
  USING DELTA
  LOCATION "/mnt/Gen2Source/Vehicle_Delta/"

In [None]:
%sql

-- Number of ingested rows per originating topic.
SELECT
  COUNT(*),
  source
FROM Vehicle.VehicleDetails_Delta
GROUP BY source


count(1),source
448,eventhubsource2
400,eventhubsource1


In [None]:
# Continuously updated row count per source, read as a stream from the Delta table.
source_counts = (
    spark.readStream
    .format("delta")
    .table("Vehicle.VehicleDetails_Delta")
    .groupBy("source")
    .count()
    .orderBy("source")
)
display(source_counts)

source,count
eventhubsource1,900
eventhubsource2,800
