In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql.functions  import from_unixtime
from pyspark.sql.functions  import to_date
from pyspark.sql import Row
from pyspark.sql.functions import to_json, struct
from pyspark.sql import functions as F

In [0]:
# Schema of the vehicle data JSON document that is parsed out of the Kafka message value.
# Every field is nullable (the StructField default), the same as StructType().add(name, type).
jsonschema = StructType([
    StructField("id", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("rpm", IntegerType()),
    StructField("speed", IntegerType()),
    StructField("kms", IntegerType()),
])

In [0]:
# Starting-offset specification in the JSON shape {"<topic>": {"<partition>": <offset>}}.
# It can be used to reset the position from which data is read from Kafka, provided
# data at that offset is still available in the Kafka source.
# NOTE(review): the topic key here is "VehicleDetails" and the reader in this notebook
# sets startingOffsets to "latest", so this value is not applied as written - confirm
# the topic name before passing it to the reader.
offset = """
  {
  "VehicleDetails":{"0": 1}
  }
"""

print(offset)

In [0]:
TOPIC = "kafkaenabledhub"
BOOTSTRAP_SERVERS = "kafkaenabledeventhubns.servicebus.windows.net:9093"
# NOTE(review): GROUP_ID is not passed to the reader below, which pins
# kafka.group.id to "POC"; it is kept in case another cell references it.
GROUP_ID = "$Default"

# The Event Hubs connection string embeds the SharedAccessKey, so it must not be
# stored in the notebook. It is read from a Databricks secret scope instead.
# NOTE(review): the scope and key names below are placeholders - create them, or
# point these two constants at the scope/key your workspace already uses.
EH_SECRET_SCOPE = "eventhub"
EH_SECRET_KEY = "kafkaenabledhub-connection-string"
EH_CONNECTION_STRING = dbutils.secrets.get(scope=EH_SECRET_SCOPE, key=EH_SECRET_KEY)

# JAAS configuration for SASL/PLAIN: the username is the literal "$ConnectionString"
# and the password is the full Event Hubs connection string.
EH_SASL = (
    "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required "
    'username="$ConnectionString" '
    'password="' + EH_CONNECTION_STRING + '";'
)

# Read the topic as a stream with Spark Structured Streaming.
# During testing, consider changing startingOffsets to "earliest" to read from the
# earliest available offset.
# NOTE(review): every query started from kafkaDF shares the fixed kafka.group.id "POC";
# Spark's Kafka integration guide warns that concurrent queries using the same group id
# are likely to interfere with each other.
kafkaDF = (
    spark.readStream
    .format("kafka")
    .option("subscribe", TOPIC)
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", EH_SASL)
    .option("kafka.request.timeout.ms", "60000")
    .option("kafka.session.timeout.ms", "60000")
    .option("kafka.group.id", "POC")
    .option("failOnDataLoss", "false")
    .option("startingOffsets", "latest")
    .load()
)




In [0]:
# Confirm that kafkaDF is a streaming DataFrame and show the schema of the Kafka source.
# printSchema() writes the schema itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the schema.
print(kafkaDF.isStreaming)
kafkaDF.printSchema()

In [0]:
# display(kafkaDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)"))

In [0]:
# The Kafka key and value columns arrive as binary. from_json expects a string column,
# so both are cast to string here; the column names stay "key" and "value".
newkafkaDF = kafkaDF.select(
    col("key").cast("string").alias("key"),
    col("value").cast("string").alias("value"),
)

In [0]:
# Parse the JSON text held in "value" with jsonschema and store the result in a new
# struct column named vehiclejson (fields: id, timestamp, rpm, speed, kms).
newkafkaDF = newkafkaDF.withColumn(
    "vehiclejson",
    F.from_json(F.col("value"), jsonschema),
)

In [0]:
# Flatten the parsed struct: keep the raw key/value columns and promote every field
# of vehiclejson to a top-level column.
kafkajsonDF = newkafkaDF.select(
    "key",
    "value",
    "vehiclejson.*",
)

In [0]:
# You can uncomment and run the command below to view the column values.
# display(kafkajsonDF)

key,value,id,timestamp,rpm,speed,kms
,"{""id"": ""a14b41da-51ab-4339-b691-6306ff64635a"", ""timestamp"": ""2021-03-08 18:18:14.545723"", ""rpm"": 22, ""speed"": 91, ""kms"": 632}",a14b41da-51ab-4339-b691-6306ff64635a,2021-03-08T18:18:14.545+0000,22,91,632
,"{""id"": ""bfa42ce7-3cee-404c-a292-0f19463c3b76"", ""timestamp"": ""2021-03-08 18:18:14.683982"", ""rpm"": 50, ""speed"": 92, ""kms"": 733}",bfa42ce7-3cee-404c-a292-0f19463c3b76,2021-03-08T18:18:14.683+0000,50,92,733
,"{""id"": ""f533a177-a46f-4e77-a3e3-a6bb92d907ff"", ""timestamp"": ""2021-03-08 18:18:14.749337"", ""rpm"": 74, ""speed"": 80, ""kms"": 225}",f533a177-a46f-4e77-a3e3-a6bb92d907ff,2021-03-08T18:18:14.749+0000,74,80,225
,"{""id"": ""854d9673-8119-44cc-a953-c5accc51ec26"", ""timestamp"": ""2021-03-08 18:18:14.798157"", ""rpm"": 98, ""speed"": 90, ""kms"": 436}",854d9673-8119-44cc-a953-c5accc51ec26,2021-03-08T18:18:14.798+0000,98,90,436
,"{""id"": ""bea8bb4e-c113-4fb1-9aa3-2ea19da506fd"", ""timestamp"": ""2021-03-08 18:18:14.842561"", ""rpm"": 53, ""speed"": 80, ""kms"": 598}",bea8bb4e-c113-4fb1-9aa3-2ea19da506fd,2021-03-08T18:18:14.842+0000,53,80,598
,"{""id"": ""b403eaa7-4265-43fc-b0af-44d1ec75ae0f"", ""timestamp"": ""2021-03-08 18:18:14.898453"", ""rpm"": 55, ""speed"": 89, ""kms"": 341}",b403eaa7-4265-43fc-b0af-44d1ec75ae0f,2021-03-08T18:18:14.898+0000,55,89,341
,"{""id"": ""277c71eb-5ed2-4440-834c-691211d2a9f9"", ""timestamp"": ""2021-03-08 18:18:14.935919"", ""rpm"": 78, ""speed"": 87, ""kms"": 958}",277c71eb-5ed2-4440-834c-691211d2a9f9,2021-03-08T18:18:14.935+0000,78,87,958
,"{""id"": ""ce60b483-ca50-4650-8db9-3945b7a2b0a1"", ""timestamp"": ""2021-03-08 18:18:15.001484"", ""rpm"": 61, ""speed"": 78, ""kms"": 718}",ce60b483-ca50-4650-8db9-3945b7a2b0a1,2021-03-08T18:18:15.001+0000,61,78,718
,"{""id"": ""c2dc3c55-e03a-43c3-9da8-017d2c0d49b0"", ""timestamp"": ""2021-03-08 18:18:15.045128"", ""rpm"": 49, ""speed"": 78, ""kms"": 539}",c2dc3c55-e03a-43c3-9da8-017d2c0d49b0,2021-03-08T18:18:15.045+0000,49,78,539
,"{""id"": ""ccc3b132-6fe6-4f98-ac91-670a17de2edb"", ""timestamp"": ""2021-03-08 18:18:15.117411"", ""rpm"": 74, ""speed"": 99, ""kms"": 759}",ccc3b132-6fe6-4f98-ac91-670a17de2edb,2021-03-08T18:18:15.117+0000,74,99,759


In [0]:
# Write the streaming data to a Delta table in append mode.
# The Delta table location is dbfs:/VehiclechkpointKafkaEventHub_Delta and the
# checkpoint location is dbfs:/Vehiclechkpointkafkaeventhub_Demo/. Both use the default
# mount point available in the Databricks cluster; you can use your own mount point
# instead (mounting an external ADLS Gen-2 file system is recommended).
# The StreamingQuery handle is kept so that the stream can be stopped later with
# deltaStreamQuery.stop().
deltaStreamQuery = (
    kafkajsonDF
    .selectExpr("id", "timestamp", "rpm", "speed", "kms")
    .writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "dbfs:/Vehiclechkpointkafkaeventhub_Demo/")
    .option("mergeSchema", "true")
    .start("dbfs:/VehiclechkpointKafkaEventHub_Delta")
)

In [0]:
%sql
-- Register a table over the Delta location that the streaming write targets.
-- IF NOT EXISTS leaves an already-registered table untouched when the cell is re-run.
CREATE TABLE IF NOT EXISTS VehicleDetails_KafkaEnabledEventHub_Delta
  USING DELTA
  LOCATION 'dbfs:/VehiclechkpointKafkaEventHub_Delta/'

In [0]:
%sql

-- Preview up to 10 rows of the Delta table.
-- Alternative row count check:
-- SELECT COUNT(*) FROM VehicleDetails_KafkaEnabledEventHub_Delta
SELECT *
FROM VehicleDetails_KafkaEnabledEventHub_Delta
LIMIT 10


id,timestamp,rpm,speed,kms
27b01765-fb5f-4402-bb0e-45186fa90816,2021-03-08T18:29:51.201+0000,3,86,293
ac2a00c2-a80f-4517-bb73-3d10378948fe,2021-03-08T18:29:51.291+0000,56,74,124
cd12ea20-a4f0-4d73-8b8d-cedc1ac1b721,2021-03-08T18:29:51.340+0000,28,98,741
9a026e69-9a18-4c0c-a83e-809758e47587,2021-03-08T18:29:51.401+0000,35,88,343
8aa71269-d97a-4bd8-8a95-5d5ead0861c3,2021-03-08T18:29:51.444+0000,28,93,133
73d698a2-bce0-4800-bc37-a66624a3ba54,2021-03-08T18:29:51.495+0000,56,80,715
86f89c73-1c6d-48b9-934f-9e5aefd0f88d,2021-03-08T18:29:51.539+0000,8,71,190
a585c043-4e39-4d5f-8ce3-a8ab3165420a,2021-03-08T18:29:51.586+0000,72,76,584
fd420dd2-c4fd-4b73-964b-dad56a1f7422,2021-03-08T18:29:51.633+0000,94,90,646
8c7865e2-2569-4a72-aaaf-178f11b7792d,2021-03-08T18:29:51.680+0000,47,77,901


In [0]:
# Create the folder that will hold the parquet files, under the default DBFS location.
dbutils.fs.mkdirs(
    "dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/"
)

In [0]:
# List the parent folder; the listing is the cell's output.
dbutils.fs.ls(
    "dbfs:/VehiclechData_KafkaEnabledEventHub"
)

In [0]:
# Stream the id, timestamp, rpm, speed and kms columns to parquet files under
# dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles (query name: veh_details1).
# To get data into the parquet location you need to stop the writeStream to the Delta
# table first; after that, the input streaming data is saved as parquet files in the
# folder given to start() below.
# Append .awaitTermination() after start(...) to block the cell until the query ends.
(
    kafkajsonDF
    .selectExpr("id", "timestamp", "rpm", "speed", "kms")
    .writeStream
    .format("parquet")
    .queryName("veh_details1")
    .option("checkpointLocation", "dbfs:/VehiclechData_KafkaEnabledEventHub/chkpoint/")
    .start("dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles")
)

In [0]:
# Show the files written to the parquet folder as a table (same listing as `%fs ls`).
display(dbutils.fs.ls("dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles"))

path,name,size
dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/_spark_metadata/,_spark_metadata/,0
dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/part-00000-0a47a21b-e20c-4f3c-9f7b-8a799b38a527-c000.snappy.parquet,part-00000-0a47a21b-e20c-4f3c-9f7b-8a799b38a527-c000.snappy.parquet,1923
dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/part-00000-14f18974-b0da-4860-a951-d0d9403ed8b3-c000.snappy.parquet,part-00000-14f18974-b0da-4860-a951-d0d9403ed8b3-c000.snappy.parquet,1986
dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/part-00000-27887015-7580-438d-8216-7b9cab86fac1-c000.snappy.parquet,part-00000-27887015-7580-438d-8216-7b9cab86fac1-c000.snappy.parquet,1979
dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/part-00000-28d57f5a-b173-4f7e-97f9-ded079c31bce-c000.snappy.parquet,part-00000-28d57f5a-b173-4f7e-97f9-ded079c31bce-c000.snappy.parquet,2035
dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/part-00000-462ec753-13b7-4cf0-b31b-4a54a2bdfd78-c000.snappy.parquet,part-00000-462ec753-13b7-4cf0-b31b-4a54a2bdfd78-c000.snappy.parquet,2061
dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/part-00000-6002fa4b-758e-4cfd-8227-e48819cf7148-c000.snappy.parquet,part-00000-6002fa4b-758e-4cfd-8227-e48819cf7148-c000.snappy.parquet,1692
dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/part-00000-63ee395d-84e3-4b2b-93d1-6be7e5ab4d64-c000.snappy.parquet,part-00000-63ee395d-84e3-4b2b-93d1-6be7e5ab4d64-c000.snappy.parquet,2054
dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/part-00000-6410e834-9f0d-4f89-bbc3-cd5306867dfa-c000.snappy.parquet,part-00000-6410e834-9f0d-4f89-bbc3-cd5306867dfa-c000.snappy.parquet,1982
dbfs:/VehiclechData_KafkaEnabledEventHub/parquetFiles/part-00000-673e93d7-bab9-4a11-afcf-cc9d55b255ec-c000.snappy.parquet,part-00000-673e93d7-bab9-4a11-afcf-cc9d55b255ec-c000.snappy.parquet,1918
