In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql.functions  import from_unixtime
from pyspark.sql.functions  import to_date
from pyspark.sql import Row
from pyspark.sql.functions import to_json, struct
from pyspark.sql import functions as F

In [0]:
# Schema of the vehicle JSON document: five fields, each nullable by default.
jsonschema = StructType([
    StructField("id", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("rpm", IntegerType()),
    StructField("speed", IntegerType()),
    StructField("kms", IntegerType()),
])

In [0]:
# Explicit starting position for the Kafka source, written as JSON in the form
# topic -> {partition: offset}. It can be used to restart reading from a chosen
# offset, provided the data at that offset is still available in Kafka.
offset = """
  {
  "VehicleDetails":{"0": 1}
  }
"""

print(offset)

In [0]:
# Open a streaming reader on the VehicleDetails topic, starting at the latest offsets.
# The Kafka source only forwards "kafka."-prefixed options to the consumer, so a bare
# "group.id" option is silently ignored; Spark generates a unique consumer group per
# query. "groupIdPrefix" applies the intended name as the prefix of that generated id,
# which stays safe when several queries read from this same DataFrame.
kafkaDF = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "10.1.0.13:9092,10.1.0.11:9092")
    .option("subscribe", "VehicleDetails")
    .option("groupIdPrefix", "Cookbook-demo")
    .option("startingOffsets", "latest")
    .load()
)

In [0]:
# Confirm the DataFrame is a streaming one and show the schema of the Kafka DataFrame.
print(kafkaDF.isStreaming)
# printSchema() writes the schema to stdout itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line after the schema.
kafkaDF.printSchema()

In [0]:
# display(kafkaDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)"))

In [0]:
# Kafka delivers key and value as binary. from_json needs a string column,
# so both are cast to STRING before any JSON parsing happens.
newkafkaDF = kafkaDF.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
)

In [0]:
# Parse the JSON text in `value` into a struct column named vehiclejson, using jsonschema
# (fields: id, timestamp, rpm, speed, kms).
newkafkaDF = newkafkaDF.withColumn(
    "vehiclejson",
    F.from_json(F.col("value"), schema=jsonschema),
)

In [0]:
# Keep the raw key/value and expand the vehiclejson struct into top-level columns.
kafkajsonDF = newkafkaDF.select(
    "key",
    "value",
    "vehiclejson.*",
)

In [0]:
#you can run the below command to view the column values
#display(kafkajsonDF)

In [0]:
# Stream the parsed vehicle columns into a Delta table in append mode.
# The Delta table lives at dbfs:/Vehiclechkpoint_Delta and the stream checkpoint at
# dbfs:/Vehiclechkpoint_Demo/. Both use the default mount point available in the
# Databricks cluster; you can use your own mount point instead, and mounting an
# external ADLS Gen-2 file system is the recommended setup.
(
    kafkajsonDF.selectExpr("id", "timestamp", "rpm", "speed", "kms")
    .writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", "dbfs:/Vehiclechkpoint_Demo/")
    .option("mergeSchema", "true")
    .start("dbfs:/Vehiclechkpoint_Delta")
)

In [0]:
%sql
-- Register the Delta data stored at dbfs:/Vehiclechkpoint_Delta/ as the table VehicleDetails_Delta.
-- IF NOT EXISTS means an already-registered table is left as it is when this cell is re-run.
CREATE TABLE IF NOT EXISTS VehicleDetails_Delta
USING DELTA
LOCATION "dbfs:/Vehiclechkpoint_Delta/"

In [0]:
%sql
-- Row count of the Delta table; alternative ad-hoc queries are kept commented out below.
--select * from VehicleDetails order by timestamp desc limit 20 --2021-03-03T01:32:36.100+0000
--select * from VehicleDetails_Delta limit 10
-- NOTE(review): the trailing "1200" looks like a count noted on an earlier run, not a fixed expected value.
select count(*) from VehicleDetails_Delta--1200


count(1)
2000


In [0]:
# Create the folder dbfs:/VehiclechData/parquetFiles/ in the default DBFS location
# to hold the Parquet files.
dbutils.fs.mkdirs("dbfs:/VehiclechData/parquetFiles/")

In [0]:
# List the contents of dbfs:/VehiclechData.
dbutils.fs.ls("dbfs:/VehiclechData")

In [0]:
# Second sink: stream the same five columns as Parquet files into
# dbfs:/VehiclechData/parquetFiles. The query is named "veh_details" and keeps its own
# checkpoint under dbfs:/Vehiclechkpoint_Demo_Parquet1/.
# awaitTermination() is not called, so the cell returns the StreamingQuery without blocking.
(
    kafkajsonDF.selectExpr("id", "timestamp", "rpm", "speed", "kms")
    .writeStream.format("parquet")
    .queryName("veh_details")
    .option("checkpointLocation", "dbfs:/Vehiclechkpoint_Demo_Parquet1/")
    .start("dbfs:/VehiclechData/parquetFiles")
)

In [0]:
%fs ls dbfs:/VehiclechData/parquetFiles

path,name,size
dbfs:/VehiclechData/parquetFiles/_spark_metadata/,_spark_metadata/,0
dbfs:/VehiclechData/parquetFiles/part-00000-01bbb09a-e774-488c-8534-7d96ec8e7b77-c000.snappy.parquet,part-00000-01bbb09a-e774-488c-8534-7d96ec8e7b77-c000.snappy.parquet,1657
dbfs:/VehiclechData/parquetFiles/part-00000-03206e9c-9811-4fe2-b185-ba176e739627-c000.snappy.parquet,part-00000-03206e9c-9811-4fe2-b185-ba176e739627-c000.snappy.parquet,1692
dbfs:/VehiclechData/parquetFiles/part-00000-0614e2d2-4167-4f68-8436-147ade5d78de-c000.snappy.parquet,part-00000-0614e2d2-4167-4f68-8436-147ade5d78de-c000.snappy.parquet,3743
dbfs:/VehiclechData/parquetFiles/part-00000-06604ed0-1416-4f46-a0f7-62e59dc28524-c000.snappy.parquet,part-00000-06604ed0-1416-4f46-a0f7-62e59dc28524-c000.snappy.parquet,1692
dbfs:/VehiclechData/parquetFiles/part-00000-17dfcf25-b469-4f1a-b20e-41d1af482242-c000.snappy.parquet,part-00000-17dfcf25-b469-4f1a-b20e-41d1af482242-c000.snappy.parquet,3765
dbfs:/VehiclechData/parquetFiles/part-00000-31c35cee-70fb-4776-9cc7-40e463f7dbc5-c000.snappy.parquet,part-00000-31c35cee-70fb-4776-9cc7-40e463f7dbc5-c000.snappy.parquet,1808
dbfs:/VehiclechData/parquetFiles/part-00000-3ba692b2-9c79-4dcf-87a0-d001c26f2d4a-c000.snappy.parquet,part-00000-3ba692b2-9c79-4dcf-87a0-d001c26f2d4a-c000.snappy.parquet,1604
dbfs:/VehiclechData/parquetFiles/part-00000-43363fed-5ecb-4a7f-a373-1e456bc097ba-c000.snappy.parquet,part-00000-43363fed-5ecb-4a7f-a373-1e456bc097ba-c000.snappy.parquet,3783
dbfs:/VehiclechData/parquetFiles/part-00000-45604f0f-482c-4b4d-a106-084c9b5d41fd-c000.snappy.parquet,part-00000-45604f0f-482c-4b4d-a106-084c9b5d41fd-c000.snappy.parquet,3718
