In [1]:
# Build (or reuse) the local Spark session used to read from Kafka
from pyspark.sql import SparkSession

# NOTE(review): the Kafka connector coordinates below are pinned to Scala 2.12 / 3.1.2;
# presumably they have to match the installed Spark build -- confirm before upgrading.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Read from Kafka")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .getOrCreate()
)

# Leave the session as the last expression so the notebook renders its summary
spark

In [None]:
# Kafka Sample Input
# {"eventId": "ba2ea9f4-a5d9-434e-8e4d-1c80c2d4b456", "eventOffset": 10000, "eventPublisher": "device", "customerId": "CI00119", "data": {"devices": []}, "eventTime": "2023-01-05 11:13:53.643364"}
# {"eventId": "e3cb26d3-41b2-49a2-84f3-0156ed8d7502", "eventOffset": 10001, "eventPublisher": "device", "customerId": "CI00103", "data": {"devices": [{"deviceId": "D001", "temperature": 15, "measure": "C", "status": "ERROR"}, {"deviceId": "D002", "temperature": 16, "measure": "C", "status": "SUCCESS"}]}, "eventTime": "2023-01-05 11:13:53.643364"}


In [2]:
# Define the streaming source: subscribe to the Kafka topic, starting from the earliest offsets

kafka_source_options = {
    "kafka.bootstrap.servers": "ed-kafka:29092",
    "subscribe": "device-data",
    "startingOffsets": "earliest",
}

raw_kafka_df = spark.readStream.format("kafka").options(**kafka_source_options).load()


In [3]:
# Print the schema of the raw Kafka source DataFrame (key and value arrive as binary)

raw_kafka_df.printSchema()
# show() is left disabled: per the notes further down, viewing rows this way requires
# swapping readStream for read (i.e. a batch DataFrame).
# raw_kafka_df.show(truncate=False)

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [4]:
# Cast the Kafka `value` column from binary to string.
# Note: this rebinds raw_kafka_df, so later cells see `value` as a string column.

from pyspark.sql.functions import expr

value_as_string = expr("cast(value as string)")
raw_kafka_df = raw_kafka_df.withColumn("value", value_as_string)

raw_kafka_df.printSchema()
# raw_kafka_df.show(truncate=False)

root
 |-- key: binary (nullable = true)
 |-- value: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
# Schema of the JSON payload carried in the Kafka `value` column
from pyspark.sql.types import StringType, StructField, StructType, ArrayType, LongType

# One device reading inside data.devices
device_reading_schema = (
    StructType()
    .add("deviceId", StringType(), True)
    .add("measure", StringType(), True)
    .add("status", StringType(), True)
    .add("temperature", LongType(), True)
)

# The `data` envelope: a nullable array of nullable device readings
data_schema = StructType().add("devices", ArrayType(device_reading_schema, True), True)

# Top-level event; eventTime is kept as a string here rather than parsed to a timestamp
json_schema = (
    StructType()
    .add("customerId", StringType(), True)
    .add("data", data_schema, True)
    .add("eventId", StringType(), True)
    .add("eventOffset", LongType(), True)
    .add("eventPublisher", StringType(), True)
    .add("eventTime", StringType(), True)
)

In [6]:
# Parse the JSON string in `value` with json_schema and promote its fields to top-level columns.
# Note: this rebinds raw_kafka_df and removes `value`, so re-running only this cell fails;
# re-run from the Kafka read cell instead.

from pyspark.sql.functions import from_json, col

parsed_payload = from_json(col("value"), json_schema).alias("value_json")
raw_kafka_df = raw_kafka_df.select(parsed_payload).select("value_json.*")

# To view the data as well as the schema, place a sample JSON file and change readStream to read
raw_kafka_df.printSchema()
# raw_kafka_df.show(truncate=False)

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [7]:
# Explode data.devices (an array of device readings) so each reading becomes its own row

from pyspark.sql.functions import explode

# explode() emits no row for an empty or null array, so events without devices drop out here
stream_exploded_df = raw_kafka_df.select("*", explode("data.devices").alias("data_devices"))

In [8]:
# Print the schema of stream_exploded_df: the original columns plus the new `data_devices` struct.
# To inspect rows as well, place a sample JSON file and change readStream to read.
stream_exploded_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- data_devices: struct (nullable = true)
 |    |-- deviceId: string (nullable = true)
 |    |-- measure: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- temperature: long (nullable = true)



In [9]:
# Flatten the exploded frame: lift each device field to a top-level column,
# then discard the nested `data` and `data_devices` structs

from pyspark.sql.functions import col

stream_flattened_df = (
    stream_exploded_df
    .select(
        "*",
        *[
            col(f"data_devices.{field_name}").alias(field_name)
            for field_name in ("deviceId", "measure", "status", "temperature")
        ],
    )
    .drop("data", "data_devices")
)

In [10]:
# Print the schema of stream_flattened_df: event columns followed by the flattened device fields.
# To inspect rows as well, place a sample JSON file and change readStream to read.
stream_flattened_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)



In [None]:
# Write the flattened stream to the console sink to inspect the output.
# awaitTermination() blocks this cell until the streaming query stops or fails.

(
    stream_flattened_df
    .writeStream
    .format("console")
    .outputMode("append")
    # Use the documented option key `checkpointLocation`; the earlier spelling
    # ("checkPointLocation") is only honoured where option keys are matched case-insensitively.
    .option("checkpointLocation", "checkpoint_dir")
    .start()
    .awaitTermination()
)

In [None]:
# Console Output

Batch: 0
-------------------------------------------
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+
|customerId|eventId|eventOffset|eventPublisher|eventTime|deviceId|measure|status|temperature|
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+

-------------------------------------------
Batch: 1
-------------------------------------------
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|customerId|             eventId|eventOffset|eventPublisher|           eventTime|deviceId|measure| status|temperature|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D001|      C|  ERROR|         15|
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D002|      C|SUCCESS|         16|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
