In [2]:
# Create the streaming_df to read from the input directory
# Print the schema

In [1]:
# Start the SparkSession
from pyspark.sql import SparkSession

# Configure the builder one step at a time, then create (or reuse) the session.
session_builder = SparkSession.builder.appName("Streaming Process Files")
session_builder = session_builder.config("spark.streaming.stopGracefullyOnShutdown", True)
session_builder = session_builder.master("local[*]")  # run locally on all available cores
spark = session_builder.getOrCreate()

# Bare last expression: the notebook displays the session object.
spark

In [2]:
# Create Batch code
# Read the JSON input as a static DataFrame, then explode the 'devices'
# list/array so each element lands on its own row. Schema and rows are
# printed after both steps.
from pyspark.sql.functions import explode

# Static (non-streaming) read of every JSON file in the input directory.
batch_reader = spark.read.format("json")
batch_df = batch_reader.load("data/02_reading_from_files/input/")
batch_df.printSchema()
batch_df.show(truncate=True)

# New column 'data_devices' holds one element of data.devices per row.
devices_element = explode("data.devices")
batch_exploded_df = batch_df.withColumn("data_devices", devices_element)
batch_exploded_df.printSchema()
batch_exploded_df.show(truncate=True)

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)

+----------+--------------------+--------------------+-----------+--------------+--------------------+
|customerId|                data|             eventId|eventOffset|eventPublisher|           eventTime|
+----------+--------------------+--------------------+-----------+--------------+--------------------+
|   CI00103|{[{D001, C, ERROR...|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|
+----------+-----------

In [3]:
# Flatten the exploded 'data_devices' column into top-level columns
# and show the resulting rows
from pyspark.sql.functions import col

# Struct fields to promote, in the order they are appended to the frame.
device_fields = ("deviceId", "measure", "status", "temperature")

batch_flattened_df = batch_exploded_df
for field_name in device_fields:
    # Copy data_devices.<field> into a top-level column of the same name.
    batch_flattened_df = batch_flattened_df.withColumn(
        field_name, col("data_devices." + field_name)
    )

# The nested source columns are no longer needed once their fields are promoted.
batch_flattened_df = batch_flattened_df.drop("data", "data_devices")

batch_flattened_df.show(truncate=True)

+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|customerId|             eventId|eventOffset|eventPublisher|           eventTime|deviceId|measure| status|temperature|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D001|      C|  ERROR|         15|
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D002|      C|SUCCESS|         16|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+



In [2]:
# Create Stream code

# Turn on schema inference for streaming sources so that no explicit schema
# has to be supplied to the reader below.
spark.conf.set("spark.sql.streaming.schemaInference", True)

# File-source options used here:
#   cleanSource        - what to do with processed files: noop / archive / delete
#   sourceArchiveDir   - path where the archived files are stored
#   maxFilesPerTrigger - number of files taken as input per trigger
stream_reader = spark.readStream.format("json").options(
    cleanSource="archive",
    sourceArchiveDir="arch_dir",
    maxFilesPerTrigger=1,
)
streaming_df = stream_reader.load("data/02_reading_from_files/input/")

streaming_df.printSchema()

# Explode the 'devices' list/array: each element gets its own row,
# stored in the new 'data_devices' column; then print the schema
from pyspark.sql.functions import explode

devices_element = explode("data.devices")
stream_exploded_df = streaming_df.withColumn("data_devices", devices_element)
stream_exploded_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)

In [3]:
# Flatten the exploded 'data_devices' column into top-level columns
# and print the resulting schema
from pyspark.sql.functions import col

# Struct fields to promote, in the order they are appended to the frame.
device_fields = ("deviceId", "measure", "status", "temperature")

stream_flattened_df = stream_exploded_df
for field_name in device_fields:
    # Copy data_devices.<field> into a top-level column of the same name.
    stream_flattened_df = stream_flattened_df.withColumn(
        field_name, col("data_devices." + field_name)
    )

# The nested source columns are no longer needed once their fields are promoted.
stream_flattened_df = stream_flattened_df.drop("data", "data_devices")

stream_flattened_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)



In [None]:
# Write the output to console sink

# Keep a handle on the running query instead of discarding it: the cell blocks
# in awaitTermination(), and without the handle there is no way to stop the
# query if the cell is interrupted.
console_query = (
    stream_flattened_df
    .writeStream
    .format("console")
    .outputMode("append")
    # Documented spelling of the option key is 'checkpointLocation'.
    .option("checkpointLocation", "checkpoint_dir")
    .start()
)

try:
    # Blocks until the query terminates or the cell is interrupted.
    console_query.awaitTermination()
finally:
    # Runs on normal termination, on failure and on a kernel interrupt, so the
    # query is not left running in the background when the cell is re-run.
    console_query.stop()

In [None]:
# console output: 

[I 2025-03-06 13:20:42.795 ServerApp] Saving file at /spark-streaming/02_reading_from_files.ipynb
25/03/06 13:21:05 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
-------------------------------------------
Batch: 0
-------------------------------------------
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|customerId|             eventId|eventOffset|eventPublisher|           eventTime|deviceId|measure| status|temperature|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D001|      C|  ERROR|         15|
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D002|      C|SUCCESS|         16|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+

[I 2025-03-06 13:21:49.594 ServerApp] Saving file at /spark-streaming/02_reading_from_files.ipynb
[I 2025-03-06 13:22:17.122 ServerApp] Saving file at /spark-streaming/02_reading_from_files.ipynb
[I 2025-03-06 13:22:29.842 ServerApp] Copying 'spark-streaming/data/02_reading_from_files/samples/device_02.json' to '/spark-streaming/data/02_reading_from_files/input'
25/03/06 13:23:26 WARN FileStreamSource$SourceFileArchiver: Fail to move file:/home/jupyter/spark-streaming/data/02_reading_from_files/input/device_01.json to file:/home/jupyter/spark-streaming/arch_dir/home/jupyter/spark-streaming/data/02_reading_from_files/input/device_01.json / skip moving file.
-------------------------------------------
Batch: 1
-------------------------------------------
-------------------------------------------
Batch: 1
-------------------------------------------
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|customerId|             eventId|eventOffset|eventPublisher|           eventTime|deviceId|measure| status|temperature|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|   CI00108|aa90011f-3967-496...|      10003|        device|2023-01-05 11:13:...|    D004|      C|SUCCESS|         16|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+

+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|customerId|             eventId|eventOffset|eventPublisher|           eventTime|deviceId|measure| status|temperature|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|   CI00108|aa90011f-3967-496...|      10003|        device|2023-01-05 11:13:...|    D004|      C|SUCCESS|         16|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+

[I 2025-03-06 13:24:14.107 ServerApp] Saving file at /spark-streaming/02_reading_from_files.ipynb



In [None]:
# Write the output in csv format

# NOTE: per the Spark file-sink documentation, "path" is an output directory,
# so "device_data.csv" is expected to be a folder of part files rather than a
# single CSV file.
csv_query = (
    stream_flattened_df
    .writeStream
    .format("csv")
    .outputMode("append")
    .option("path", "data/02_reading_from_files/output/device_data.csv")
    # Use a checkpoint directory dedicated to this query. Sharing
    # "checkpoint_dir" with the console-sink query would make this query resume
    # from that query's recorded progress and skip files it already consumed.
    .option("checkpointLocation", "checkpoint_dir_csv")
    .start()
)

try:
    # Blocks until the query terminates or the cell is interrupted.
    csv_query.awaitTermination()
finally:
    # Runs on normal termination, on failure and on a kernel interrupt, so the
    # query is not left running in the background when the cell is re-run.
    csv_query.stop()