# Part 2: Streaming application using Spark Structured Streaming  
In this task, you will implement Spark Structured Streaming to consume the data from task 1 and perform a prediction.    
Important:   
-	This task uses PySpark Structured Streaming with PySpark Dataframe APIs and PySpark ML.  
-	You also need your pipeline model from A2A to make predictions and persist the results.  

1.	Write code to create a SparkSession, which 1) uses four cores with a proper application name; 2) use the Melbourne timezone; 3) ensure a checkpoint location has been set.


In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 pyspark-shell'

# Import SparkConf class into program
from pyspark import SparkConf

# local[*]: run Spark in local mode with as many working processors as logical cores on your machine
# If we want Spark to run locally with 'k' worker threads, we can specify as "local[k]".
master = "local[4]"
# The `appName` field is a name to be shown on the Spark cluster UI page
app_name = "Assignment2B"
# Setup configuration parameters for Spark
spark_conf = SparkConf().setMaster(master).setAppName(app_name) \
                        .set("spark.sql.streaming.checkpointLocation", "checkpoints")

# Import SparkContext and SparkSession classes
from pyspark import SparkContext # Spark
from pyspark.sql import SparkSession # Spark SQL

# Method 1: Using SparkSession
spark = SparkSession.builder.config(conf=spark_conf).config("spark.sql.session.timeZone", "GMT+10").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('ERROR')

from pyspark.sql import functions as F

2.	Write code to define the data schema for the data files, following the data types suggested in the metadata file. Load the static datasets (e.g. building information) into data frames. (You can reuse your code from 2A.)


In [None]:
# Adapted from GPT
from pyspark.sql.types import (
    StructType, StructField,
    IntegerType, StringType, DecimalType, TimestampType
)

# 1. Meters Table
meters_schema = StructType([
    StructField("building_id", IntegerType(), False),
    StructField("meter_type", StringType(), False),   # Char(1) -> StringType
    StructField("ts", TimestampType(), False),
    StructField("value", DecimalType(15, 4), False),
    StructField("row_id", IntegerType(), False)
])

# 2. Buildings Table
buildings_schema = StructType([
    StructField("site_id", IntegerType(), False),
    StructField("building_id", IntegerType(), False),
    StructField("primary_use", StringType(), True),
    StructField("square_feet", IntegerType(), True),
    StructField("floor_count", IntegerType(), True),
    StructField("row_id", IntegerType(), False),
    StructField("year_built", IntegerType(), True),
    StructField("latent_y", DecimalType(6, 4), True),
    StructField("latent_s", DecimalType(6, 4), True),
    StructField("latent_r", DecimalType(6, 4), True)
])

# 3. Weather Table
# weather_schema = StructType([
#     StructField("site_id", StringType(), False),
#     StructField("timestamp", StringType(), False),
#     StructField("air_temperature", StringType(), True),
#     StructField("cloud_coverage", StringType(), True), # Is an Integer, but ends with a ".0", so read as a DecimalType
#     StructField("dew_temperature", StringType(), True),
#     StructField("sea_level_pressure", StringType(), True),
#     StructField("wind_direction", StringType(), True), # Is an Integer, but ends with a ".0", so read as a DecimalType
#     StructField("wind_speed", StringType(), True),
#     StructField("weather_ts", StringType(), False) # new field
# ])

weather_schema = StructType([
    StructField("site_id", StringType(), False),
    StructField("timestamp", TimestampType(), False),
    StructField("air_temperature", DecimalType(5, 3), True),
    StructField("cloud_coverage", DecimalType(5, 3), True), # Is an Integer, but ends with a ".0", so read as a DecimalType
    StructField("dew_temperature", DecimalType(5, 3), True),
    StructField("sea_level_pressure", DecimalType(8, 3), True),
    StructField("wind_direction", DecimalType(5, 3), True), # Is an Integer, but ends with a ".0", so read as a DecimalType
    StructField("wind_speed", DecimalType(5, 3), True),
    StructField("weather_ts", TimestampType(), False) # new field
])

buildings_df = spark.read.csv(
    "data/new_building_information.csv",
    header=True,
    schema=buildings_schema
)

weather_df = spark.read.csv(
    "data/weather.csv",
    header=True,
    schema=weather_schema
)


3.	Using the Kafka topic from the producer in Task 1, ingest the streaming data into Spark Streaming, assuming all data comes in the String format. Except for the 'weather_ts' column, you shall receive it as an Int type. Load the new building information CSV file into a dataframe. Then, the data frames should be transformed into the proper formats following the metadata file schema, similar to assignment 2A.


In [None]:


#configuration
hostip = "192.168.0.6"
topic = 'weather_data'

df_raw = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", f'{hostip}:9092') \
    .option("subscribe", topic) \
    .load()

df_str = df_raw.selectExpr("CAST(value AS STRING) as json_str")

weather_stream = (
    df_str
    .withColumn("data", F.from_json(F.col("json_str"), F.ArrayType(weather_schema)))
    .select(F.explode(F.col("data")).alias("r"))
    .select("r.*")
)


4.	Use a watermark on weather_ts, if data points are received 5 seconds late, discard the data.

In [4]:
weather_stream = weather_stream.withWatermark("weather_ts", '5 seconds')

5.	Perform the necessary transformation you used in A2A. (note: every student may have used different features, feel free to reuse the code you have written in A2A. If you built an end-to-end pipeline, you can ignore this task.) 

In [None]:

# Get global_means, site_month_means, site_means
# weather_df is history, weather_stream is current

# Split timestamp to date, month, time bucket
weather_df = weather_df.withColumn("date", F.to_date("timestamp")).withColumn(
    "time",
    F.when(F.hour("timestamp") <= 5, "0-6h")
     .when(F.hour("timestamp") <= 11, "6-12h")
     .when(F.hour("timestamp") <= 17, "12-18h")
     .when(F.hour("timestamp") <= 23, "18-24h")
).withColumn("month", F.month("timestamp"))

# Choose which columns to impute
impute_cols = [
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
    "sea_level_pressure",
    "wind_direction",
    "wind_speed"
]

# Compute global_means, site_month_means, site_means
global_means = weather_df.select(
    *[F.mean(c).alias(c) for c in impute_cols]
).first().asDict()

site_month_means = weather_df.groupBy("site_id", "month").agg(
    *[F.mean(c).alias(f"{c}_site_month_mean") for c in impute_cols]
)

site_means = weather_df.groupBy("site_id").agg(
    *[F.mean(c).alias(f"{c}_site_mean") for c in impute_cols]
)
    
# Skip Garbage collection
# del site_month_means
# del site_means
# del global_means
# spark.catalog.clearCache()

# Data imputation
# Transform weather_stream
# Split timestamp to date, month, time bucket
weather_stream = weather_stream.withColumn("date", F.to_date("timestamp")).withColumn(
    "time",
    F.when(F.hour("timestamp") <= 5, "0-6h")
     .when(F.hour("timestamp") <= 11, "6-12h")
     .when(F.hour("timestamp") <= 17, "12-18h")
     .when(F.hour("timestamp") <= 23, "18-24h")
).withColumn("month", F.month("timestamp"))


# Step 1: site_id + month
weather_stream = weather_stream.join(site_month_means, on=["site_id", "month"], how="left")
for c in impute_cols:
    weather_stream = weather_stream.withColumn(
        c, F.coalesce(c, F.col(f"{c}_site_month_mean"))
    ).drop(f"{c}_site_month_mean")
    
# Step 2: site_id
weather_stream = weather_stream.join(site_means, on="site_id", how="left")
for c in impute_cols:
    weather_stream = weather_stream.withColumn(
        c, F.coalesce(c, F.col(f"{c}_site_mean"))
    ).drop(f"{c}_site_mean")

# Step 3: global fallback
for c in impute_cols:
    weather_stream = weather_stream.withColumn(
        c, F.coalesce(c, F.lit(global_means[c]))
    )
# Aggregate by time bucket
weather_stream = (
    weather_stream
    .groupBy(
        "site_id", "date", "time", "month",
        F.window("weather_ts", "5 seconds")
    )
    .agg(
        F.mean("air_temperature").cast(DecimalType(5, 3)).alias("air_temperature"),
        F.mean("cloud_coverage").cast(DecimalType(5, 3)).alias("cloud_coverage"),
        F.mean("dew_temperature").cast(DecimalType(5, 3)).alias("dew_temperature"),
        F.mean("sea_level_pressure").cast(DecimalType(8, 3)).alias("sea_level_pressure"),
        F.mean("wind_direction").cast(DecimalType(5, 3)).alias("wind_direction"),
        F.mean("wind_speed").cast(DecimalType(5, 3)).alias("wind_speed")     
    )    
)

# Add custom columns
weather_stream = (
    weather_stream
    .withColumn("dew_depression", F.col("air_temperature") - F.col("dew_temperature"))
    .withColumn("nonideal_temp", (F.col("air_temperature") - 18)**2)
    .drop("air_temperature")
    .drop("dew_temperature")
)

# No need to add median temp and peak-offpeak as our pipeline model later does not use them
feature_df = buildings_df.join(weather_stream, ["site_id"])


6.	Load your pipeline model and perform the following aggregations:  
a)	Print the prediction from your model as a stream comes in.  
b)	Every 7 seconds, print the total energy consumption for each 6-hour interval, aggregated by building, and print 20 records. (Note: This is simulating energy data each day in a week)  
c)	Every 14 seconds, for each site, print the daily total energy consumption.  

In [7]:

# --- 1. Spark + model setup ---
# from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel

model = PipelineModel.load("models/best_model_rmsle")

# --- 2. Apply model ---
predictions = model.transform(feature_df).withColumnRenamed("prediction", "log_power_usage")

checkpoint_dir = os.path.abspath("checkpoints/weather_stream")
os.makedirs(checkpoint_dir, exist_ok=True)

In [8]:
weather_stream.printSchema()

root
 |-- site_id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- cloud_coverage: decimal(5,3) (nullable = true)
 |-- sea_level_pressure: decimal(8,3) (nullable = true)
 |-- wind_direction: decimal(5,3) (nullable = true)
 |-- wind_speed: decimal(5,3) (nullable = true)
 |-- dew_depression: decimal(6,3) (nullable = true)
 |-- nonideal_temp: double (nullable = true)



In [9]:
feature_df.printSchema()

root
 |-- site_id: integer (nullable = true)
 |-- building_id: integer (nullable = true)
 |-- primary_use: string (nullable = true)
 |-- square_feet: integer (nullable = true)
 |-- floor_count: integer (nullable = true)
 |-- row_id: integer (nullable = true)
 |-- year_built: integer (nullable = true)
 |-- latent_y: decimal(6,4) (nullable = true)
 |-- latent_s: decimal(6,4) (nullable = true)
 |-- latent_r: decimal(6,4) (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- cloud_coverage: decimal(5,3) (nullable = true)
 |-- sea_level_pressure: decimal(8,3) (nullable = true)
 |-- wind_direction: decimal(5,3) (nullable = true)
 |-- wind_speed: decimal(5,3) (nullable = true)
 |-- dew_depression: decimal(6,3) (nullable = true)
 |-- nonideal_temp: double (nullable = true)



In [10]:
predictions.printSchema()

root
 |-- site_id: integer (nullable = true)
 |-- building_id: integer (nullable = true)
 |-- primary_use: string (nullable = true)
 |-- square_feet: integer (nullable = true)
 |-- floor_count: integer (nullable = true)
 |-- row_id: integer (nullable = true)
 |-- year_built: integer (nullable = true)
 |-- latent_y: decimal(6,4) (nullable = true)
 |-- latent_s: decimal(6,4) (nullable = true)
 |-- latent_r: decimal(6,4) (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- cloud_coverage: decimal(5,3) (nullable = true)
 |-- sea_level_pressure: decimal(8,3) (nullable = true)
 |-- wind_direction: decimal(5,3) (nullable = true)
 |-- wind_speed: decimal(5,3) (nullable = true)
 |-- dew_depression: decimal(6,3) (nullable = true)
 |-- nonideal_temp: double (nullable = true)
 |-- primary_use_idx

In [11]:
# 6a
# Show live predictions
query_live = (
    predictions
        .select("site_id", "building_id", "window", "log_power_usage")
        .writeStream
        .outputMode("append")
        .format("memory")
        .queryName("live_predictions")
        .start()
)
spark.sql("select * from live_predictions").show()


+-------+-----------+------+---------------+
|site_id|building_id|window|log_power_usage|
+-------+-----------+------+---------------+
+-------+-----------+------+---------------+



In [None]:
# 6b


building_6h = (
    predictions
        .groupBy(
            "building_id",
            "window.start",
            "window.end",
        )
        .agg(F.sum("log_power_usage").alias("total_power_6h"))
)
# --- Print to console every 7 seconds ---
query_building_6h = (
    building_6h
        .writeStream
        .outputMode("update")
        .format("memory")
        .queryName("building_6h")
        .trigger(processingTime="7 seconds")
        .start()
)
spark.sql("select * from building_6h").show()



AnalysisException: Detected pattern of possible 'correctness' issue due to global watermark. The query contains stateful operation which can emit rows older than the current watermark plus allowed late record delay, which are "late rows" in downstream stateful operations and these rows can be discarded. Please refer the programming guide doc for more details. If you understand the possible risk of correctness issue and still need to run the query, you can disable this check by setting the config `spark.sql.streaming.statefulOperator.checkCorrectness.enabled` to false.;
Aggregate [building_id#1, window#779-T5000ms.start, window#779-T5000ms.end], [building_id#1, window#779-T5000ms.start AS start#1235, window#779-T5000ms.end AS end#1236, sum(log_power_usage#1137) AS total_power_6h#1232]
+- Project [site_id#0, building_id#1, primary_use#2, square_feet#3, floor_count#4, row_id#5, year_built#6, latent_y#7, latent_s#8, latent_r#9, date#257, time#268, month#280, window#779-T5000ms, cloud_coverage#795, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, nonideal_temp#829, primary_use_idx#943, site_id_idx#971, primary_use_ohe#1001, site_id_ohe#1035, ... 2 more fields]
   +- Project [site_id#0, building_id#1, primary_use#2, square_feet#3, floor_count#4, row_id#5, year_built#6, latent_y#7, latent_s#8, latent_r#9, date#257, time#268, month#280, window#779-T5000ms, cloud_coverage#795, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, nonideal_temp#829, primary_use_idx#943, site_id_idx#971, primary_use_ohe#1001, site_id_ohe#1035, ... 2 more fields]
      +- Project [site_id#0, building_id#1, primary_use#2, square_feet#3, floor_count#4, row_id#5, year_built#6, latent_y#7, latent_s#8, latent_r#9, date#257, time#268, month#280, window#779-T5000ms, cloud_coverage#795, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, nonideal_temp#829, primary_use_idx#943, site_id_idx#971, primary_use_ohe#1001, site_id_ohe#1035, UDF(struct(primary_use_ohe, primary_use_ohe#1001, site_id_ohe, site_id_ohe#1035, square_feet_double_VectorAssembler_dda2dbfd9182, cast(square_feet#3 as double), floor_count_double_VectorAssembler_dda2dbfd9182, cast(floor_count#4 as double), year_built_double_VectorAssembler_dda2dbfd9182, cast(year_built#6 as double), latent_y_double_VectorAssembler_dda2dbfd9182, cast(latent_y#7 as double), latent_s_double_VectorAssembler_dda2dbfd9182, cast(latent_s#8 as double), latent_r_double_VectorAssembler_dda2dbfd9182, cast(latent_r#9 as double), cloud_coverage_double_VectorAssembler_dda2dbfd9182, cast(cloud_coverage#795 as double), sea_level_pressure_double_VectorAssembler_dda2dbfd9182, cast(sea_level_pressure#799 as double), wind_direction_double_VectorAssembler_dda2dbfd9182, cast(wind_direction#801 as double), wind_speed_double_VectorAssembler_dda2dbfd9182, cast(wind_speed#803 as double), ... 4 more fields)) AS features#1078]
         +- Project [site_id#0, building_id#1, primary_use#2, square_feet#3, floor_count#4, row_id#5, year_built#6, latent_y#7, latent_s#8, latent_r#9, date#257, time#268, month#280, window#779-T5000ms, cloud_coverage#795, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, nonideal_temp#829, primary_use_idx#943, site_id_idx#971, primary_use_ohe#1001, UDF(cast(site_id_idx#971 as double), 0) AS site_id_ohe#1035]
            +- Project [site_id#0, building_id#1, primary_use#2, square_feet#3, floor_count#4, row_id#5, year_built#6, latent_y#7, latent_s#8, latent_r#9, date#257, time#268, month#280, window#779-T5000ms, cloud_coverage#795, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, nonideal_temp#829, primary_use_idx#943, site_id_idx#971, UDF(cast(primary_use_idx#943 as double), 0) AS primary_use_ohe#1001]
               +- Project [site_id#0, building_id#1, primary_use#2, square_feet#3, floor_count#4, row_id#5, year_built#6, latent_y#7, latent_s#8, latent_r#9, date#257, time#268, month#280, window#779-T5000ms, cloud_coverage#795, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, nonideal_temp#829, primary_use_idx#943, UDF(cast(site_id#0 as string)) AS site_id_idx#971]
                  +- Project [site_id#0, building_id#1, primary_use#2, square_feet#3, floor_count#4, row_id#5, year_built#6, latent_y#7, latent_s#8, latent_r#9, date#257, time#268, month#280, window#779-T5000ms, cloud_coverage#795, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, nonideal_temp#829, UDF(cast(primary_use#2 as string)) AS primary_use_idx#943]
                     +- Project [site_id#0, building_id#1, primary_use#2, square_feet#3, floor_count#4, row_id#5, year_built#6, latent_y#7, latent_s#8, latent_r#9, date#257, time#268, month#280, window#779-T5000ms, cloud_coverage#795, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, nonideal_temp#829]
                        +- Join Inner, (site_id#0 = cast(site_id#67 as int))
                           :- Relation [site_id#0,building_id#1,primary_use#2,square_feet#3,floor_count#4,row_id#5,year_built#6,latent_y#7,latent_s#8,latent_r#9] csv
                           +- Project [site_id#67, date#257, time#268, month#280, window#779-T5000ms, cloud_coverage#795, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, nonideal_temp#829]
                              +- Project [site_id#67, date#257, time#268, month#280, window#779-T5000ms, cloud_coverage#795, dew_temperature#797, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, nonideal_temp#829]
                                 +- Project [site_id#67, date#257, time#268, month#280, window#779-T5000ms, air_temperature#793, cloud_coverage#795, dew_temperature#797, sea_level_pressure#799, wind_direction#801, wind_speed#803, dew_depression#816, POWER(cast((air_temperature#793 - cast(18 as decimal(2,0))) as double), cast(2 as double)) AS nonideal_temp#829]
                                    +- Project [site_id#67, date#257, time#268, month#280, window#779-T5000ms, air_temperature#793, cloud_coverage#795, dew_temperature#797, sea_level_pressure#799, wind_direction#801, wind_speed#803, (air_temperature#793 - dew_temperature#797) AS dew_depression#816]
                                       +- Aggregate [site_id#67, date#257, time#268, month#280, window#804-T5000ms], [site_id#67, date#257, time#268, month#280, window#804-T5000ms AS window#779-T5000ms, cast(avg(air_temperature#701) as decimal(5,3)) AS air_temperature#793, cast(avg(cloud_coverage#714) as decimal(5,3)) AS cloud_coverage#795, cast(avg(dew_temperature#727) as decimal(5,3)) AS dew_temperature#797, cast(avg(sea_level_pressure#740) as decimal(8,3)) AS sea_level_pressure#799, cast(avg(wind_direction#753) as decimal(5,3)) AS wind_direction#801, cast(avg(wind_speed#766) as decimal(5,3)) AS wind_speed#803]
                                          +- Project [named_struct(start, knownnullable(precisetimestampconversion(((precisetimestampconversion(weather_ts#75-T5000ms, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(weather_ts#75-T5000ms, TimestampType, LongType) - 0) % 5000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(weather_ts#75-T5000ms, TimestampType, LongType) - 0) % 5000000) + 5000000) ELSE ((precisetimestampconversion(weather_ts#75-T5000ms, TimestampType, LongType) - 0) % 5000000) END) - 0), LongType, TimestampType)), end, knownnullable(precisetimestampconversion((((precisetimestampconversion(weather_ts#75-T5000ms, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(weather_ts#75-T5000ms, TimestampType, LongType) - 0) % 5000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(weather_ts#75-T5000ms, TimestampType, LongType) - 0) % 5000000) + 5000000) ELSE ((precisetimestampconversion(weather_ts#75-T5000ms, TimestampType, LongType) - 0) % 5000000) END) - 0) + 5000000), LongType, TimestampType))) AS window#804-T5000ms, site_id#67, month#280, timestamp#68, air_temperature#701, cloud_coverage#714, dew_temperature#727, sea_level_pressure#740, wind_direction#753, wind_speed#766, weather_ts#75-T5000ms, date#257, time#268]
                                             +- Filter isnotnull(weather_ts#75-T5000ms)
                                                +- Project [site_id#67, month#280, timestamp#68, air_temperature#701, cloud_coverage#714, dew_temperature#727, sea_level_pressure#740, wind_direction#753, coalesce(wind_speed#675, cast(3.5605274 as decimal(9,7))) AS wind_speed#766, weather_ts#75-T5000ms, date#257, time#268]
                                                   +- Project [site_id#67, month#280, timestamp#68, air_temperature#701, cloud_coverage#714, dew_temperature#727, sea_level_pressure#740, coalesce(wind_direction#647, 35.0490762) AS wind_direction#753, wind_speed#675, weather_ts#75-T5000ms, date#257, time#268]
                                                      +- Project [site_id#67, month#280, timestamp#68, air_temperature#701, cloud_coverage#714, dew_temperature#727, coalesce(sea_level_pressure#617, cast(1016.1580380 as decimal(12,7))) AS sea_level_pressure#740, wind_direction#647, wind_speed#675, weather_ts#75-T5000ms, date#257, time#268]
                                                         +- Project [site_id#67, month#280, timestamp#68, air_temperature#701, cloud_coverage#714, coalesce(dew_temperature#585, cast(7.3501582 as decimal(9,7))) AS dew_temperature#727, sea_level_pressure#617, wind_direction#647, wind_speed#675, weather_ts#75-T5000ms, date#257, time#268]
                                                            +- Project [site_id#67, month#280, timestamp#68, air_temperature#701, coalesce(cloud_coverage#551, cast(2.1493059 as decimal(9,7))) AS cloud_coverage#714, dew_temperature#585, sea_level_pressure#617, wind_direction#647, wind_speed#675, weather_ts#75-T5000ms, date#257, time#268]
                                                               +- Project [site_id#67, month#280, timestamp#68, coalesce(air_temperature#515, 14.4181065) AS air_temperature#701, cloud_coverage#551, dew_temperature#585, sea_level_pressure#617, wind_direction#647, wind_speed#675, weather_ts#75-T5000ms, date#257, time#268]
                                                                  +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, cloud_coverage#551, dew_temperature#585, sea_level_pressure#617, wind_direction#647, wind_speed#675, weather_ts#75-T5000ms, date#257, time#268]
                                                                     +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, cloud_coverage#551, dew_temperature#585, sea_level_pressure#617, wind_direction#647, coalesce(wind_speed#471, wind_speed_site_mean#249) AS wind_speed#675, weather_ts#75-T5000ms, date#257, time#268, wind_speed_site_mean#249]
                                                                        +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, cloud_coverage#551, dew_temperature#585, sea_level_pressure#617, wind_direction#647, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, wind_speed_site_mean#249]
                                                                           +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, cloud_coverage#551, dew_temperature#585, sea_level_pressure#617, coalesce(wind_direction#443, wind_direction_site_mean#247) AS wind_direction#647, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, wind_direction_site_mean#247, wind_speed_site_mean#249]
                                                                              +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, cloud_coverage#551, dew_temperature#585, sea_level_pressure#617, wind_direction#443, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, wind_direction_site_mean#247, wind_speed_site_mean#249]
                                                                                 +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, cloud_coverage#551, dew_temperature#585, coalesce(sea_level_pressure#413, sea_level_pressure_site_mean#245) AS sea_level_pressure#617, wind_direction#443, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, sea_level_pressure_site_mean#245, wind_direction_site_mean#247, wind_speed_site_mean#249]
                                                                                    +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, cloud_coverage#551, dew_temperature#585, sea_level_pressure#413, wind_direction#443, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, sea_level_pressure_site_mean#245, wind_direction_site_mean#247, wind_speed_site_mean#249]
                                                                                       +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, cloud_coverage#551, coalesce(dew_temperature#381, dew_temperature_site_mean#243) AS dew_temperature#585, sea_level_pressure#413, wind_direction#443, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, dew_temperature_site_mean#243, sea_level_pressure_site_mean#245, wind_direction_site_mean#247, wind_speed_site_mean#249]
                                                                                          +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, cloud_coverage#551, dew_temperature#381, sea_level_pressure#413, wind_direction#443, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, dew_temperature_site_mean#243, sea_level_pressure_site_mean#245, wind_direction_site_mean#247, wind_speed_site_mean#249]
                                                                                             +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, coalesce(cloud_coverage#347, cloud_coverage_site_mean#241) AS cloud_coverage#551, dew_temperature#381, sea_level_pressure#413, wind_direction#443, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, cloud_coverage_site_mean#241, dew_temperature_site_mean#243, sea_level_pressure_site_mean#245, wind_direction_site_mean#247, wind_speed_site_mean#249]
                                                                                                +- Project [site_id#67, month#280, timestamp#68, air_temperature#515, cloud_coverage#347, dew_temperature#381, sea_level_pressure#413, wind_direction#443, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, cloud_coverage_site_mean#241, dew_temperature_site_mean#243, sea_level_pressure_site_mean#245, wind_direction_site_mean#247, wind_speed_site_mean#249]
                                                                                                   +- Project [site_id#67, month#280, timestamp#68, coalesce(air_temperature#311, air_temperature_site_mean#239) AS air_temperature#515, cloud_coverage#347, dew_temperature#381, sea_level_pressure#413, wind_direction#443, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, air_temperature_site_mean#239, cloud_coverage_site_mean#241, dew_temperature_site_mean#243, sea_level_pressure_site_mean#245, wind_direction_site_mean#247, wind_speed_site_mean#249]
                                                                                                      +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#347, dew_temperature#381, sea_level_pressure#413, wind_direction#443, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, air_temperature_site_mean#239, cloud_coverage_site_mean#241, dew_temperature_site_mean#243, sea_level_pressure_site_mean#245, wind_direction_site_mean#247, wind_speed_site_mean#249]
                                                                                                         +- Join LeftOuter, (site_id#67 = site_id#20)
                                                                                                            :- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#347, dew_temperature#381, sea_level_pressure#413, wind_direction#443, wind_speed#471, weather_ts#75-T5000ms, date#257, time#268]
                                                                                                            :  +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#347, dew_temperature#381, sea_level_pressure#413, wind_direction#443, coalesce(cast(wind_speed#74 as decimal(9,7)), wind_speed_site_month_mean#217) AS wind_speed#471, weather_ts#75-T5000ms, date#257, time#268, wind_speed_site_month_mean#217]
                                                                                                            :     +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#347, dew_temperature#381, sea_level_pressure#413, wind_direction#443, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, wind_speed_site_month_mean#217]
                                                                                                            :        +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#347, dew_temperature#381, sea_level_pressure#413, coalesce(cast(wind_direction#73 as decimal(9,7)), wind_direction_site_month_mean#215) AS wind_direction#443, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, wind_direction_site_month_mean#215, wind_speed_site_month_mean#217]
                                                                                                            :           +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#347, dew_temperature#381, sea_level_pressure#413, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, wind_direction_site_month_mean#215, wind_speed_site_month_mean#217]
                                                                                                            :              +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#347, dew_temperature#381, coalesce(cast(sea_level_pressure#72 as decimal(12,7)), sea_level_pressure_site_month_mean#213) AS sea_level_pressure#413, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, sea_level_pressure_site_month_mean#213, wind_direction_site_month_mean#215, wind_speed_site_month_mean#217]
                                                                                                            :                 +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#347, dew_temperature#381, sea_level_pressure#72, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, sea_level_pressure_site_month_mean#213, wind_direction_site_month_mean#215, wind_speed_site_month_mean#217]
                                                                                                            :                    +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#347, coalesce(cast(dew_temperature#71 as decimal(9,7)), dew_temperature_site_month_mean#211) AS dew_temperature#381, sea_level_pressure#72, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, dew_temperature_site_month_mean#211, sea_level_pressure_site_month_mean#213, wind_direction_site_month_mean#215, wind_speed_site_month_mean#217]
                                                                                                            :                       +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#347, dew_temperature#71, sea_level_pressure#72, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, dew_temperature_site_month_mean#211, sea_level_pressure_site_month_mean#213, wind_direction_site_month_mean#215, wind_speed_site_month_mean#217]
                                                                                                            :                          +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, coalesce(cast(cloud_coverage#70 as decimal(9,7)), cloud_coverage_site_month_mean#209) AS cloud_coverage#347, dew_temperature#71, sea_level_pressure#72, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, cloud_coverage_site_month_mean#209, dew_temperature_site_month_mean#211, sea_level_pressure_site_month_mean#213, wind_direction_site_month_mean#215, wind_speed_site_month_mean#217]
                                                                                                            :                             +- Project [site_id#67, month#280, timestamp#68, air_temperature#311, cloud_coverage#70, dew_temperature#71, sea_level_pressure#72, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, cloud_coverage_site_month_mean#209, dew_temperature_site_month_mean#211, sea_level_pressure_site_month_mean#213, wind_direction_site_month_mean#215, wind_speed_site_month_mean#217]
                                                                                                            :                                +- Project [site_id#67, month#280, timestamp#68, coalesce(cast(air_temperature#69 as decimal(9,7)), air_temperature_site_month_mean#207) AS air_temperature#311, cloud_coverage#70, dew_temperature#71, sea_level_pressure#72, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, air_temperature_site_month_mean#207, cloud_coverage_site_month_mean#209, dew_temperature_site_month_mean#211, sea_level_pressure_site_month_mean#213, wind_direction_site_month_mean#215, wind_speed_site_month_mean#217]
                                                                                                            :                                   +- Project [site_id#67, month#280, timestamp#68, air_temperature#69, cloud_coverage#70, dew_temperature#71, sea_level_pressure#72, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, air_temperature_site_month_mean#207, cloud_coverage_site_month_mean#209, dew_temperature_site_month_mean#211, sea_level_pressure_site_month_mean#213, wind_direction_site_month_mean#215, wind_speed_site_month_mean#217]
                                                                                                            :                                      +- Join LeftOuter, ((site_id#67 = site_id#20) AND (month#280 = month#109))
                                                                                                            :                                         :- Project [site_id#67, timestamp#68, air_temperature#69, cloud_coverage#70, dew_temperature#71, sea_level_pressure#72, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, time#268, month(cast(timestamp#68 as date)) AS month#280]
                                                                                                            :                                         :  +- Project [site_id#67, timestamp#68, air_temperature#69, cloud_coverage#70, dew_temperature#71, sea_level_pressure#72, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, date#257, CASE WHEN (hour(timestamp#68, Some(GMT+10)) <= 5) THEN 0-6h WHEN (hour(timestamp#68, Some(GMT+10)) <= 11) THEN 6-12h WHEN (hour(timestamp#68, Some(GMT+10)) <= 17) THEN 12-18h WHEN (hour(timestamp#68, Some(GMT+10)) <= 23) THEN 18-24h END AS time#268]
                                                                                                            :                                         :     +- Project [site_id#67, timestamp#68, air_temperature#69, cloud_coverage#70, dew_temperature#71, sea_level_pressure#72, wind_direction#73, wind_speed#74, weather_ts#75-T5000ms, to_date(timestamp#68, None, Some(GMT+10), false) AS date#257]
                                                                                                            :                                         :        +- EventTimeWatermark weather_ts#75: timestamp, 5 seconds
                                                                                                            :                                         :           +- Project [r#65.site_id AS site_id#67, r#65.timestamp AS timestamp#68, r#65.air_temperature AS air_temperature#69, r#65.cloud_coverage AS cloud_coverage#70, r#65.dew_temperature AS dew_temperature#71, r#65.sea_level_pressure AS sea_level_pressure#72, r#65.wind_direction AS wind_direction#73, r#65.wind_speed AS wind_speed#74, r#65.weather_ts AS weather_ts#75]
                                                                                                            :                                         :              +- Project [r#65]
                                                                                                            :                                         :                 +- Generate explode(data#61), false, [r#65]
                                                                                                            :                                         :                    +- Project [json_str#59, from_json(ArrayType(StructType(StructField(site_id,StringType,false),StructField(timestamp,TimestampType,false),StructField(air_temperature,DecimalType(5,3),true),StructField(cloud_coverage,DecimalType(5,3),true),StructField(dew_temperature,DecimalType(5,3),true),StructField(sea_level_pressure,DecimalType(8,3),true),StructField(wind_direction,DecimalType(5,3),true),StructField(wind_speed,DecimalType(5,3),true),StructField(weather_ts,TimestampType,false)),true), json_str#59, Some(GMT+10)) AS data#61]
                                                                                                            :                                         :                       +- Project [cast(value#46 as string) AS json_str#59]
                                                                                                            :                                         :                          +- StreamingRelationV2 org.apache.spark.sql.kafka010.KafkaSourceProvider@4e528bc6, kafka, org.apache.spark.sql.kafka010.KafkaSourceProvider$KafkaTable@526fce40, [kafka.bootstrap.servers=192.168.0.6:9092, subscribe=weather_data], [key#45, value#46, topic#47, partition#48, offset#49L, timestamp#50, timestampType#51], StreamingRelation DataSource(org.apache.spark.sql.SparkSession@41d95ac5,kafka,List(),None,List(),None,Map(kafka.bootstrap.servers -> 192.168.0.6:9092, subscribe -> weather_data),None), kafka, [key#38, value#39, topic#40, partition#41, offset#42L, timestamp#43, timestampType#44]
                                                                                                            :                                         +- Aggregate [site_id#20, month#109], [site_id#20, month#109, avg(air_temperature#22) AS air_temperature_site_month_mean#207, avg(cloud_coverage#23) AS cloud_coverage_site_month_mean#209, avg(dew_temperature#24) AS dew_temperature_site_month_mean#211, avg(sea_level_pressure#25) AS sea_level_pressure_site_month_mean#213, avg(wind_direction#26) AS wind_direction_site_month_mean#215, avg(wind_speed#27) AS wind_speed_site_month_mean#217]
                                                                                                            :                                            +- Project [site_id#20, timestamp#21, air_temperature#22, cloud_coverage#23, dew_temperature#24, sea_level_pressure#25, wind_direction#26, wind_speed#27, weather_ts#28, date#85, time#97, month(cast(timestamp#21 as date)) AS month#109]
                                                                                                            :                                               +- Project [site_id#20, timestamp#21, air_temperature#22, cloud_coverage#23, dew_temperature#24, sea_level_pressure#25, wind_direction#26, wind_speed#27, weather_ts#28, date#85, CASE WHEN (hour(timestamp#21, Some(GMT+10)) <= 5) THEN 0-6h WHEN (hour(timestamp#21, Some(GMT+10)) <= 11) THEN 6-12h WHEN (hour(timestamp#21, Some(GMT+10)) <= 17) THEN 12-18h WHEN (hour(timestamp#21, Some(GMT+10)) <= 23) THEN 18-24h END AS time#97]
                                                                                                            :                                                  +- Project [site_id#20, timestamp#21, air_temperature#22, cloud_coverage#23, dew_temperature#24, sea_level_pressure#25, wind_direction#26, wind_speed#27, weather_ts#28, to_date(timestamp#21, None, Some(GMT+10), false) AS date#85]
                                                                                                            :                                                     +- Relation [site_id#20,timestamp#21,air_temperature#22,cloud_coverage#23,dew_temperature#24,sea_level_pressure#25,wind_direction#26,wind_speed#27,weather_ts#28] csv
                                                                                                            +- Aggregate [site_id#20], [site_id#20, avg(air_temperature#22) AS air_temperature_site_mean#239, avg(cloud_coverage#23) AS cloud_coverage_site_mean#241, avg(dew_temperature#24) AS dew_temperature_site_mean#243, avg(sea_level_pressure#25) AS sea_level_pressure_site_mean#245, avg(wind_direction#26) AS wind_direction_site_mean#247, avg(wind_speed#27) AS wind_speed_site_mean#249]
                                                                                                               +- Project [site_id#20, timestamp#21, air_temperature#22, cloud_coverage#23, dew_temperature#24, sea_level_pressure#25, wind_direction#26, wind_speed#27, weather_ts#28, date#85, time#97, month(cast(timestamp#21 as date)) AS month#109]
                                                                                                                  +- Project [site_id#20, timestamp#21, air_temperature#22, cloud_coverage#23, dew_temperature#24, sea_level_pressure#25, wind_direction#26, wind_speed#27, weather_ts#28, date#85, CASE WHEN (hour(timestamp#21, Some(GMT+10)) <= 5) THEN 0-6h WHEN (hour(timestamp#21, Some(GMT+10)) <= 11) THEN 6-12h WHEN (hour(timestamp#21, Some(GMT+10)) <= 17) THEN 12-18h WHEN (hour(timestamp#21, Some(GMT+10)) <= 23) THEN 18-24h END AS time#97]
                                                                                                                     +- Project [site_id#20, timestamp#21, air_temperature#22, cloud_coverage#23, dew_temperature#24, sea_level_pressure#25, wind_direction#26, wind_speed#27, weather_ts#28, to_date(timestamp#21, None, Some(GMT+10), false) AS date#85]
                                                                                                                        +- Relation [site_id#20,timestamp#21,air_temperature#22,cloud_coverage#23,dew_temperature#24,sea_level_pressure#25,wind_direction#26,wind_speed#27,weather_ts#28] csv


In [None]:
# 6c
site_daily = (
    predictions
        .groupBy(
            "site_id", 
            "date"
        .agg(F.sum("log_power_usage").alias("total_power_day"))
)

# --- Print to console every 14 seconds ---
query_site_daily = (
    site_daily
        .writeStream
        .outputMode("update")
        .format("memory")
        .queryName("site_daily")
        .trigger(processingTime="14 seconds")
        .start()
)
spark.sql("select * from site_daily").show()



7.	Save the data from 6 to Parquet files as streams. (Hint: Parquet files support streaming writing/reading. The file keeps updating while new batches arrive.)

In [None]:
# 7a(save 6a)

# Save predictions to Parquet incrementally
query_live_parquet = (
    predictions
        .select("site_id", "building_id", "time", "log_power_usage")
        .writeStream
        .outputMode("append")
        .format("parquet")
        .option("path", "data/live_predictions")
        .option("checkpointLocation", checkpoint_dir + "/live_predictions")
        .start()
)

In [None]:
# 7b(save 6b)

query_building_6h_parquet = (
    building_6h
        .writeStream
        .outputMode("update")
        .format("parquet")
        .option("path", "data/building_6h")
        .option("checkpointLocation", checkpoint_dir + "/building_6h")
        .start()
)

In [None]:
# 7c(save 6c)


query_site_daily_parquet = (
    site_daily
        .writeStream
        .outputMode("update")
        .format("parquet")
        .option("path", "data/site_daily")
        .option("checkpointLocation", checkpoint_dir + "/site_daily")
        .trigger(processingTime="14 seconds")
        .start()
)z

In [None]:
spark.streams.awaitAnyTermination()

8.	Read the parquet files from task 7 as data streams and send them to Kafka topics with appropriate names.
(Note: You shall read the parquet files as a streaming data frame and send messages to the Kafka topic when new data appears in the parquet file.)

In [None]:
import json
# Stream 1
live_predictions = (
    spark.readStream
         .format("parquet")
         .schema(predictions.schema)
         .load("data/live_predictions")
)

# send predictions to Kafka
(
    live_predictions
        .selectExpr("CAST(\"predictions\") AS key", 
                    "json.dumps(live_predictions, ensure_ascii=False) AS value")
        .writeStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("topic", "live_predictions")
        .option("checkpointLocation", "checkpoints/predictions_to_kafka")
        .outputMode("append")
        .start()
)


In [None]:
# Stream 2
building_6h = (
    spark.readStream
         .format("parquet")
         .schema(predictions.schema)
         .load("data/building_6h")
)

# send predictions to Kafka
(
    building_6h
        .selectExpr("CAST(\"predictions\") AS key", 
                    "json.dumps(building_6h, ensure_ascii=False) AS value")
        .writeStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("topic", "building_6h")
        .option("checkpointLocation", "checkpoints/predictions_to_kafka")
        .outputMode("append")
        .start()
)


In [None]:
# Stream 3
site_daily = (
    spark.readStream
         .format("parquet")
         .schema(predictions.schema)
         .load("data/site_daily")
)

# send predictions to Kafka
(
    site_daily
        .selectExpr("CAST(\"predictions\") AS key", 
                    "json.dumps(site_daily, ensure_ascii=False) AS value")
        .writeStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("topic", "site_daily")
        .option("checkpointLocation", "checkpoints/predictions_to_kafka")
        .outputMode("append")
        .start()
)