In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
from pyspark.sql import functions as F

# Obtain the Spark session for this notebook: getOrCreate() returns the
# already-active session when there is one, otherwise it builds a new one
# carrying the application name below.
spark = (
    SparkSession.builder
    .appName("ETL Pipline")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session object.
spark

EXTRACT

In [0]:
# Load the sensor log CSV; the first row supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("file:/Workspace/Shared/cleaned_large_sensor_log.csv")
)
df.display()

log_id,device_id,room_id,timestamp,energy_kwh,hour,is_peak
1,101,1,2025-06-26T00:15:00Z,0.1,0,off_peak
2,102,1,2025-06-26T01:30:00Z,0.2,1,off_peak
3,103,2,2025-06-26T02:00:00Z,0.5,2,off_peak
4,104,2,2025-06-26T03:15:00Z,0.3,3,off_peak
5,105,3,2025-06-26T04:00:00Z,1.0,4,off_peak
6,101,1,2025-06-26T05:45:00Z,0.2,5,off_peak
7,102,1,2025-06-26T06:00:00Z,0.4,6,off_peak
8,103,2,2025-06-26T07:30:00Z,0.7,7,off_peak
9,104,2,2025-06-26T08:00:00Z,0.6,8,off_peak
10,105,3,2025-06-26T09:15:00Z,1.1,9,off_peak


TRANSFORM

In [0]:
# Per-device daily totals: derive a calendar `date` from `timestamp`, then sum
# `energy_kwh` for every (device_id, date) pair into `total_kwh`.
# Spark functions are qualified with `F.` so the aggregation does not depend on
# the wildcard import shadowing Python's builtin `sum`.
daily_summary = (
    df
    .withColumn("date", F.to_date("timestamp"))
    .groupBy("device_id", "date")
    .agg(F.sum("energy_kwh").alias("total_kwh"))
)
daily_summary.display()

device_id,date,total_kwh
103,2025-06-27,2.8
104,2025-06-27,2.6
101,2025-06-27,1.1
102,2025-06-27,1.7000000000000002
102,2025-06-26,2.0
101,2025-06-26,1.2
105,2025-06-26,4.7
105,2025-06-27,7.299999999999999
104,2025-06-26,2.4
103,2025-06-26,3.3


In [0]:
# Per-device weekly totals: derive the week-of-year number from `timestamp`,
# then sum `energy_kwh` for every (device_id, week) pair into `total_kwh`.
# Spark functions are qualified with `F.` so the aggregation does not depend on
# the wildcard import shadowing Python's builtin `sum`.
# NOTE(review): the grouping key is the week number only (no year), so readings
# from the same week number in different years would share a row; confirm the
# input never spans more than one year.
weekly_summary = (
    df
    .withColumn("week", F.weekofyear("timestamp"))
    .groupBy("device_id", "week")
    .agg(F.sum("energy_kwh").alias("total_kwh"))
)
weekly_summary.display()

device_id,week,total_kwh
104,26,5.0
103,26,6.099999999999999
102,26,3.7
101,26,2.3000000000000003
105,26,12.0


LOAD

In [0]:
# Persist both summaries in Delta format, replacing whatever is already stored
# at each target path.
(
    daily_summary.write
    .format("delta")
    .mode("overwrite")
    .save("file:/Workspace/Shared/daily")
)
(
    weekly_summary.write
    .format("delta")
    .mode("overwrite")
    .save("file:/Workspace/Shared/weekly")
)

# Also export each summary as a CSV file without the pandas index column.
# toPandas() collects the full result onto the driver before writing.
daily_pdf = daily_summary.toPandas()
daily_pdf.to_csv("daily_summary.csv", index=False)
weekly_pdf = weekly_summary.toPandas()
weekly_pdf.to_csv("weekly_summary.csv", index=False)


SQL to query energy savings opportunities

In [0]:
# Register the daily aggregate under the view name the query reads from.
# Without this the cell only works if the view was created by hand earlier in
# the session, and it fails on a fresh kernel (Restart & Run All).
daily_summary.createOrReplaceTempView("daily_summary_view")

# List the (device, day) rows whose total_kwh is above 1.0.
spark.sql("""
    SELECT device_id, date, total_kwh
    FROM daily_summary_view
    WHERE total_kwh > 1.0;
""").show()

+---------+----------+------------------+
|device_id|      date|         total_kwh|
+---------+----------+------------------+
|      103|2025-06-27|               2.8|
|      104|2025-06-27|               2.6|
|      101|2025-06-27|               1.1|
|      102|2025-06-27|1.7000000000000002|
|      102|2025-06-26|               2.0|
|      101|2025-06-26|               1.2|
|      105|2025-06-26|               4.7|
|      105|2025-06-27| 7.299999999999999|
|      104|2025-06-26|               2.4|
|      103|2025-06-26|               3.3|
+---------+----------+------------------+

