# 1. Upload cleaned logs to Databricks

In [0]:
from pyspark.sql import functions as F

# Both inputs are CSV files with a header row; Spark infers the column types.
csv_options = {"header": True, "inferSchema": True}

df = spark.read.csv("dbfs:/FileStore/tables/cleaned_energy_logs.csv", **csv_options)
devices_df = spark.read.csv("dbfs:/FileStore/tables/devices.csv", **csv_options)

# Preview each frame under its own label, logs first and devices second.
for label, frame in (("cleaned energy logs:", df), ("devices:", devices_df)):
    print(label)
    frame.show()


cleaned energy logs:
+------+---------+---------------+-------------------+
|log_id|device_id|energy_used_kwh|           log_time|
+------+---------+---------------+-------------------+
|     1|      101|            2.5|2025-07-20 09:00:00|
|     2|      101|            3.0|2025-07-20 18:00:00|
|     3|      102|            1.2|2025-07-20 12:00:00|
|     4|      103|            0.0|2025-07-20 13:00:00|
|     5|      104|            4.5|2025-07-20 07:00:00|
|     6|      104|            5.0|2025-07-20 20:00:00|
|     7|      105|            1.8|2025-07-20 10:00:00|
+------+---------+---------------+-------------------+

devices:
+---------+---------------+-------+-----------+------+------------+
|device_id|    device_name|room_id|  room_name|status|power_rating|
+---------+---------------+-------+-----------+------+------------+
|      101|Air Conditioner|      1|Living Room|    ON|        1500|
|      102|   Refrigerator|      2|    Kitchen|    ON|         200|
|      103|             

# 2. Build an ETL pipeline to calculate daily/weekly summaries

In [0]:
# Derive the calendar date and the week-of-year number from each log timestamp.
# NOTE(review): "week" holds only the week number, so logs from different years
# that share a week number would be summed together -- confirm the input covers
# a single year.
df = df.withColumn("date", F.to_date("log_time"))
df = df.withColumn("week", F.weekofyear("log_time"))

# Attach the room name to every log row; the left join keeps logs whose
# device_id has no match in devices_df.
room_lookup = devices_df.select("device_id", "room_name")
df_joined = df.join(room_lookup, on="device_id", how="left")

# Total energy per (period, room, device), once per day and once per week.
total_energy = F.sum("energy_used_kwh")
daily_summary = (
    df_joined
    .groupBy("date", "room_name", "device_id")
    .agg(total_energy.alias("daily_energy_kwh"))
)
weekly_summary = (
    df_joined
    .groupBy("week", "room_name", "device_id")
    .agg(total_energy.alias("weekly_energy_kwh"))
)

for label, frame in (
    ("daily summary of each room and device:", daily_summary),
    ("weekly summary of each room and device:", weekly_summary),
):
    print(label)
    frame.show()


daily summary of each room and device:
+----------+-----------+---------+----------------+
|      date|  room_name|device_id|daily_energy_kwh|
+----------+-----------+---------+----------------+
|2025-07-20|    Kitchen|      105|             1.8|
|2025-07-20|    Bedroom|      104|             9.5|
|2025-07-20|    Kitchen|      102|             1.2|
|2025-07-20|Living Room|      103|             0.0|
|2025-07-20|Living Room|      101|             5.5|
+----------+-----------+---------+----------------+

weekly summary of each room and device:
+----+-----------+---------+-----------------+
|week|  room_name|device_id|weekly_energy_kwh|
+----+-----------+---------+-----------------+
|  29|    Kitchen|      102|              1.2|
|  29|Living Room|      103|              0.0|
|  29|Living Room|      101|              5.5|
|  29|    Bedroom|      104|              9.5|
|  29|    Kitchen|      105|              1.8|
+----+-----------+---------+-----------------+



# 3. Save final results in Delta format or CSV

In [0]:
# Save results in Delta 
# Each summary is persisted twice: as a Delta table and as a headered CSV.
# Every write overwrites whatever already exists at the target path.
summary_targets = (
    (
        daily_summary,
        "dbfs:/FileStore/tables/daily_energy_summary",
        "dbfs:/FileStore/tables/daily_energy_summary_csv",
    ),
    (
        weekly_summary,
        "dbfs:/FileStore/tables/weekly_energy_summary",
        "dbfs:/FileStore/tables/weekly_energy_summary_csv",
    ),
)

# Delta copies first (daily, then weekly) ...
for summary_df, delta_path, _ in summary_targets:
    summary_df.write.mode("overwrite").format("delta").save(delta_path)

# ... then the CSV copies, in the same order.
for summary_df, _, csv_path in summary_targets:
    summary_df.write.csv(csv_path, header=True, mode="overwrite")

# 4. Optional: query the saved summaries with SQL

In [0]:
# Read the persisted Delta tables back so the queries run against the saved data.
daily_energy_summary = spark.read.load(
    "dbfs:/FileStore/tables/daily_energy_summary", format="delta"
)
weekly_energy_summary = spark.read.load(
    "dbfs:/FileStore/tables/weekly_energy_summary", format="delta"
)

In [0]:
# Expose the reloaded summaries to Spark SQL under short view names.
for view_name, summary_frame in (
    ("weekly_summary", weekly_energy_summary),
    ("daily_summary", daily_energy_summary),
):
    summary_frame.createOrReplaceTempView(view_name)

In [0]:
# Weekly rows whose summed energy exceeds 3 kWh.
high_usage_weeks = spark.sql("select * from weekly_summary where weekly_energy_kwh > 3")
high_usage_weeks.show()

+----+-----------+---------+-----------------+
|week|  room_name|device_id|weekly_energy_kwh|
+----+-----------+---------+-----------------+
|  29|Living Room|      101|              5.5|
|  29|    Bedroom|      104|              9.5|
+----+-----------+---------+-----------------+



In [0]:
# Daily rows whose summed energy is below 5 kWh, excluding rows that are exactly 0.
low_usage_days = spark.sql("select * from daily_summary where daily_energy_kwh < 5 and daily_energy_kwh != 0 ")
low_usage_days.show()

+----------+---------+---------+----------------+
|      date|room_name|device_id|daily_energy_kwh|
+----------+---------+---------+----------------+
|2025-07-20|  Kitchen|      105|             1.8|
|2025-07-20|  Kitchen|      102|             1.2|
+----------+---------+---------+----------------+

