In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, weekofyear, to_date
import pandas as pd

# Create Spark session
spark = SparkSession.builder.appName("SmartEnergyETL").getOrCreate()

Create CSV in Colab python



In [2]:
%%writefile cleaned_energy_usage.csv
timestamp,room_id,device_id,energy_kwh
2025-08-01 08:00:00,LivingRoom,AC,2.5
2025-08-01 09:00:00,LivingRoom,TV,0.3
2025-08-01 10:00:00,Kitchen,Fridge,1.2
2025-08-01 11:00:00,Bedroom,Heater,1.8
2025-08-01 12:00:00,LivingRoom,Light,0.15
2025-08-01 13:00:00,Kitchen,Oven,2.0
2025-08-01 14:00:00,Bedroom,Fan,0.25
2025-08-01 15:00:00,LivingRoom,AC,2.7
2025-08-01 16:00:00,Kitchen,Fridge,1.1
2025-08-01 17:00:00,Bedroom,Light,0.1


Writing cleaned_energy_usage.csv


 Load Data into Spark

In [3]:
df_spark = spark.read.csv("cleaned_energy_usage.csv", header=True, inferSchema=True)

# Convert timestamp to date
df_spark = df_spark.withColumn("date", to_date(col("timestamp")))


Daily Summary python



In [4]:
daily_summary = df_spark.groupBy("date", "room_id").agg(
    _sum("energy_kwh").alias("total_energy_kwh")
).orderBy("date")

# Convert to Pandas for display
daily_summary_pd = daily_summary.toPandas()
print("\n=== Daily Summary ===")
print(daily_summary_pd)



=== Daily Summary ===
         date     room_id  total_energy_kwh
0  2025-08-01     Kitchen              4.30
1  2025-08-01  LivingRoom              5.65
2  2025-08-01     Bedroom              2.15


Weekly Summary

In [5]:
weekly_summary = df_spark.withColumn("week_number", weekofyear(col("date"))) \
    .groupBy("week_number", "room_id").agg(
        _sum("energy_kwh").alias("total_energy_kwh")
    ).orderBy("week_number")

weekly_summary_pd = weekly_summary.toPandas()
print("\n=== Weekly Summary ===")
print(weekly_summary_pd)



=== Weekly Summary ===
   week_number     room_id  total_energy_kwh
0           31  LivingRoom              5.65
1           31     Kitchen              4.30
2           31     Bedroom              2.15


Save Outputs

In [6]:
daily_summary.write.csv("daily_summary", header=True, mode="overwrite")
weekly_summary.write.csv("weekly_summary", header=True, mode="overwrite")
