In [2]:
# Sample smart-home readings: one row per reading with its timestamp, room,
# device and energy value, kept inline so the notebook needs no external data.
csv_data = """timestamp,room_id,device_id,energy_kwh
2025-08-01 08:00:00,LivingRoom,AC,2.5
2025-08-01 09:00:00,LivingRoom,TV,0.3
2025-08-01 10:00:00,Kitchen,Fridge,1.2
2025-08-01 11:00:00,Bedroom,Heater,1.8
2025-08-01 12:00:00,LivingRoom,Light,0.15
2025-08-01 13:00:00,Kitchen,Oven,2.0
2025-08-01 14:00:00,Bedroom,Fan,0.25
2025-08-01 15:00:00,LivingRoom,AC,2.7
2025-08-01 16:00:00,Kitchen,Fridge,1.1
2025-08-01 17:00:00,Bedroom,Light,0.1"""

# Persist the sample to disk; "w" truncates any copy left by a previous run,
# so re-executing this cell always yields the same file.
with open("energy_usage.csv", "w") as csv_handle:
    csv_handle.write(csv_data)

In [3]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, hour

# Create Spark session
spark = SparkSession.builder.appName("SmartHomeEnergy").getOrCreate()




# Step 3: Load CSV into Spark

In [4]:
# Configure the reader first: the first line of the file is the header, and
# column types are inferred from the data instead of defaulting to strings.
csv_reader = spark.read.options(header=True, inferSchema=True)
df_spark = csv_reader.csv("energy_usage.csv")

# Preview the loaded rows
df_spark.show()


+-------------------+----------+---------+----------+
|          timestamp|   room_id|device_id|energy_kwh|
+-------------------+----------+---------+----------+
|2025-08-01 08:00:00|LivingRoom|       AC|       2.5|
|2025-08-01 09:00:00|LivingRoom|       TV|       0.3|
|2025-08-01 10:00:00|   Kitchen|   Fridge|       1.2|
|2025-08-01 11:00:00|   Bedroom|   Heater|       1.8|
|2025-08-01 12:00:00|LivingRoom|    Light|      0.15|
|2025-08-01 13:00:00|   Kitchen|     Oven|       2.0|
|2025-08-01 14:00:00|   Bedroom|      Fan|      0.25|
|2025-08-01 15:00:00|LivingRoom|       AC|       2.7|
|2025-08-01 16:00:00|   Kitchen|   Fridge|       1.1|
|2025-08-01 17:00:00|   Bedroom|    Light|       0.1|
+-------------------+----------+---------+----------+



# Step 4: Device-Level Aggregation

In [5]:
# Derive the hour of each reading from its timestamp; the peak/off-peak
# split is keyed on this column.
hour_of_reading = hour(col("timestamp"))
df_spark = df_spark.withColumn("hour", hour_of_reading)

In [6]:
# Aggregate total usage per device, highest consumers first.
# device_id is a secondary sort key: devices with identical totals would
# otherwise come back in an arbitrary order that can differ between runs,
# making the displayed table and any export of it non-reproducible.
device_usage = (
    df_spark.groupBy("device_id")
    .agg(_sum("energy_kwh").alias("total_usage_kwh"))
    .orderBy(col("total_usage_kwh").desc(), col("device_id").asc())
)

print("=== Total Usage by Device ===")
device_usage.show()

=== Total Usage by Device ===
+---------+---------------+
|device_id|total_usage_kwh|
+---------+---------------+
|       AC|            5.2|
|   Fridge|            2.3|
|     Oven|            2.0|
|   Heater|            1.8|
|       TV|            0.3|
|    Light|           0.25|
|      Fan|           0.25|
+---------+---------------+



In [7]:

# Peak vs Off-Peak usage.
# The peak window is inclusive on both ends: a reading counts as peak when
# its hour is PEAK_START_HOUR through PEAK_END_HOUR; everything else is
# off-peak. Adjust the two constants to change the tariff window.
PEAK_START_HOUR = 8
PEAK_END_HOUR = 20

# A single predicate (and its negation) keeps the two subsets exactly
# complementary, instead of maintaining two hand-written mirrored conditions.
is_peak = col("hour").between(PEAK_START_HOUR, PEAK_END_HOUR)

peak_usage = (
    df_spark.filter(is_peak)
    .groupBy("device_id")
    .agg(_sum("energy_kwh").alias("peak_usage_kwh"))
)

offpeak_usage = (
    df_spark.filter(~is_peak)
    .groupBy("device_id")
    .agg(_sum("energy_kwh").alias("offpeak_usage_kwh"))
)

print("=== Peak Usage ===")
peak_usage.show()

print("=== Off-Peak Usage ===")
offpeak_usage.show()


=== Peak Usage ===
+---------+--------------+
|device_id|peak_usage_kwh|
+---------+--------------+
|       TV|           0.3|
|     Oven|           2.0|
|    Light|          0.25|
|      Fan|          0.25|
|   Fridge|           2.3|
|   Heater|           1.8|
|       AC|           5.2|
+---------+--------------+

=== Off-Peak Usage ===
+---------+-----------------+
|device_id|offpeak_usage_kwh|
+---------+-----------------+
+---------+-----------------+



In [8]:
# Export the per-device totals as CSV. Spark writes a directory named
# "top_devices_usage"; coalesce(1) collapses the data to one partition so the
# directory holds a single part file, and "overwrite" replaces earlier output.
single_partition = device_usage.coalesce(1)
(
    single_partition.write
    .mode("overwrite")
    .option("header", True)
    .csv("top_devices_usage")
)