In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd

# Obtain the Spark session for this notebook under the application name
# "SmartHomeAggregation"; getOrCreate() returns an already-active session
# when one exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("SmartHomeAggregation")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [4]:
# Load the raw sensor log. The first line of the file is treated as the header
# and column types are inferred by Spark rather than declared up front.
df = spark.read.csv(
    "large_sensor_log.csv",
    header=True,
    inferSchema=True,
)
# Preview the first rows (show() prints 20 rows by default).
df.show()

+------+---------+-------+-------------------+----------+
|log_id|device_id|room_id|          timestamp|energy_kwh|
+------+---------+-------+-------------------+----------+
|     1|      101|      1|2025-06-26 00:15:00|       0.1|
|     2|      102|      1|2025-06-26 01:30:00|       0.2|
|     3|      103|      2|2025-06-26 02:00:00|       0.5|
|     4|      104|      2|2025-06-26 03:15:00|       0.3|
|     5|      105|      3|2025-06-26 04:00:00|       1.0|
|     6|      101|      1|2025-06-26 05:45:00|       0.2|
|     7|      102|      1|2025-06-26 06:00:00|       0.4|
|     8|      103|      2|2025-06-26 07:30:00|       0.7|
|     9|      104|      2|2025-06-26 08:00:00|       0.6|
|    10|      105|      3|2025-06-26 09:15:00|       1.1|
|    11|      101|      1|2025-06-26 10:00:00|       0.3|
|    12|      102|      1|2025-06-26 11:30:00|       0.6|
|    13|      103|      2|2025-06-26 12:45:00|       0.8|
|    14|      104|      2|2025-06-26 13:00:00|       0.7|
|    15|      

In [6]:
# Per-device energy summary: total and mean of energy_kwh for each device_id.
# NOTE: `sum` and `avg` here are the Spark column functions brought in by the
# wildcard import from pyspark.sql.functions, not the Python builtins.
device_usage = (
    df.groupBy("device_id")
    .agg(
        sum("energy_kwh").alias("total_kwh"),
        avg("energy_kwh").alias("avg_kwh"),
    )
)
device_usage.show()

+---------+------------------+-------------------+
|device_id|         total_kwh|            avg_kwh|
+---------+------------------+-------------------+
|      101|2.3000000000000003|0.23000000000000004|
|      103| 6.099999999999999| 0.6099999999999999|
|      102|3.6999999999999997|               0.37|
|      105|              12.0|                1.2|
|      104|               5.0|                0.5|
+---------+------------------+-------------------+



In [11]:
# Enrich the log with two derived columns:
#   hour    - hour-of-day extracted from `timestamp`
#   is_peak - the string "peak" when hour is in the inclusive range 18..23,
#             otherwise "off_peak" (note: a label, not a boolean)
df = (
    df.withColumn("hour", hour("timestamp"))
    .withColumn(
        "is_peak",
        when(col("hour").between(18, 23), "peak").otherwise("off_peak"),
    )
)
df.show()

# Persist the enriched log as a single CSV via pandas.
# NOTE(review): toPandas() collects every row onto the driver; if the log is as
# large as the filename suggests, consider Spark's own writer - confirm sizing.
df.toPandas().to_csv("cleaned_large_sensor_log.csv", index=False)

+------+---------+-------+-------------------+----------+----+--------+
|log_id|device_id|room_id|          timestamp|energy_kwh|hour| is_peak|
+------+---------+-------+-------------------+----------+----+--------+
|     1|      101|      1|2025-06-26 00:15:00|       0.1|   0|off_peak|
|     2|      102|      1|2025-06-26 01:30:00|       0.2|   1|off_peak|
|     3|      103|      2|2025-06-26 02:00:00|       0.5|   2|off_peak|
|     4|      104|      2|2025-06-26 03:15:00|       0.3|   3|off_peak|
|     5|      105|      3|2025-06-26 04:00:00|       1.0|   4|off_peak|
|     6|      101|      1|2025-06-26 05:45:00|       0.2|   5|off_peak|
|     7|      102|      1|2025-06-26 06:00:00|       0.4|   6|off_peak|
|     8|      103|      2|2025-06-26 07:30:00|       0.7|   7|off_peak|
|     9|      104|      2|2025-06-26 08:00:00|       0.6|   8|off_peak|
|    10|      105|      3|2025-06-26 09:15:00|       1.1|   9|off_peak|
|    11|      101|      1|2025-06-26 10:00:00|       0.3|  10|of

In [8]:
# Per-device energy split into one column per is_peak label.
#
# A single groupBy/pivot/sum over the raw rows produces the same
# per-(device_id, is_peak) totals as aggregating first and pivoting afterwards,
# so the intermediate aggregation is dropped.
#
# The pivot values are listed explicitly so that:
#   * Spark does not have to run an extra job to discover the distinct labels;
#   * the result always has both "off_peak" and "peak" columns (in that order),
#     even when the data contains no rows for one of the labels - the
#     corresponding cells are then null instead of the column disappearing.
peak_usage = (
    df.groupBy("device_id")
    .pivot("is_peak", ["off_peak", "peak"])
    .sum("energy_kwh")
)
peak_usage.show()

+---------+------------------+------------------+
|device_id|          off_peak|              peak|
+---------+------------------+------------------+
|      101|1.9000000000000001|               0.4|
|      103|4.3999999999999995|1.7000000000000002|
|      102|               3.1|               0.6|
|      105|               8.1|               3.9|
|      104|               3.2|1.8000000000000003|
+---------+------------------+------------------+



In [9]:
# Combine the per-device totals with the peak/off-peak split.
# Left join on device_id: every row of device_usage is kept, and the
# off_peak/peak columns are null where peak_usage has no matching device.
final = device_usage.join(peak_usage, "device_id", "left")
final.show()

+---------+------------------+-------------------+------------------+------------------+
|device_id|         total_kwh|            avg_kwh|          off_peak|              peak|
+---------+------------------+-------------------+------------------+------------------+
|      101|2.3000000000000003|0.23000000000000004|1.9000000000000001|               0.4|
|      103| 6.099999999999999| 0.6099999999999999|4.3999999999999995|1.7000000000000002|
|      102|3.6999999999999997|               0.37|               3.1|               0.6|
|      105|              12.0|                1.2|               8.1|               3.9|
|      104|               5.0|                0.5|               3.2|1.8000000000000003|
+---------+------------------+-------------------+------------------+------------------+



In [10]:
# Rank devices by total consumption, highest first, and export the table
# through pandas as a CSV without the index column.
# NOTE(review): the file name "top_uasage.csv" looks like a typo for
# "top_usage.csv"; left unchanged in case something reads it - confirm.
ranked = final.orderBy(col("total_kwh").desc())
ranked.toPandas().to_csv("top_uasage.csv", index=False)