## Consuming Data Using Kafka and Visualising It (20%)
In this task, we will implement an Apache Kafka consumer to consume the data from Part 2.  
  
Important:   
-	In this part, Kafka consumers are used to consume the streaming data published in Task 2.8.

In [None]:
# Task 3: consume the Kafka topics and plot them live
from kafka import KafkaConsumer
import json
import matplotlib.pyplot as plt
from datetime import datetime
from threading import Thread
import time

# This magic command is necessary for live plotting in Jupyter
%matplotlib notebook

# --- 1. Configuration ---
# Address of the Kafka broker; connect_kafka_consumer appends port 9092 to it.
hostip = "192.168.0.6"
# Topic read by consume_building_6h (per-building 6-hour power totals).
topic_building = "building_6h"
# Topic read by consume_site_daily (per-site daily power totals).
topic_site = "site_daily"

def connect_kafka_consumer(topic):
    """Build a KafkaConsumer for `topic` on the broker at `hostip`, port 9092.

    The consumer is created with auto_offset_reset='earliest' and a value
    deserializer that decodes each payload as UTF-8 and parses it as JSON.
    """
    def _decode_json(raw):
        # Payload bytes -> text -> Python object.
        return json.loads(raw.decode('utf-8'))

    broker = f'{hostip}:9092'
    return KafkaConsumer(
        topic,
        value_deserializer=_decode_json,
        auto_offset_reset='earliest',
        bootstrap_servers=[broker],
    )

# --- 2. Plotting Function for "building_6h" ---

def consume_building_6h(consumer, fig, axes):
    """
    Consume 'building_6h' messages and keep a 2x2 bar-chart grid up to date.

    Each subplot belongs to one 6-hour time bucket and shows the eight
    buildings with the highest 'total_power_6h' received so far for the
    current date. When a message carries a different 'date' than the previous
    one, all stored values and all four subplots are cleared.

    Args:
        consumer: Iterable of records whose ``.value`` is a dict with the keys
            'building_id', 'time', 'total_power_6h' and, optionally, 'date'.
            It is closed when this function returns.
        fig: Matplotlib figure that owns the axes; its suptitle shows the date.
        axes: 2x2 array of axes, indexed as ``axes[row, col]``.
    """
    # 4 time buckets, one for each subplot
    time_buckets = ["0-6h", "6-12h", "12-18h", "18-24h"]

    # Map time buckets to the 2x2 grid of axes
    ax_map = {
        "0-6h": axes[0, 0],
        "6-12h": axes[0, 1],
        "12-18h": axes[1, 0],
        "18-24h": axes[1, 1]
    }

    # Data store: { "0-6h": {"bldg_1": 10, "bldg_2": 20}, "6-12h": {...}, ... }
    data_store = {bucket: {} for bucket in time_buckets}
    # Placeholder that never equals a real date, so the first message always
    # goes through the date-change branch and sets the figure title.
    current_date = "..."

    print(f"Starting consumer for topic: {topic_building}")

    try:
        for message in consumer:
            msg = message.value

            # Extract data from the message
            bldg_id = msg.get('building_id')
            time_bucket = msg.get('time')
            val = msg.get('total_power_6h')
            date_str = msg.get('date', "Unknown Date")

            # Check for None explicitly: building_id 0 and a reading of 0 are
            # valid values and must not be dropped as "malformed". A bucket
            # name outside the four known ones is skipped here as well, so it
            # cannot raise KeyError below and terminate the whole consumer.
            if (
                bldg_id is None
                or val is None
                or not isinstance(time_bucket, str)
                or time_bucket not in ax_map
            ):
                print(f"[{topic_building}] Skipping malformed message: {msg}")
                continue

            bldg_id_str = str(bldg_id)

            # --- Date Change Logic ---
            if date_str != current_date:
                print(f"[{topic_building}] New Date detected: {date_str}. Clearing data.")
                current_date = date_str
                # Reset data for the new day
                data_store = {bucket: {} for bucket in time_buckets}
                # Wipe every subplot, otherwise the buckets that have not yet
                # received data for the new date would keep showing the old
                # date's bars under the new title.
                for bucket, bucket_ax in ax_map.items():
                    bucket_ax.cla()
                    bucket_ax.set_title(f"Time Bucket: {bucket}")
                    bucket_ax.set_ylabel("Total Power (6h)")
                # Update the main figure title
                fig.suptitle(f"Building Top 8 Power Consumption (Date: {current_date})",
                             y=1.02)

            # --- Update Data Store ---
            data_store[time_bucket][bldg_id_str] = val

            # --- Redraw the specific subplot that changed ---
            ax = ax_map[time_bucket]
            bucket_data = data_store[time_bucket]

            # Sort by value (highest first) and take Top 8
            sorted_items = sorted(bucket_data.items(),
                                  key=lambda item: item[1],
                                  reverse=True)
            top_8_items = sorted_items[:8]

            # Clear this specific subplot
            ax.cla()

            if top_8_items:
                # Unzip the (key, value) pairs
                labels, values = zip(*top_8_items)

                # Plot the new bars
                ax.bar(labels, values)
                # Rotate the existing tick labels for readability without
                # overriding the label text the bar plot already produced.
                ax.tick_params(axis='x', labelrotation=75)

            ax.set_title(f"Time Bucket: {time_bucket}")
            ax.set_ylabel("Total Power (6h)")

            # Redraw the canvas
            fig.tight_layout(rect=[0, 0, 1, 0.96])  # Leave room for the suptitle
            fig.canvas.draw()
            plt.pause(0.01)

    except Exception as e:
        # Top-level boundary of the worker: report the error instead of letting
        # the thread die silently.
        print(f"Error in consumer for {topic_building}: {e}")
    finally:
        print(f"Closing consumer for {topic_building}")
        consumer.close()


# --- 3. Plotting Function for "site_daily" ---

def consume_site_daily(consumer, fig, axes):
    """
    Stream 'site_daily' records into a two-panel bar chart.

    The top panel shows the totals stored for the previously seen date, the
    bottom panel the totals for the date currently being received. Both panels
    use a fixed x-axis of site IDs "0" to "15". Messages lacking 'site_id' or
    'total_power_day' are reported and skipped. The consumer is closed when
    the function returns.
    """
    ax_prev = axes[0]  # top panel
    ax_curr = axes[1]  # bottom panel

    # Fixed category axis: one slot per site ID.
    all_sites = list(map(str, range(16)))

    # Every site starts at 0 in both snapshots.
    current_day_data = dict.fromkeys(all_sites, 0)
    previous_day_data = dict.fromkeys(all_sites, 0)
    current_date = None

    print(f"Starting consumer for topic: {topic_site}")

    try:
        for message in consumer:
            msg = message.value

            date_str = msg.get('date', "Unknown Date")
            val = msg.get('total_power_day')
            site_id = msg.get('site_id')

            if val is None or site_id is None:
                print(f"[{topic_site}] Skipping malformed message: {msg}")
                continue

            site_id_str = str(site_id)

            # The first usable message defines the starting date.
            if current_date is None:
                current_date = date_str

            # A different date rolls "current" over into "previous".
            if date_str != current_date:
                print(f"[{topic_site}] New Date detected: {date_str}. Shifting data.")
                previous_day_data = dict(current_day_data)
                current_day_data = dict.fromkeys(all_sites, 0)
                current_date = date_str

            # Record the latest value reported for this site.
            current_day_data[site_id_str] = val

            # Top panel: previous day.
            ax_prev.cla()
            prev_values = [previous_day_data[key] for key in all_sites]
            ax_prev.bar(all_sites, prev_values, color='gray')
            ax_prev.set_title("Previous Day's Total Usage")
            ax_prev.set_ylabel("Total Power (Daily)")
            ax_prev.set_ylim(bottom=0)

            # Bottom panel: current day.
            ax_curr.cla()
            curr_values = [current_day_data[key] for key in all_sites]
            ax_curr.bar(all_sites, curr_values, color='blue')
            ax_curr.set_title(f"Current Day's Total Usage (Date: {current_date})")
            ax_curr.set_xlabel("Site ID")
            ax_curr.set_ylabel("Total Power (Daily)")
            ax_curr.set_ylim(bottom=0)

            # Push the update to the screen.
            fig.tight_layout()
            fig.canvas.draw()
            plt.pause(0.01)

    except Exception as err:
        print(f"Error in consumer for {topic_site}: {err}")
    finally:
        print(f"Closing consumer for {topic_site}")
        consumer.close()


# --- 4. Main execution block ---
try:
    # One consumer per topic.
    consumer_building = connect_kafka_consumer(topic_building)
    consumer_site = connect_kafka_consumer(topic_site)

    # Figure 1: 2x2 grid, one panel per 6-hour bucket of building data.
    fig_building, axes_building = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))
    fig_building.show()

    # Figure 2: two stacked panels (previous day / current day) of site data.
    fig_site, axes_site = plt.subplots(nrows=2, ncols=1, figsize=(10, 8))
    fig_site.show()

    # Each topic is consumed and plotted on its own thread.
    thread_building = Thread(
        target=consume_building_6h,
        args=(consumer_building, fig_building, axes_building),
    )
    thread_site = Thread(
        target=consume_site_daily,
        args=(consumer_site, fig_site, axes_site),
    )

    thread_building.start()
    thread_site.start()

    print("All consumer threads started.")

except Exception as err:
    print(f"Failed to start consumers: {err}")

# Note: The threads keep running in the background after this cell finishes.
# Interrupting the kernel only signals the main thread, so restart the kernel to stop them.

1.	Load the new meters CSV file into a data frame.

In [10]:
from pyspark.sql.types import (
    StructType, StructField,
    IntegerType, StringType, DecimalType, TimestampType, DateType, DoubleType
)
# 1. Meters table: every column is declared with nullable=False.
meters_schema = (
    StructType()
    .add("building_id", IntegerType(), False)
    .add("meter_type", StringType(), False)  # Char(1) -> StringType
    .add("ts", TimestampType(), False)
    .add("value", DecimalType(15, 4), False)
    .add("row_id", IntegerType(), False)
)

# Load the new meters CSV using the explicit schema; the file has a header row.
new_meters_df = spark.read.csv(
    path="data/new_meters.csv",
    schema=meters_schema,
    header=True,
)

[building_6h] Received: (764, 16.00)


2.	Plot two diagrams to show data from 6b and 6c. You are free to choose the type of plot.

3.	Plot a diagram to visualise the daily shortfall/excess energy in each site. The shortfall/excess energy is defined as the predicted total sum of energy in each site, minus the metered data (the value can be positive or negative, depending on the model and data quality).