<div style= "text-align: right">
    <p style= "text-align: right; font-weight: bold; font-size: x-large;">FIT3182 Big Data Management and Processing</p>
    <p style= "text-align: right; font-weight: bold; font-size: large;">Assignment 2</p>
    <p style= "text-align: right">Foo Kai Yan</p>
    <p style= "text-align: right">kfoo0012@student.monash.edu<br><br><i>33085625<br><br><i>22<sup>th</sup> May 2024</i></p>
<div>
<hr style="border-color: black;">

## Student Statement
The assignment was completed with the assistance of some code obtained from seminar/tutorial/lab/applied class.

### Installing PyMongo

In [1]:
!pip install pymongo
!pip install pygeohash



### Import required Libraries

In [2]:
import os
import json
import time
import datetime
import pygeohash as pgh
from pprint import pprint
from pymongo import MongoClient
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf # spark
from pyspark.streaming import StreamingContext # spark streaming
from pyspark.sql.functions import col, split, element_at, when

### Check working directory

In [3]:
os.getcwd()

'/home/student/ASSIGNMENT2'

### Set os environment

In [4]:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'

### Set host ip

In [5]:
# hostip obtained using `ipconfig` command in command prompt
hostip = "10.192.45.141"

### Streaming Application Processing Data

In [6]:
# Check proximity based on geohash, precision 5
def geohash_proximity_5(record, climate):
    record_geohash = pgh.encode(record["longitude"], record["latitude"], precision = 5)
    climate_geohash = pgh.encode(climate[0]["longitude"], climate[0]["latitude"], precision = 5)
    return record_geohash == climate_geohash

# Check proximity based on geohash, precision 3
def geohash_proximity(record, climate):
    record_geohash = pgh.encode(record["longitude"], record["latitude"], precision = 3)
    climate_geohash = pgh.encode(climate[0]["longitude"], climate[0]["latitude"], precision = 3)
    return record_geohash == climate_geohash

def process_producer_data(batch_df, batch_id):
    all_data = batch_df.collect()  # Returns all elements as an array 
    # Send stream data to be transformed & analysed
    producer_data = [row.asDict() for row in all_data] 
    
    # Initialize empty lists for aqua_hotspots_record, terra_hotspots_record, and an empty dictionary for climate_record
    climate_record = []
    aqua_hotspots_record = []
    terra_hotspots_record = [] 
    
    # Producer 1: Climate topic_name = "Climate", ["producer_id"] = "producer1_climate"
    # Producer 2: AQUA topic_name = "Hotspot_AQUA", ["producer_id"] = "producer2_hotspot_aqua"
    # Producer 3: TERRA topic_name = "Hotspot_TERRA", ["producer_id"] = "producer3_hotspot_terra"
    
    # For each item from the data batch from the Kafka stream 
    for eachdata in range(len(producer_data)): 
        # Check if 'value' key exists and is a bytearray
        if 'value' in producer_data[eachdata] and isinstance(producer_data[eachdata]['value'], bytearray):
            # Convert byte array to string and then to JSON
            newdata = producer_data[eachdata]['value'].decode('utf-8')
            produced_data = json.loads(newdata)
            producer_data[eachdata] = produced_data
            
    for producer in producer_data:
        producer_id = producer["producer_id"]

        # Sort data depending on the producer_id information
        if producer_id == "producer1_climate":
            climate_record.append(producer)
            # print("climate record", climate_record)
        elif producer_id == "producer2_hotspot_aqua":
            aqua_hotspots_record.append(producer)
            # print("aqua_hotspots_record", aqua_hotspots_record)
        elif producer_id == "producer3_hotspot_terra":
            terra_hotspots_record.append(producer)
            # print("terra_hotspots_record", terra_hotspots_record)
                
    # If there are no climate_record, skip processing this batch
    # Check if the climate_record is empty, have 1 or more than 1 climate_record
    if len(climate_record) == 1:
        climate_record = climate_record
    elif len(climate_record) > 1:
        # If more than 1 then select the first climate_record only
        climate_record = climate_record[0]
    else:
        print("No climate record present in this batch. Skipping...")
        return  # Exit the function

    # Analyse hotspots data, find if any are close by & merge
    new_hotspots_record = process_hotspots(aqua_hotspots_record, terra_hotspots_record, climate_record)
    # Merge hotspots with climate depending if close & label if natural or other
    new_climate_record = process_climate(climate_record, new_hotspots_record)
    print("BACK TO MAIN FUNCTION")
    return addtodatabase(new_climate_record)

def process_hotspots(aqua_hotspots_record, terra_hotspots_record, climate_record):
    # Initialize empty array for hotspots records from both terra and aqua
    hotspots = []
    print("IN PROCESS HOTSPOTS")
    
    # Process aqua_hotspots_record
    for each_record in aqua_hotspots_record:
        print("IN AQUA")
        if geohash_proximity(each_record, climate_record):
            hotspots.append(each_record)
            
    # Process terra_hotspots_record
    for each_record in terra_hotspots_record:
        print("IN TERRA")
        if geohash_proximity(each_record, climate_record):
            hotspots.append(each_record)
            
    return hotspots

def process_climate(climate_record, hotspots_record):
    # Initialize empty array for fire events records
    print("IN PROCESS CLIMATE")
    fire_events = []
    for each_record in hotspots_record:
        fire_event = process_fire_event(climate_record, each_record)
        if fire_event is not None:
            fire_events.append(fire_event)
    print("FIRE_EVENTS: ", fire_events)
    # Add new key to climate_record and add the fire_events as the value to the key
    # climate_record format: [{}] ; fire_events format: [{}]
    new_key = "fire_events"
    new_value = fire_events
    
    # Since the top-level structure is a list, access the first dictionary
    if climate_record:  # Check if the list is not empty
        climate_record[0][new_key] = new_value
    else:
        # If the list is empty, create a new dictionary with the new key
        climate_record.append({new_key: new_value})
    print("CLIMATE RECORD FROM PROCESS CLIMATE: ", climate_record)
    return climate_record

def process_fire_event(climate_record, hotspots_record):
    print("IN PROCESS FIRE EVENT")
    # Initialize an empty dictionary to store fire event data
    fire_happening = {}

    # Check if the hotspot is geographically close to the climate data with a precision of 5
    if geohash_proximity_5(hotspots_record, climate_record) is True:
        print("hotspots_record: ", hotspots_record)
        
        # Get the created_time for Data Visualisation 
        fire_happening["created_time"] = hotspots_record.get("created_time")
        fire_happening["created_hour"] = hotspots_record.get("created_time").split(':')[0]
        
        # Calculate the average surface temperature from both hotspot and climate data
        average_temp = (hotspots_record.get("surface_temperature_celcius") + hotspots_record.get("surface_temperature_celcius")) / 2
        fire_happening["average_surface_temp"] = average_temp

        # Calculate the average confidence level from both hotspot and climate data
        average_confidence = (hotspots_record.get("confidence") + hotspots_record.get("confidence")) / 2
        fire_happening["confidence"] = average_confidence

        # Get the air_temperature_celcius and GHI_w/m2 value
        if climate_record:  # Check if the list is not empty
            air_temp = climate_record[0].get("air_temperature_celcius")
            solar_irradiance = climate_record[0].get("GHI_w/m2")
        else:
            air_temp = None
            solar_irradiance = None
        # Determine the cause of the fire based on climate conditions            
        if air_temp > 20 and solar_irradiance > 180:
            fire_happening["cause"] = "natural"
        else:
            fire_happening["cause"] = "others"
        
        print("FIRE HAPPENING: ", fire_happening)
        # Return the dictionary containing fire event data
        return fire_happening
    # If the hotspot is not close to the climate data, return None
    return None

def addtodatabase(dictionary_in_array):
    print("ADD TO DATABASE")
    # Only save data with contents
    if len(dictionary_in_array) >= 1:

        client = MongoClient(f'mongodb://{hostip}:27017/')

        db = client.fit3182_assignment_db
        climate_collection = db.climate_collection

        # Insert climate data into database
        inserting = climate_collection.insert_one(dictionary_in_array[0])
        print("ADD SUCCESSFUL: ", inserting)

        client.close()

In [7]:
# names of three Kafka topics separated by commas (Obtained from Producer1, 2 and 3)
topic = "Climate, Hotspot_AQUA, Hotspot_TERRA"

spark = (
    SparkSession.builder
    .master('local[*]') # local[*] means Spark will run locally using all available cores on the machine
    .appName('Streaming Climate Data')
    .getOrCreate() # Creates a SparkSession if it does not exist or returns the existing one
)

# This is basically connecting to kafka server
topic_stream_df = (
    spark.readStream.format('kafka') # 'kafka' because u are receiving from kafka
    .option('kafka.bootstrap.servers', f'{hostip}:9092')
    .option('subscribe', topic) # Subscribes to the Kafka topics listed in the topic variable
    .load() # Loads the data stream from Kafka and returns it as a DataFrame
)

# Creates a new DataFrame data_sdf by selecting only the ‘value’ column from the topic_stream_df DataFrame
# This ‘value’ column contains the actual message data obtained from Kafka
data_sdf = topic_stream_df.select('value')

db_writer = (
    data_sdf
    .writeStream # Indicates that the data in data_sdf will be written to an output sink
    .outputMode('append') # Only new rows will be written to the output sink as they arrive
    # collect data for 10 seconds
    .trigger(processingTime='10 seconds')
    # each batch of 10 seconds will run func
    .foreachBatch(process_producer_data)) 
    # For each batch collected, the process_producer_data function will be called to process the data
    # process_producer_data function is essentially the main function for the stream data processing

In [8]:
'''
This code section manage the lifecycle of a streaming query in a controlled manner
It allows for a graceful shutdown when the user decides to interrupt the process
'''
try:
    query = db_writer.start() # Starts the streaming query
    query.awaitTermination() # Waits for the streaming query to finish
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopping query.') # Gracefully handle a user’s request to interrupt the program
finally:
    query.stop() # Stops the streaming query

No climate record present in this batch. Skipping...
No climate record present in this batch. Skipping...
IN PROCESS HOTSPOTS
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN TERRA
IN TERRA
IN PROCESS CLIMATE
IN PROCESS FIRE EVENT
IN PROCESS FIRE EVENT
IN PROCESS FIRE EVENT
IN PROCESS FIRE EVENT
IN PROCESS FIRE EVENT
IN PROCESS FIRE EVENT
IN PROCESS FIRE EVENT
FIRE_EVENTS:  []
CLIMATE RECORD FROM PROCESS CLIMATE:  [{'latitude': -37.478, 'longitude': 148.117, 'air_temperature_celcius': 11.0, 'relative_humidity': 43.9, 'windspeed_knots': 11.2, 'max_wind_speed': 16.9, 'GHI_w/m2': 99.0, 'precipitation_flag': 'G', 'precipitation': 0.12, 'latest_date': '2024-01-10T00:00:00', 'producer_id': 'producer1_climate', 'fire_events': []}]
BACK TO MAIN FUNCTION
ADD TO DATABASE
ADD SUCCESSFUL:  <pymongo.results.InsertOneResult object at 0x7f0330654370>
IN PROCESS HOTSPOTS
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN AQUA
IN TERRA
IN TERRA


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.8/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


Interrupted by CTRL-C. Stopping query.


### Double Check data is added to MongoDB

client = MongoClient(f'mongodb://{hostip}:27017/')
db = client.fit3182_assignment_db
climate_collection = db.climate_collection
cursor = db.climate_collection.find({})

for document in cursor:
    pprint(document)

If there are fire events:
```
{'GHI_w/m2': 152.0,
 '_id': ObjectId('664db85c06058a4bb12f4f43'),
 'air_temperature_celcius': 18.0,
 'fire_events': [{'average_surface_temp': 48.0,
                  'cause': 'others',
                  'confidence': 74.0,
                  'created_hour': '17',
                  'created_time': '17:15:55'}],
 'latest_date': '2027-10-20T00:00:00',
 'latitude': -36.2032,
 'longitude': 145.3025,
 'max_wind_speed': 14.0,
 'precipitation': 0.0,
 'precipitation_flag': 'I',
 'producer_id': 'producer1_climate',
 'relative_humidity': 52.0,
 'windspeed_knots': 7.1}
```

If there are no fire events:
```
{'GHI_w/m2': 102.0,
 '_id': ObjectId('664d92c887f00ff1a1c1bc4d'),
 'air_temperature_celcius': 11.0,
 'fire_events': [],
 'latest_date': '2025-03-03T00:00:00',
 'latitude': -37.8088,
 'longitude': 142.2291,
 'max_wind_speed': 8.9,
 'precipitation': 0.0,
 'precipitation_flag': 'G',
 'producer_id': 'producer1_climate',
 'relative_humidity': 40.8,
 'windspeed_knots': 6.4}
```