# Week 11 : Streaming data from open data

This week's code reads parking sensor data from a Kafka topic and stores it in MongoDB for further processing. The code is largely the same as Week 10's, except that the `DbWriter` class has been modified to do some preprocessing on the streaming data.

In [1]:
import json
from pymongo import MongoClient
from pyspark.sql import SparkSession

import os

# Kafka connector packages handed to PySpark through PYSPARK_SUBMIT_ARGS.
# The variable is set here, before the SparkSession is built further down.
_KAFKA_PACKAGES = (
    'org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0',
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0',
)
os.environ['PYSPARK_SUBMIT_ARGS'] = f"--packages {','.join(_KAFKA_PACKAGES)} pyspark-shell"

In [2]:
# Address used for both the Kafka bootstrap server and the MongoDB client.
host_ip = "10.192.45.93"

# Get (or reuse) the session that runs the streaming query, in local mode.
spark = SparkSession.builder.master('local[*]').appName('Streaming Parking Data').getOrCreate()

In [3]:
# Name used both for the Kafka topic that is subscribed to and for the
# MongoDB collection the readings are written into.
topic = 'week11'

In [4]:
# Streaming DataFrame backed by the Kafka topic; both source options are
# supplied in a single `options` call.
kafka_sdf = (
    spark.readStream
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': f'{host_ip}:9092',
        'subscribe': topic,
    })
    .load()
)

We don't care about any columns other than `value`, which contains our serialized (JSON) data.

In [5]:
# Keep only the `value` column (the serialized JSON payload); the other
# columns of the Kafka stream are not used.
parking_sdf = kafka_sdf.select('value')

In [6]:
class DbWriter:
    """Foreach sink that upserts each parking-sensor reading into MongoDB.

    An instance is handed to ``writeStream.foreach``; the ``open`` /
    ``process`` / ``close`` methods are the hooks that get invoked for the
    streamed rows. Documents go to the collection named by ``topic`` in the
    ``fit3182_db`` database.
    """

    # called at the start of processing each partition in each output micro-batch
    def open(self, partition_id, epoch_id):
        """Create the MongoDB client and select the target database.

        ``partition_id`` and ``epoch_id`` are accepted to satisfy the hook
        signature but are not used. Always returns True.
        """
        self.mongo_client = MongoClient(
            host=f'{host_ip}',
            port=27017
        )
        self.db = self.mongo_client['fit3182_db']
        return True

    # called once per row of the result dataframe
    def process(self, row):
        """Decode one JSON reading from ``row.value`` and upsert it by sensor id.

        NOTE: the write is an upsert keyed on ``_id``, so replaying a row
        (e.g. the query fails and restarts before the current micro-batch was
        fully written) rewrites the same document instead of adding a
        duplicate.

        NOTE(review): readings without a 'kerbsideid' all get ``_id`` None and
        therefore overwrite one another; confirm whether they should be
        skipped instead.
        """
        data = json.loads(row.value)

        # `.get` returns None when 'location' is absent, and subscripting None
        # raised TypeError, which aborted the whole streaming job. Fall back to
        # an empty pair so the reading is stored with null coordinates. A
        # location that is present but malformed still raises and stays visible.
        location = data.get('location') or (None, None)

        db_record = {
            '_id': data.get('kerbsideid'),
            'latitude': location[0],
            'longitude': location[1],
            'status': data.get('status_description'),
        }

        self.db[topic].replace_one({'_id': db_record['_id']}, db_record, upsert=True)

    # called once all rows have been processed (possibly with error)
    def close(self, err):
        """Release the MongoDB connection; ``err`` is ignored."""
        self.mongo_client.close()

In [7]:
# for debugging, print on console
class ConsoleWriter:
    """Debugging sink that prints each parsed reading instead of storing it.

    It builds the same four fields as ``DbWriter`` (``_id``, ``latitude``,
    ``longitude``, ``status``) and prints the resulting dict.
    """

    def open(self, partition_id, epoch_id):
        """Nothing to set up; always returns True."""
        return True

    def process(self, row):
        """Decode one JSON reading from ``row.value`` and print it as a dict."""
        data = json.loads(row.value)

        # `.get` returns None when 'location' is absent, and subscripting None
        # raised TypeError. Fall back to an empty pair so such a reading is
        # printed with None coordinates. A location that is present but
        # malformed still raises.
        location = data.get('location') or (None, None)

        record = {
            '_id': data.get('kerbsideid'),
            'latitude': location[0],
            'longitude': location[1],
            'status': data.get('status_description'),
        }
        print(record)

    def close(self, err):
        """Nothing to release; ``err`` is ignored. Always returns True."""
        return True

In [8]:
# Route every row of the stream through DbWriter. `foreach` selects the sink
# itself, so no `.format(...)` call is needed. The checkpoint directory lets
# the query keep its progress between runs.
writer = (
    parking_sdf.writeStream
    .option("checkpointLocation", "./parking_sdf_checkpoints")
    .outputMode('append')
    .foreach(DbWriter())
)

In [9]:
# Start the streaming query and block until it terminates or is interrupted.
# `query` is bound before the `try` so that the `finally` clause cannot raise
# NameError (and hide the real error) when `writer.start()` itself fails.
query = None
try:
    query = writer.start()
    query.awaitTermination()
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopping query.')
finally:
    # Stop only if a query was actually started.
    if query is not None:
        query.stop()

StreamingQueryException: Query [id = 199b2ae0-dd94-42cf-b438-62fb7677e659, runId = afd2b860-21f2-4750-afa7-a7a3a0cba304] terminated with exception: Writing job aborted