In [1]:
import os
from pyspark import SparkContext,SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import regexp_extract
import pyspark.sql.functions as F
from kafka3 import KafkaConsumer

In [2]:
# Request the Kafka connector packages through PYSPARK_SUBMIT_ARGS before the
# session is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.spark:spark-streaming-kafka-0-10_2.12:3.4.0,'
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.0 '
    'pyspark-shell'
)

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("rsin0045")
    .getOrCreate()
)

In [4]:
# Kafka connection settings used by the consumer and the streaming reader below.
# Both can be overridden through environment variables so the notebook can run
# against another broker/topic without editing code; the defaults are the
# values that were previously hard-coded.
hostip = os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "192.168.0.140:9092")
topic = os.environ.get("KAFKA_TOPIC", "prediction")

In [None]:
# Plain Kafka consumer on `topic`, reading from the earliest offset and decoding
# each message value from UTF-8 bytes to str.
# NOTE(review): `consumer` is not referenced by any other visible cell and is
# never closed — confirm whether this cell is still needed.
consumer = KafkaConsumer(
    topic,
    bootstrap_servers=hostip,
    auto_offset_reset='earliest',
    value_deserializer=lambda raw: raw.decode('utf-8'),
)

In [15]:
# Streaming DataFrame subscribed to `topic` on the Kafka broker at `hostip`.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", hostip)
    .option("subscribe", topic)
    .load()
)

In [16]:
# Print the schema of the streaming DataFrame loaded from Kafka.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [17]:
# Keep only the Kafka `value` column, cast from binary to a string.
df = df.select(F.expr("CAST(value AS STRING)"))

In [18]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType

# Schema applied to each JSON message: a nullable timestamp `ts` and a nullable
# integer `value`.
schema = (
    StructType()
    .add('ts', TimestampType(), True)
    .add('value', IntegerType(), True)
)

In [19]:
# Parse the JSON text in `value` against `schema`; the resulting struct column
# is exposed as `parsed_value`.
df = df.select(
    F.from_json(
        F.col("value").cast("string"),
        schema,
    ).alias('parsed_value')
)

In [20]:
# Flatten the parsed struct into top-level `ts` and `value` columns.
df_formatted = df.select(
    *[
        F.col(f"parsed_value.{field_name}").alias(field_name)
        for field_name in ("ts", "value")
    ]
)

In [21]:
# Stream the flattened rows to the console sink in append mode, triggering
# every 5 seconds, under the query name "allvalues".
# NOTE(review): the plotting cell further down selects from a table called
# `all_values` (with an underscore) — confirm whether this query was meant to
# feed it.
query_all = (
    df_formatted.writeStream
    .outputMode("append")
    .format("console")
    .queryName("allvalues")
    .trigger(processingTime='5 seconds')
    .start()
)

In [22]:
# Stop the streaming query held in `query_all`.
query_all.stop()

In [26]:
def init_plots(width=9.5, height=6):
    """Create, label and display the empty figure used for live plotting.

    Relies on ``plt`` (``matplotlib.pyplot``) being available as a global when
    the function is called.

    Args:
        width: Figure width passed to ``figsize``. Defaults to 9.5.
        height: Figure height passed to ``figsize``. Defaults to 6.

    Returns:
        tuple: ``(fig, ax)`` — the new figure and its single subplot axes.

    Raises:
        Exception: Any error raised while building the figure is propagated.
            Previously it was printed and ``None`` was returned, which made
            ``fig, ax = init_plots()`` fail with an unrelated unpacking error.
    """
    fig = plt.figure(figsize=(width, height))  # create new figure
    fig.subplots_adjust(hspace=0.8)
    ax = fig.add_subplot(111)  # single axes filling the figure grid
    ax.set_xlabel('Time')
    ax.set_ylabel('Sales')
    ax.title.set_text('Time Vs Sales')
    fig.suptitle('Time Vs Sales visualization')  # figure-level title
    fig.show()
    fig.canvas.draw()
    return fig, ax

In [28]:
import time
import matplotlib.pyplot as plt
%matplotlib notebook
fig, ax = init_plots()
while True:
    df_all = spark.sql("select * from all_values order by ts desc limit 90").toPandas()
    # Get starting timestamp to plot both graphs
    start_time = df_all['ts'][len(df_all)-1]
    df_reduced = df
    x_all = df_all['ts'].to_list()
    y_all = df_all['prediction'].to_list()
    x_reduced = df_reduced['time'].to_list()
    y_reduced = df_reduced['prediction'].to_list()
    ax.clear()
    ax.plot(x_all, y_all, '-b', label='Original')
    ax.plot(x_reduced, y_reduced, '--r', label='Reduced')
    ax.set_xlabel('Time')
    ax.set_ylabel('prediction')
    leg = ax.legend()
    fig.canvas.draw()
    time.sleep(5)