In [None]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import col, window
from time import sleep
from IPython.display import clear_output
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime

import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from the libraries used
# below; narrow the filter if a specific warning needs to stay visible.
warnings.filterwarnings('ignore')

In [None]:
# Open a streaming read on the `sparkTopic` Kafka topic served by the local
# broker. `spark` is the session object expected to exist in the kernel.
streamingRawDF = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "sparkTopic")
    .load()
)

# Keep only what the model and the dashboard need: the record value cast to a
# string column named `text`, plus the record `timestamp`.
streamingDF = streamingRawDF.selectExpr(
    "CAST(value AS STRING) as text",
    "timestamp",
)

In [None]:
# Load the previously saved pipeline model from local disk.
# NOTE(review): the path is absolute and machine-specific; the directory name
# suggests a logistic-regression pipeline trained with regParam=0.3 -- confirm.
lrModel = PipelineModel.load(
    "file:/home/student/Desktop/twitch-big-data-project/models/lr_regParam0.3"
)

In [None]:
# Run the loaded pipeline over the incoming messages and keep only the message
# text, the model's prediction and the event timestamp.
streamingPredictionDF = (
    lrModel
    .transform(streamingDF)
    .select('text', 'prediction', 'timestamp')
)

In [None]:
# Output directory for the scored messages and the checkpoint directory of the
# query that writes them (both relative to the kernel's working directory).
logs_path = 'message-logs'
checkpoint_path = 'message-checkpoints'

# Persist every scored message as Parquet in append mode. The StreamingQuery
# handle is kept (like `queryStream` for the memory sink) so the query can be
# inspected or stopped later, e.g. `ingestionQuery.status` /
# `ingestionQuery.stop()`, instead of being lost once the cell finishes.
ingestionQuery = (
    streamingPredictionDF
    .writeStream
    .format("parquet")
    .queryName("changes_ingestion")
    .option("checkpointLocation", checkpoint_path)
    .option("path", logs_path)
    .outputMode("append")
    .start()
)

In [None]:
# Take the schema from the DataFrame that is being written to `logs_path`
# rather than inferring it from the Parquet files on disk: the ingestion query
# has only just been started, so on a fresh run the directory may not contain
# any data file yet and `spark.read.parquet(logs_path)` can fail to infer one.
mySchema = streamingPredictionDF.schema

# Stream the Parquet log back in; a streaming file source requires an explicit
# schema, which is why `mySchema` is supplied here.
df_stream = (
    spark
    .readStream
    .schema(mySchema)
    .format("parquet")
    .load(logs_path)
)

In [None]:
# Count messages per prediction inside 10-minute windows. The slide interval
# equals the window length, so the windows do not overlap. The watermark on
# `timestamp` is also set to 10 minutes.
df_count = (
    df_stream
    .withWatermark("timestamp", "10 minutes")
    .groupBy(
        window(col("timestamp"), "10 minutes", "10 minutes"),
        col("prediction"),
    )
    .count()
)

In [None]:
# Expose the windowed counts as the in-memory table `msg_changes` so they can
# be queried with `spark.sql`.
# NOTE(review): with "update" output mode the memory sink is understood to
# append each refreshed (window, prediction) row rather than overwrite it, so
# the table may hold several rows per key -- confirm against the Spark docs.
queryStream = (
    df_count.writeStream
    .queryName("msg_changes")
    .outputMode("update")
    .format("memory")
    .start()
)

In [None]:
# Live dashboard: every few seconds, show the per-prediction message counts of
# the most recent window as a table and a bar chart, until interrupted.
matplotlib.rc('font', family='DejaVu Sans')
sns.set(style="whitegrid")
# The colour codes never change between refreshes, so set them once up front.
sns.set_color_codes("muted")

# Seconds to wait between two refreshes of the dashboard.
REFRESH_SECONDS = 10

# Counts for the latest window found in the in-memory table `msg_changes`.
# The query feeding that table runs in "update" output mode, where the memory
# sink appends a new row with the refreshed running total each time a
# (window, prediction) group changes. Summing those rows would add successive
# running totals together; because a count only grows, the current value is
# the maximum.
LATEST_WINDOW_SQL = """
    select
        window.start
        ,window.end
        ,prediction
        ,max(count) message_count
    from
        msg_changes
    where
        window.start = (select max(window.start) from msg_changes)
    group by
        window.start
        ,window.end
        ,prediction
    order by
        prediction desc
"""

try:
    while True:
        # Replace the previous table/plot instead of stacking outputs.
        clear_output(wait=True)
        df = spark.sql(LATEST_WINDOW_SQL).toPandas()

        display(df)

        if df.empty:
            # Nothing has reached the memory table yet; skip plotting.
            print("waiting for data ...")
        else:
            fig = plt.figure(figsize=(8, 6))
            try:
                sns.barplot(x="prediction", y="message_count", data=df).set(title='Just Chatting')
                plt.show()
            except ValueError as err:
                # Keep the dashboard alive on a plotting problem, but say so
                # instead of failing silently.
                print("plot skipped:", err)
            finally:
                # Release the figure on every path so an endless loop does not
                # accumulate open figures.
                plt.close(fig)

        # Always pause, including when there was nothing to plot; otherwise
        # the loop would spin and hammer Spark with queries.
        sleep(REFRESH_SECONDS)

except KeyboardInterrupt:
    print("process interrupted.")