### Testing application

In [1]:
import os

# Ask spark-submit to pull in the Kafka source for Structured Streaming
# (Scala 2.11 build, Spark 2.4.4). This cell runs ahead of the cell that
# creates the SparkSession, so the setting is in place when Spark starts.
os.environ.update(
    PYSPARK_SUBMIT_ARGS='--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 pyspark-shell'
)

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, window, max, min, avg
from pyspark.sql.functions import split
from pyspark.sql.types import StringType, FloatType, StructType, StructField, IntegerType, TimestampType
from pyspark.sql.functions import from_json, col, to_timestamp

In [3]:
# Obtain the Spark session for this notebook: reuse one if it already exists,
# otherwise start a new one registered under the application name "Twitter".
spark = (SparkSession
         .builder
         .appName("Twitter")
         .getOrCreate())

In [4]:
# Pattern for `created_at` strings shaped like "Tue Oct 29 14:42:01 +0000 2019".
# `Z` parses the numeric UTC offset. The previous pattern matched "+0000" as
# literal text, which left the offset unparsed, so the value was interpreted in
# the Spark session time zone and came out shifted on any non-UTC driver.
timestampFormat = "E MMM dd HH:mm:ss Z yyyy"

# Schema of the JSON payload carried in each Kafka record's value.
# Only these three fields are extracted; every field is nullable.
schema = StructType([
    StructField('created_at', TimestampType()),  # parsed with timestampFormat
    StructField('id_str', StringType()),         # tweet id kept as a string
    StructField('text', StringType()),           # tweet body
])

In [5]:
# Subscribe to the `twitter` Kafka topic, starting from the latest offsets,
# and decode each record:
#   key   -> cast to a string
#   value -> cast to a string, then parsed as JSON into a struct following
#            `schema`, with `created_at` read using `timestampFormat`
df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', 'kafka-1:9092')
    .option('subscribe', 'twitter')
    .option('startingOffsets', 'latest')
    .load()
    .select(
        col("key").cast("string"),
        from_json(
            col("value").cast("string"),
            schema,
            {"timestampFormat": timestampFormat},
        ).alias("value"),
    )
)

df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- created_at: timestamp (nullable = true)
 |    |-- id_str: string (nullable = true)
 |    |-- text: string (nullable = true)



In [7]:
# Flatten the parsed payload: promote every field of the `value` struct to a
# top-level column (the Kafka key is dropped here).
tweets = df.select(col("value.*"))
tweets.printSchema()

root
 |-- created_at: timestamp (nullable = true)
 |-- id_str: string (nullable = true)
 |-- text: string (nullable = true)



In [8]:
# Stream the flattened tweets into an in-memory table named `tweets` so the
# following cells can query it with spark.sql(...).
# Note: despite its name, `raw_df` holds the handle returned by `.start()`
# (the running streaming query), not a DataFrame.
raw_df = tweets.writeStream.format("memory").queryName("tweets").start()

In [10]:
# Snapshot of the in-memory `tweets` table, newest tweets first.
# Re-run this cell to see rows that have arrived since the last run.
raw = spark.sql(
    "select * from tweets order by created_at desc"
)
raw.show()

+-------------------+-------------------+------------------------------------+
|         created_at|             id_str|                                text|
+-------------------+-------------------+------------------------------------+
|2019-10-29 14:42:01|1189190655601336320|                UP is the most ha...|
|2019-10-29 14:42:01|1189190659485270017|                RT @alfuckuhard: ...|
|2019-10-29 14:42:01|1189190655852986370|                RT @steelers1288:...|
|2019-10-29 14:42:01|1189190659111960578|                RT @renminrise: j...|
|2019-10-29 14:42:00|1189190652824698880|                RT @MickstapeShow...|
|2019-10-29 14:41:59|1189190648307408902|                RT @CentristDan: ...|
|2019-10-29 14:41:58|1189190643668488194|                .@richarddeitsch ...|
|2019-10-29 14:41:57|1189190640317255681|                RT @CultureCentra...|
|2019-10-29 14:41:56|1189190638228529152|                Today's MSU baske...|
|2019-10-29 14:41:55|1189190631563771905|           

In [70]:
# Get the count of tweets per 10 seconds
tweet_count_df = tweets.groupBy(window(tweets.created_at, '10 seconds')).count()

In [71]:
# Publish the per-window counts to an in-memory table named `window_count`.
# "complete" output mode rewrites the whole aggregated result on each trigger.
query = (
    tweet_count_df.writeStream
    .queryName("window_count")
    .outputMode("complete")
    .format("memory")
    .start()
)

In [88]:
# Snapshot of the `window_count` table, most recent window first.
# This rebinds `raw`, which an earlier cell used for the tweets snapshot.
raw = spark.sql(
    "select * from window_count order by window desc"
)
raw.show()

+--------------------+-----+
|              window|count|
+--------------------+-----+
|[2019-10-23 16:05...|   21|
|[2019-10-23 16:05...|   15|
|[2019-10-23 16:05...|   11|
|[2019-10-23 16:05...|   23|
|[2019-10-23 16:04...|   15|
|[2019-10-23 16:04...|   14|
|[2019-10-23 16:04...|   16|
|[2019-10-23 16:04...|   20|
|[2019-10-23 16:04...|   15|
|[2019-10-23 16:04...|   17|
|[2019-10-23 16:03...|   13|
|[2019-10-23 16:03...|   10|
|[2019-10-23 16:03...|   15|
|[2019-10-23 16:03...|   12|
|[2019-10-23 16:03...|    5|
|[2019-10-23 16:03...|   13|
|[2019-10-23 16:02...|   13|
|[2019-10-23 16:02...|   23|
|[2019-10-23 16:02...|   22|
|[2019-10-23 16:02...|   13|
+--------------------+-----+
only showing top 20 rows

