### Testing application

In [10]:
import os
# Ask spark-submit to fetch the Kafka SQL connector so that the 'kafka'
# stream format is available to this notebook.
# NOTE(review): the Scala (2.11) and Spark (2.4.4) versions in the artifact
# are presumably meant to match the local Spark install - confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 pyspark-shell'
)

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, window, max, min, avg
from pyspark.sql.functions import split
from pyspark.sql.types import StringType, FloatType, StructType, StructField
from pyspark.sql.functions import from_json, col, to_timestamp

In [12]:
# Obtain the SparkSession for this notebook (an existing session is reused
# if there is one), registered under the application name "Twitter".
spark = (
    SparkSession.builder
    .appName("Twitter")
    .getOrCreate()
)

In [13]:
# Streaming DataFrame over the Kafka topic "twitter" on broker kafka-1:9092.
# 'startingOffsets' = 'earliest' asks the source to begin at the earliest
# available offsets rather than only at newly arriving records.
df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', 'kafka-1:9092')
    .option('subscribe', 'twitter')
    .option('startingOffsets', 'earliest')
    .load()
)

In [14]:
# Show the column layout of the Kafka source frame. As the printed schema
# shows, `key` and `value` arrive as binary, so the payload has to be
# cast/decoded before it can be read as text.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [None]:
# Mirror the raw Kafka records into an in-memory table named "rawdata" so they
# can be inspected with spark.sql. The memory sink keeps its output on the
# driver and is meant for debugging / small volumes only.
#
# Despite its name, `raw_df` is the StreamingQuery handle returned by start(),
# not a DataFrame; the name is kept so other cells that use it still work.
#
# Query names must be unique among active queries, so re-running this cell
# while the previous query is still alive would make start() fail. Stop any
# earlier "rawdata" query first to keep the cell re-runnable.
for active_query in spark.streams.active:
    if active_query.name == "rawdata":
        active_query.stop()

raw_df = (
    df.writeStream
    .queryName("rawdata")
    .format("memory")
    .start()
)

In [16]:
# Snapshot of whatever the "rawdata" memory-sink table holds right now.
# NOTE(review): if this runs before the streaming query has committed its
# first micro-batch the table may presumably still be empty - re-run the cell.
raw = spark.sql("select * from rawdata")
# Display the first rows; `value` is still binary here, so it prints as bytes.
raw.show()

+----+--------------------+-------+---------+------+--------------------+-------------+
| key|               value|  topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-------+---------+------+--------------------+-------------+
|null|[7B 22 68 65 6C 6...|twitter|        0|     0|2019-10-21 21:13:...|            0|
|null|[7B 22 68 65 6C 6...|twitter|        0|     1|2019-10-21 21:13:...|            0|
|null|[7B 22 68 65 6C 6...|twitter|        0|     2|2019-10-21 21:13:...|            0|
|null|[7B 22 68 65 6C 6...|twitter|        0|     3|2019-10-21 21:13:...|            0|
|null|[7B 22 68 65 6C 6...|twitter|        0|     4|2019-10-21 21:13:...|            0|
|null|[7B 22 68 65 6C 6...|twitter|        0|     5|2019-10-21 21:14:...|            0|
|null|[7B 22 68 65 6C 6...|twitter|        0|     6|2019-10-21 21:14:...|            0|
|null|[7B 22 68 65 6C 6...|twitter|        0|     7|2019-10-21 21:14:...|            0|
|null|[7B 22 68 65 6C 6...|twitt