### Testing application

In [1]:
import os
# Request the Spark SQL / Kafka connector package through PYSPARK_SUBMIT_ARGS.
# The variable is assigned here, ahead of the SparkSession that is built further down.
os.environ.update(
    PYSPARK_SUBMIT_ARGS='--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 pyspark-shell',
)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, avg, count
from pyspark.sql.types import StringType, FloatType, StructType, StructField, IntegerType, TimestampType, DoubleType
from pyspark.sql.functions import from_json, col, to_timestamp

### Create Spark session

In [4]:
# Obtain the SparkSession used by every cell below, under the application name "Twitter"
spark = (
    SparkSession.builder
    .appName("Twitter")
    .getOrCreate()
)

### Subscribe to the Twitter topic

In [5]:
# Timestamp pattern handed to from_json when parsing the `timestamp` field
timestampFormat = "dd-MM-yyyy HH:mm:ss"

# Schema of the JSON payload carried in the Kafka message value.
# Fields are declared nullable: the schema printed for the from_json result reports every
# field as `nullable = true`, so a non-nullable declaration here would not describe the
# parsed column.
twitter_schema = StructType([
    StructField('timestamp', TimestampType(), True),
    StructField('text', StringType(), True),
    StructField('sentiment', DoubleType(), True),
])

In [6]:
# Subscribe to the `twitter` Kafka topic as a streaming DataFrame.
# The key is cast to a string; the value is cast to a string and parsed as JSON
# against twitter_schema, using the notebook's timestamp pattern.
twitter_df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', 'kafka:9092')
    .option('startingOffsets', 'latest')
    .option('subscribe', 'twitter')
    .load()
    .select(
        col("key").cast("string"),
        from_json(
            col("value").cast("string"),
            twitter_schema,
            {"timestampFormat": timestampFormat},
        ).alias("value"),
    )
)

twitter_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- sentiment: double (nullable = true)



In [7]:
# Promote every field of the parsed `value` struct to a top-level column
twitter = twitter_df.select(col('value.*'))
twitter.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- sentiment: double (nullable = true)



In [8]:
# Start a streaming query that writes the flattened tweets to the memory sink;
# the query name "twitter" is the table name queried with spark.sql afterwards.
twitter_df_stream = twitter.writeStream.start(
    format="memory",
    queryName="twitter",
)

In [10]:
# Snapshot whatever the "twitter" memory table holds right now and print the first rows
raw = spark.sql("select * from twitter")
raw.show(n=20, truncate=True)

+-------------------+--------------------+---------+
|          timestamp|                text|sentiment|
+-------------------+--------------------+---------+
|2019-11-14 21:53:44|If only the RDD i...|      0.0|
|2019-11-14 21:53:45|RT @LosingSergej:...|      0.0|
|2019-11-14 21:53:45|RT @extstock: Rec...|   0.5562|
|2019-11-14 21:53:46|Thu Nov 14 22:52:...|   0.2732|
|2019-11-14 21:53:46|Aye! Getting this...|   0.9575|
|2019-11-14 21:53:46|@stevekinslow I c...|   0.6798|
|2019-11-14 21:53:47|Sales Representat...|      0.0|
|2019-11-14 21:53:50|RT @qhfofficial: ...|      0.0|
|2019-11-14 21:53:53|WikiLeaks has rec...|     0.34|
|2019-11-14 21:53:57|RT @HamEggsnSam: ...|   0.4939|
+-------------------+--------------------+---------+



### Subscribe to crypto topic

In [11]:
# Schema of the JSON payload carried in the crypto topic's message value.
# Fields are declared nullable: the schema printed for the from_json result reports both
# fields as `nullable = true`, so a non-nullable declaration here would not describe the
# parsed column.
crypto_schema = StructType([
    StructField('timestamp', TimestampType(), True),
    StructField('price', DoubleType(), True),
])

In [12]:
# Timestamp pattern handed to from_json when parsing the `timestamp` field
timestampFormat = "dd-MM-yyyy HH:mm:ss"

# Subscribe to the `crypto` Kafka topic as a streaming DataFrame.
# The key is cast to a string; the value is cast to a string and parsed as JSON
# against crypto_schema.
crypto_df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', 'kafka:9092')
    .option('startingOffsets', 'latest')
    .option('subscribe', 'crypto')
    .load()
    .select(
        col("key").cast("string"),
        from_json(
            col("value").cast("string"),
            crypto_schema,
            {"timestampFormat": timestampFormat},
        ).alias("value"),
    )
)

crypto_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)
 |    |-- price: double (nullable = true)



In [13]:
# Promote every field of the parsed `value` struct to a top-level column
crypto = crypto_df.select(col('value.*'))
crypto.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- price: double (nullable = true)



In [14]:
# Start a streaming query that writes the flattened price rows to the memory sink;
# the query name "crypto" is the table name queried with spark.sql afterwards.
crypto_df_stream = crypto.writeStream.start(
    format="memory",
    queryName="crypto",
)

In [17]:
# Snapshot whatever the "crypto" memory table holds right now and print the first rows.
# Note: this rebinds `raw`, which an earlier cell used for the twitter table.
raw = spark.sql("select * from crypto")
raw.show(n=20, truncate=True)

+-------------------+-------+
|          timestamp|  price|
+-------------------+-------+
|2019-11-14 21:54:14|7857.57|
+-------------------+-------+



### Twitter aggregation

In [18]:
# Per sliding window (30 seconds long, advancing every 5 seconds) compute the mean
# sentiment and the number of tweets, with a 1-minute watermark on the event time.
tweet_aggregation = (
    twitter
    .withWatermark(eventTime='timestamp', delayThreshold='1 minute')
    .groupBy(
        window('timestamp', windowDuration='30 seconds', slideDuration='5 seconds')
    )
    .agg(
        avg('sentiment').alias('sentiment'),
        count('timestamp').alias('n_tweets'),
    )
)

In [19]:
# Start the tweet aggregation query in append mode, writing to the memory table "tweets_aggs"
tweet_agg_stream = tweet_aggregation.writeStream.start(
    format='memory',
    outputMode='append',
    queryName='tweets_aggs',
)

In [20]:
# Inspect the schema of the memory table fed by the tweet aggregation query
(
    spark
    .sql('select * from tweets_aggs')
    .printSchema()
)

root
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- sentiment: double (nullable = true)
 |-- n_tweets: long (nullable = false)



In [21]:
# Keep only the window start plus the two aggregates; `start` is the column the
# later join cell matches on.
tweet_agg_df = spark.sql(
    'select window.start, sentiment, n_tweets from tweets_aggs'
)
tweet_agg_df.show(n=20, truncate=False)

+-----+---------+--------+
|start|sentiment|n_tweets|
+-----+---------+--------+
+-----+---------+--------+



In [133]:
# Stop the tweet aggregation streaming query, then read back its status.
# The bare expression on the last line is what the cell renders as output.
tweet_agg_stream.stop()
tweet_agg_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

### Crypto aggregation

In [46]:
# Average crypto price over 30-second windows that slide every 5 seconds,
# with a 2-second watermark on the event-time column.
crypto_aggregation = (
    crypto
    .withWatermark('timestamp', '2 seconds')
    .groupBy(window('timestamp', '30 seconds', '5 seconds'))
    .agg(avg('price').alias('price'))
    # Flatten the window struct so `start` is a plain column (the join cell matches on it)
    .select(['window.start', 'price', 'window.end'])
)

# Bare expression: renders the DataFrame's column names and types as the cell output
crypto_aggregation

DataFrame[start: timestamp, price: double, end: timestamp]

In [47]:
# Start the crypto aggregation query in complete mode, writing to the memory table "crypto_agg".
# NOTE(review): the tweet aggregation uses append mode while this one uses complete mode.
# Per the Structured Streaming guide, complete mode keeps all aggregate state, so the
# 2-second watermark upstream presumably does not bound state here. Confirm this is intended.
crypto_agg_stream = crypto_aggregation.writeStream.start(
    format='memory',
    outputMode='complete',
    queryName='crypto_agg',
)

In [50]:
# Read the aggregated windows back from the "crypto_agg" memory table
crypto_agg_df = spark.sql(
    'select start,end, price from crypto_agg'
)
crypto_agg_df.show(n=20, truncate=False)

+-------------------+-------------------+-----------------+
|start              |end                |price            |
+-------------------+-------------------+-----------------+
|2019-11-14 21:59:50|2019-11-14 22:00:20|7857.868333333333|
|2019-11-14 21:59:30|2019-11-14 22:00:00|7857.518333333333|
|2019-11-14 22:00:20|2019-11-14 22:00:50|7859.1625        |
|2019-11-14 21:59:35|2019-11-14 22:00:05|7857.535         |
|2019-11-14 22:00:25|2019-11-14 22:00:55|7859.170000000001|
|2019-11-14 21:59:10|2019-11-14 21:59:40|7857.596666666667|
|2019-11-14 21:59:55|2019-11-14 22:00:25|7858.143333333333|
|2019-11-14 21:59:00|2019-11-14 21:59:30|7857.65          |
|2019-11-14 22:00:10|2019-11-14 22:00:40|7858.895         |
|2019-11-14 21:59:25|2019-11-14 21:59:55|7857.543333333334|
|2019-11-14 22:00:35|2019-11-14 22:01:05|7859.17          |
|2019-11-14 22:00:05|2019-11-14 22:00:35|7858.658333333333|
|2019-11-14 21:59:40|2019-11-14 22:00:10|7857.578333333334|
|2019-11-14 21:59:45|2019-11-14 22:00:15

In [45]:
# Stop the crypto aggregation streaming query, then read back its status.
# The bare expression on the last line is what the cell renders as output.
crypto_agg_stream.stop()
crypto_agg_stream.status

{'message': 'Terminated with exception: Job 187 cancelled part of cancelled job group 64faa655-b488-402b-856c-18c622a2ec91',
 'isDataAvailable': False,
 'isTriggerActive': False}

### Joining the two streams

In [146]:
# Print the first rows of the tweet aggregates (default row limit and truncation spelled out)
tweet_agg_df.show(n=20, truncate=True)

+-------------------+-------------------+--------+
|              start|          sentiment|n_tweets|
+-------------------+-------------------+--------+
|2019-11-14 21:19:35|0.16271666666666665|       6|
|2019-11-14 21:19:40| 0.2368888888888889|       9|
|2019-11-14 21:19:45|0.14646666666666666|      12|
|2019-11-14 21:19:50|          0.1613125|      16|
|2019-11-14 21:19:55|0.15598695652173913|      23|
|2019-11-14 21:20:00|0.19555185185185187|      27|
|2019-11-14 21:20:05|            0.20652|      25|
|2019-11-14 21:20:10|0.23004285714285716|      28|
|2019-11-14 21:20:15|           0.272624|      25|
|2019-11-14 21:20:25| 0.3045962962962963|      27|
|2019-11-14 21:20:20|0.24680357142857146|      28|
|2019-11-14 21:20:30| 0.2979185185185185|      27|
+-------------------+-------------------+--------+



In [147]:
# Print the first rows of the crypto aggregates (default row limit and truncation spelled out)
crypto_agg_df.show(n=20, truncate=True)

+-----+-----+
|start|price|
+-----+-----+
+-----+-----+



In [57]:
# Inner-join the two aggregate snapshots on the shared window `start` column and print the result
(
    tweet_agg_df
    .join(crypto_agg_df, on='start', how='inner')
    .show()
)

+-------------------+-------------------+--------+------------------+
|              start|          sentiment|n_tweets|             price|
+-------------------+-------------------+--------+------------------+
|2019-11-14 20:04:40|0.25397142857142857|       7|7850.0025000000005|
|2019-11-14 20:03:45|0.20998965517241377|      29| 7848.673333333333|
|2019-11-14 20:03:50| 0.1462103448275862|      29| 7848.803333333333|
|2019-11-14 20:04:10|0.19134285714285718|      28|          7848.965|
|2019-11-14 20:04:35| 0.3238230769230769|      13|          7849.938|
|2019-11-14 20:04:05|             0.1619|      27|          7848.845|
|2019-11-14 20:04:45|0.44444999999999996|       4| 7850.110000000001|
|2019-11-14 20:04:50|0.44914999999999994|       2|           7850.35|
|2019-11-14 20:03:15| 0.2464421052631579|      19|           7847.27|
|2019-11-14 20:03:10|0.27543529411764706|      17|           7847.27|
|2019-11-14 20:03:55|0.15807878787878787|      33| 7848.926666666666|
|2019-11-14 20:03:30