### Testing application

In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 pyspark-shell'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, avg, count
from pyspark.sql.types import StringType, FloatType, StructType, StructField, IntegerType, TimestampType, DoubleType
from pyspark.sql.functions import from_json, col, to_timestamp

### Create spark session

In [3]:
# Create a spark session
spark = SparkSession.builder.appName("Twitter").getOrCreate()

### Subscribe to twitter topic

In [4]:
# Define the timestamp format
timestampFormat = "E MMM dd HH:mm:ss +0000 yyyy"

# Create the schema of incoming data
twitter_schema = StructType([
    StructField('timestamp', TimestampType(), False),
    StructField('text', StringType(), False),
    StructField('sentiment', DoubleType(), False)
])

In [5]:
# Read kafka stream and subscribe to twitter topic
twitter_df = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka-1:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'twitter')
          .load()
          .select(col("key").cast("string"), \
                  from_json(col("value").cast("string"), twitter_schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

twitter_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- sentiment: double (nullable = true)



In [6]:
twitter = twitter_df.select('value.*')
twitter.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- sentiment: double (nullable = true)



In [7]:
twitter_df_stream = (twitter
         .writeStream
         .queryName("twitter")
         .format("memory")
         .start())

In [15]:
raw = spark.sql("select * from twitter")
raw.show()

+-------------------+--------------------+---------+
|          timestamp|                text|sentiment|
+-------------------+--------------------+---------+
|2019-11-01 11:18:30|Free 300 GH Cloud...|   0.5106|
|2019-11-01 11:18:31|@Crystamped Crypt...|   0.4404|
|2019-11-01 11:18:31|https://t.co/ip7K...|      0.0|
|2019-11-01 11:18:32|RT @santimentfeed...|      0.0|
|2019-11-01 11:18:39|Bakkt: Bitcoin Fu...|      0.0|
|2019-11-01 11:18:39|Don’t do a @MadBi...|      0.0|
|2019-11-01 11:18:43|RT @crypfo1: http...|      0.0|
|2019-11-01 11:18:44|RT @stacyherbert:...|      0.0|
|2019-11-01 11:18:47|RT @DACX_io: 💫 T...|      0.0|
|2019-11-01 11:18:48|RT @xcardbymobilu...|   0.6114|
|2019-11-01 11:18:49|Bitcoin, last mon...|   0.5267|
|2019-11-01 11:18:52|    bitcoin, stocks,|      0.0|
|2019-11-01 11:18:54|RT @Rhythmtrader:...|   0.5267|
|2019-11-01 11:18:54|Bitcoin’s Defense...|   0.4939|
|2019-11-01 11:18:55|RT @skwp: If you'...|     0.34|
|2019-11-01 11:18:58|RT @KarlTurner5: ...|     

### Subscribe to crypto topic

In [16]:
# Create the schema of incoming data
crypto_schema = StructType([
    StructField('timestamp', TimestampType(), False),
    StructField('price', DoubleType(), False)
])

In [17]:
# Define the timestamp format
timestampFormat = "dd-mm-yyyy HH:mm:ss"

# Read kafka stream and subscribe to crypto topic
crypto_df = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka-1:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'crypto')
          .load()
          .select(col("key").cast("string"), \
                  from_json(col("value").cast("string"), crypto_schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

crypto_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)
 |    |-- price: double (nullable = true)



In [18]:
crypto = crypto_df.select('value.*')
crypto.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- price: double (nullable = true)



In [19]:
crypto_df_stream = (crypto
         .writeStream
         .queryName("crypto")
         .format("memory")
         .start())

In [20]:
raw = spark.sql("select * from crypto")
raw.show()

+---------+-----+
|timestamp|price|
+---------+-----+
+---------+-----+



### Twitter aggregation

In [21]:
tweet_aggregation = (twitter
                     .groupBy(window(twitter.timestamp, '30 seconds'))
                     .agg(avg('sentiment').alias('sentiment'), count('timestamp').alias('n_tweets')))

In [22]:
tweet_agg_stream = (tweet_aggregation
    .writeStream
    .outputMode("complete")
    .queryName("tweets_aggs")
    .format("memory")
    .start())

In [23]:
spark.sql("select * from tweets_aggs").printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- sentiment: double (nullable = true)
 |-- n_tweets: long (nullable = false)



In [25]:
tweet_agg_df = spark.sql("select window.start, sentiment, n_tweets from tweets_aggs")
tweet_agg_df.show(truncate=False)

+-----+---------+--------+
|start|sentiment|n_tweets|
+-----+---------+--------+
+-----+---------+--------+



In [47]:
tweet_agg_stream.stop()
tweet_agg_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

### Crypto aggregation

In [26]:
crypto_aggregation = (crypto
                     .groupBy(window(crypto.timestamp, '30 seconds'))
                     .agg(avg('price').alias('price'))
                     .select(['window.start', 'price']))

crypto_aggregation

DataFrame[start: timestamp, price: double]

In [27]:
crypto_agg_stream = (crypto_aggregation
    .writeStream
    .outputMode("complete")
    .queryName("crypto_agg")
    .format("memory")
    .start())

In [29]:
crypto_agg_df = spark.sql("select start, price from crypto_agg")
crypto_agg_df.show(truncate=False)

+-------------------+-------+
|start              |price  |
+-------------------+-------+
|2019-01-01 11:20:00|8324.27|
+-------------------+-------+



In [46]:
crypto_agg_stream.stop()
crypto_agg_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

### Joining the two streams

In [33]:
tweet_agg_df.show()

+-------------------+-------------------+--------+
|              start|          sentiment|n_tweets|
+-------------------+-------------------+--------+
|2019-11-01 11:21:00| 0.1809111111111111|      18|
|2019-11-01 11:20:00|        0.160221875|      32|
|2019-11-01 11:19:30|0.09883636363636362|      11|
|2019-11-01 11:22:30|0.22011666666666665|      30|
|2019-11-01 11:23:00|0.10823333333333333|       3|
|2019-11-01 11:21:30|0.15878421052631578|      19|
|2019-11-01 11:22:00|0.21068846153846155|      26|
|2019-11-01 11:20:30| 0.3449722222222222|      18|
+-------------------+-------------------+--------+



In [34]:
crypto_agg_df.show()

+-------------------+-----------------+
|              start|            price|
+-------------------+-----------------+
|2019-01-01 11:21:30|8319.603333333334|
|2019-01-01 11:21:00|8323.869999999999|
|2019-01-01 11:20:00|         8325.196|
|2019-01-01 11:20:30|8326.328333333333|
|2019-01-01 11:22:30|         8305.995|
|2019-01-01 11:23:00|8299.199999999999|
|2019-01-01 11:22:00|8311.749999999998|
+-------------------+-----------------+



In [32]:
tweet_agg_df.join(crypto_agg_df, 'start').show()

+-----+---------+--------+-----+
|start|sentiment|n_tweets|price|
+-----+---------+--------+-----+
+-----+---------+--------+-----+

