### Testing application

In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 pyspark-shell'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, avg, count
from pyspark.sql.types import StringType, FloatType, StructType, StructField, IntegerType, TimestampType, DoubleType
from pyspark.sql.functions import from_json, col, to_timestamp

### Create spark session

In [3]:
# Create a spark session
spark = SparkSession.builder.appName("Twitter").getOrCreate()

### Subscribe to twitter topic

In [4]:
# Define the timestamp format
timestampFormat = "E MMM dd HH:mm:ss +0000 yyyy"

# Create the schema of incoming data
twitter_schema = StructType([
    StructField('timestamp', TimestampType(), False),
    StructField('text', StringType(), False),
    StructField('sentiment', DoubleType(), False)
])

In [5]:
# Read kafka stream and subscribe to twitter topic
twitter_df = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka-1:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'twitter')
          .load()
          .select(col("key").cast("string"), \
                  from_json(col("value").cast("string"), twitter_schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

twitter_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- sentiment: double (nullable = true)



In [6]:
twitter = twitter_df.select('value.*')
twitter.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- sentiment: double (nullable = true)



In [7]:
twitter_df_stream = (twitter
         .writeStream
         .queryName("twitter")
         .format("memory")
         .start())

In [17]:
raw = spark.sql("select * from twitter")
raw.show()

+-------------------+--------------------+---------+
|          timestamp|                text|sentiment|
+-------------------+--------------------+---------+
|2019-10-31 08:52:24|Yesterday results...|      0.0|
|2019-10-31 08:52:25|RT @Cointelegraph...|      0.0|
+-------------------+--------------------+---------+



### Subscribe to crypto topic

In [18]:
# Create the schema of incoming data
crypto_schema = StructType([
    StructField('timestamp', TimestampType(), False),
    StructField('price', DoubleType(), False)
])

In [19]:
# Define the timestamp format
timestampFormat = "dd-mm-yyyy HH:mm:ss"

# Read kafka stream and subscribe to crypto topic
crypto_df = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka-1:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'crypto')
          .load()
          .select(col("key").cast("string"), \
                  from_json(col("value").cast("string"), crypto_schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

crypto_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)
 |    |-- price: double (nullable = true)



In [20]:
crypto = crypto_df.select('value.*')
crypto.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- price: double (nullable = true)



In [21]:
crypto_df_stream = (crypto
         .writeStream
         .queryName("crypto")
         .format("memory")
         .start())

In [22]:
raw = spark.sql("select * from crypto")
raw.show()

+---------+-----+
|timestamp|price|
+---------+-----+
+---------+-----+



### Twitter aggregation

In [23]:
tweet_aggregation = (twitter
                     .groupBy(window(twitter.timestamp, '30 seconds'))
                     .agg(avg('sentiment').alias('sentiment'), count('timestamp').alias('n_tweets')))

In [24]:
tweet_agg_stream = (tweet_aggregation
    .writeStream
    .outputMode("complete")
    .queryName("tweets_aggs")
    .format("memory")
    .start())

In [32]:
spark.sql("select * from tweets_aggs").printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- sentiment: double (nullable = true)
 |-- n_tweets: long (nullable = false)



In [44]:
tweet_agg_df = spark.sql("select window.start, sentiment, n_tweets from tweets_aggs")
tweet_agg_df.show(truncate=False)

+-------------------+-------------------+--------+
|start              |sentiment          |n_tweets|
+-------------------+-------------------+--------+
|2019-10-31 08:53:00|0.331532           |25      |
|2019-10-31 08:52:30|0.10249473684210526|19      |
+-------------------+-------------------+--------+



In [47]:
tweet_agg_stream.stop()
tweet_agg_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

### Crypto aggregation

In [34]:
crypto_aggregation = (crypto
                     .groupBy(window(crypto.timestamp, '30 seconds'))
                     .agg(avg('price').alias('price'))
                     .select(['window.start', 'price']))

crypto_aggregation

DataFrame[start: timestamp, price: double]

In [35]:
crypto_agg_stream = (crypto_aggregation
    .writeStream
    .outputMode("complete")
    .queryName("crypto_agg")
    .format("memory")
    .start())

In [42]:
crypto_agg_df = spark.sql("select start, price from crypto_agg")
crypto_agg_df.show(truncate=False)

+-------------------+-------+
|start              |price  |
+-------------------+-------+
|2019-01-31 08:58:30|8184.05|
|2019-01-31 08:59:30|8180.18|
|2019-01-31 08:59:00|8181.29|
|2019-01-31 09:00:00|8179.71|
+-------------------+-------+



In [46]:
crypto_agg_stream.stop()
crypto_agg_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

### Joining the two streams

In [87]:
tweet_agg_df.show()

+-------------------+--------------------+--------+
|              start|           sentiment|n_tweets|
+-------------------+--------------------+--------+
|2019-10-30 20:07:00|             0.09854|      20|
|2019-10-30 20:07:30|-0.02256190476190...|      21|
|2019-10-30 20:06:30|  0.1661153846153846|      26|
|2019-10-30 20:08:00| 0.18208461538461543|      13|
+-------------------+--------------------+--------+



In [82]:
crypto_agg_df.show()

+-------------------+-------+
|              start|  price|
+-------------------+-------+
|2019-01-30 20:10:00|8255.23|
|2019-01-30 20:09:00|8252.86|
|2019-01-30 20:08:00|8258.44|
|2019-01-30 20:13:30|8267.39|
|2019-01-30 20:08:30|8257.95|
|2019-01-30 20:13:00| 8263.5|
|2019-01-30 20:06:30|8258.71|
|2019-01-30 20:09:30|8255.41|
|2019-01-30 20:07:00|8257.89|
|2019-01-30 20:07:30|8258.62|
|2019-01-30 20:11:00|8258.62|
|2019-01-30 20:11:30|8260.95|
|2019-01-30 20:10:30|8255.58|
|2019-01-30 20:12:30|8261.06|
|2019-01-30 20:12:00|8261.08|
|2019-01-30 20:14:00|8267.99|
+-------------------+-------+



In [45]:
tweet_agg_df.join(crypto_agg_df, 'start').show()

+-----+---------+--------+-----+
|start|sentiment|n_tweets|price|
+-----+---------+--------+-----+
+-----+---------+--------+-----+

