### Testing application

In [1]:
import os

# Submit arguments for PySpark: request the spark-sql-kafka-0-10 package
# (Scala 2.11 build, version 2.4.4). This cell runs before the SparkSession
# is created further down.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 pyspark-shell',
})

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, avg, count
from pyspark.sql.types import StringType, FloatType, StructType, StructField, IntegerType, TimestampType, DoubleType
from pyspark.sql.functions import from_json, col, to_timestamp

### Create spark session

In [3]:
# Obtain the SparkSession for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("Twitter")
    .getOrCreate()
)

### Subscribe to twitter topic

In [4]:
# Pattern used to parse the tweet timestamps.
# Note that the "+0000" offset is written out literally in the pattern.
timestampFormat = "E MMM dd HH:mm:ss +0000 yyyy"

# Schema of the JSON payload expected in each message: timestamp, text, sentiment.
# NOTE: the schema printed after from_json reports every field as
# nullable = true, so the nullable=False flags are not preserved downstream.
twitter_schema = (
    StructType()
    .add('timestamp', TimestampType(), False)
    .add('text', StringType(), False)
    .add('sentiment', DoubleType(), False)
)

In [5]:
# Streaming reader on the "twitter" Kafka topic.
# Both key and value are cast to string; the value is then parsed as JSON
# against twitter_schema using the timestamp pattern defined above.
twitter_df = (
    spark.readStream
    .format('kafka')
    .option('subscribe', 'twitter')
    .option('kafka.bootstrap.servers', 'kafka-1:9092')
    .option('startingOffsets', 'latest')
    .load()
    .select(
        col("key").cast("string"),
        from_json(
            col("value").cast("string"),
            twitter_schema,
            options={"timestampFormat": timestampFormat},
        ).alias("value"),
    )
)

twitter_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- sentiment: double (nullable = true)



In [6]:
# Flatten the parsed "value" struct so its fields become top-level columns.
twitter = twitter_df.select(col('value.*'))
twitter.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- sentiment: double (nullable = true)



In [7]:
# Start a streaming query that writes the parsed tweets to the memory sink.
# The query name "twitter" is the table name queried with spark.sql below.
twitter_df_stream = (
    twitter.writeStream
    .format("memory")
    .queryName("twitter")
    .start()
)

In [8]:
# Look at what the "twitter" memory-sink query has collected so far.
# NOTE: the name `raw` is rebound to the crypto table in a later cell.
raw = spark.sql("select * from twitter")
raw.show()

+---------+----+---------+
|timestamp|text|sentiment|
+---------+----+---------+
+---------+----+---------+



### Subscribe to crypto topic

In [9]:
# Schema of the JSON payload expected in each crypto message: timestamp and price.
# NOTE: the schema printed after from_json reports both fields as
# nullable = true, so the nullable=False flags are not preserved downstream.
crypto_schema = (
    StructType()
    .add('timestamp', TimestampType(), False)
    .add('price', DoubleType(), False)
)

In [10]:
# Pattern used to parse the crypto feed timestamps.
# It is kept under its own name on purpose: the Twitter reader cell reads the
# global `timestampFormat`, so rebinding that name here would make a re-run of
# the Twitter reader apply the crypto pattern to tweet timestamps.
crypto_timestamp_format = "dd-MM-yyyy HH:mm:ss"

# Streaming reader on the "crypto" Kafka topic.
# Both key and value are cast to string; the value is then parsed as JSON
# against crypto_schema using the pattern above.
crypto_df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', 'kafka-1:9092')
    .option('startingOffsets', 'latest')
    .option('subscribe', 'crypto')
    .load()
    .select(
        col("key").cast("string"),
        from_json(
            col("value").cast("string"),
            crypto_schema,
            {"timestampFormat": crypto_timestamp_format},
        ).alias("value"),
    )
)

crypto_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- timestamp: timestamp (nullable = true)
 |    |-- price: double (nullable = true)



In [11]:
# Flatten the parsed "value" struct so its fields become top-level columns.
crypto = crypto_df.select(col('value.*'))
crypto.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- price: double (nullable = true)



In [12]:
# Start a streaming query that writes the parsed crypto rows to the memory sink.
# The query name "crypto" is the table name queried with spark.sql below.
crypto_df_stream = (
    crypto.writeStream
    .format("memory")
    .queryName("crypto")
    .start()
)

In [24]:
# Look at what the "crypto" memory-sink query has collected so far.
# NOTE: this rebinds `raw`, which an earlier cell pointed at the twitter table.
raw = spark.sql("select * from crypto")
raw.show()

+-------------------+-------+
|          timestamp|  price|
+-------------------+-------+
|2019-11-04 10:39:16|8258.99|
|2019-11-04 10:39:21|8258.99|
+-------------------+-------+



### Twitter aggregation

In [25]:
# Group tweets into 30-second windows on their timestamp and compute, per
# window, the mean sentiment and the number of tweets.
tweet_aggregation = (
    twitter
    .groupBy(window(twitter['timestamp'], '30 seconds'))
    .agg(
        avg('sentiment').alias('sentiment'),
        count('timestamp').alias('n_tweets'),
    )
)

In [26]:
# Start the windowed tweet aggregation as a streaming query in "complete"
# output mode, written to the memory sink under the name "tweets_aggs".
tweet_agg_stream = (
    tweet_aggregation.writeStream
    .format("memory")
    .queryName("tweets_aggs")
    .outputMode("complete")
    .start()
)

In [27]:
# Inspect the columns of the "tweets_aggs" memory table; the window is a struct with start/end.
spark.sql("select * from tweets_aggs").printSchema()

root
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- sentiment: double (nullable = true)
 |-- n_tweets: long (nullable = false)



In [36]:
# Keep only the window start plus the two aggregates, so the frame has a
# plain `start` column (used as the join key later in the notebook).
tweet_agg_df = spark.sql("select window.start, sentiment, n_tweets from tweets_aggs")
# truncate=False prints the full cell values instead of shortening them.
tweet_agg_df.show(truncate=False)

+-------------------+------------------+--------+
|start              |sentiment         |n_tweets|
+-------------------+------------------+--------+
|2019-11-04 10:39:00|0.39036           |5       |
|2019-11-04 10:39:30|0.2583142857142857|7       |
+-------------------+------------------+--------+



In [47]:
# Stop the tweet aggregation query, then display its status to confirm it stopped.
tweet_agg_stream.stop()
tweet_agg_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

### Crypto aggregation

In [29]:
# Group crypto rows into 30-second windows on their timestamp, average the
# price per window, and keep only the window start and the averaged price.
crypto_aggregation = (
    crypto
    .groupBy(window(crypto['timestamp'], '30 seconds'))
    .agg(avg('price').alias('price'))
    .select('window.start', 'price')
)

# Bare expression: displays the resulting DataFrame's column names and types.
crypto_aggregation

DataFrame[start: timestamp, price: double]

In [30]:
# Start the windowed price aggregation as a streaming query in "complete"
# output mode, written to the memory sink under the name "crypto_agg".
crypto_agg_stream = (
    crypto_aggregation.writeStream
    .format("memory")
    .queryName("crypto_agg")
    .outputMode("complete")
    .start()
)

In [35]:
# Read the aggregated prices back from the "crypto_agg" memory table.
crypto_agg_df = spark.sql("select start, price from crypto_agg")
# truncate=False prints the full cell values instead of shortening them.
crypto_agg_df.show(truncate=False)

+-------------------+-------+
|start              |price  |
+-------------------+-------+
|2019-11-04 10:39:30|8253.86|
+-------------------+-------+



In [46]:
# Stop the crypto aggregation query, then display its status to confirm it stopped.
crypto_agg_stream.stop()
crypto_agg_stream.status

{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}

### Joining the two streams (join of the DataFrames read from the in-memory aggregate tables)

In [38]:
# Re-display the tweet aggregates. The values differ from the earlier display of
# this same DataFrame, presumably because the query kept updating the table in between.
tweet_agg_df.show()

+-------------------+---------+--------+
|              start|sentiment|n_tweets|
+-------------------+---------+--------+
|2019-11-04 10:39:00|  0.39036|       5|
|2019-11-04 10:39:30| 0.226025|       8|
+-------------------+---------+--------+



In [39]:
# Re-display the price aggregates. The values differ from the earlier display of
# this same DataFrame, presumably because the query kept updating the table in between.
crypto_agg_df.show()

+-------------------+---------+
|              start|    price|
+-------------------+---------+
|2019-11-04 10:39:30|8256.8275|
|2019-11-04 10:40:00|  8259.78|
+-------------------+---------+



In [42]:
# Match tweet and price aggregates on the window start; only windows present
# in both frames appear in the result.
tweet_agg_df.join(crypto_agg_df, on='start', how='inner').show()

+-------------------+-------------------+--------+---------+
|              start|          sentiment|n_tweets|    price|
+-------------------+-------------------+--------+---------+
|2019-11-04 10:41:30|            0.15418|      15| 8259.895|
|2019-11-04 10:40:30|0.16556315789473683|      19|  8260.22|
|2019-11-04 10:42:00|0.05328666666666667|      15| 8261.076|
|2019-11-04 10:39:30| 0.1889090909090909|      11|8256.8275|
|2019-11-04 10:40:00|0.08643684210526316|      19| 8259.896|
|2019-11-04 10:41:00|0.20303703703703704|      27| 8259.045|
+-------------------+-------------------+--------+---------+

