### Testing application

In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 pyspark-shell'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, window, max, min, avg
from pyspark.sql.functions import split
from pyspark.sql.types import StringType, FloatType, StructType, StructField, IntegerType, TimestampType
from pyspark.sql.functions import from_json, col, to_timestamp

In [3]:
# Create a spark session
spark = SparkSession.builder.appName("Twitter").getOrCreate()

In [4]:
# Define the timestamp format
timestampFormat = "E MMM dd HH:mm:ss +0000 yyyy"

# Define the schema of the incoming tweets
schema = (StructType()
  .add('created_at', TimestampType())
  .add('id_str', StringType())
  .add('text', StringType()))

In [5]:
# Read kafka stream and subscribe to twitter topic
df = (spark.readStream
          .format('kafka')
          .option('kafka.bootstrap.servers', 'kafka-1:9092')
          .option('startingOffsets', 'latest')
          .option('subscribe', 'twitter')
          .load()
          .select(col("key").cast("string"), \
                  from_json(col("value").cast("string"), schema, \
                  { "timestampFormat": timestampFormat }).alias("value")))

df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- created_at: timestamp (nullable = true)
 |    |-- id_str: string (nullable = true)
 |    |-- text: string (nullable = true)



In [6]:
tweets = df.select("value.*")
tweets.printSchema()

root
 |-- created_at: timestamp (nullable = true)
 |-- id_str: string (nullable = true)
 |-- text: string (nullable = true)



In [7]:
raw_df = (tweets
         .writeStream
         .queryName("tweets")
         .format("memory")
         .start())

In [45]:
raw = spark.sql("select * from tweets order by created_at desc")
raw.show()

+-------------------+-------------------+--------------------+
|         created_at|             id_str|                text|
+-------------------+-------------------+--------------------+
|2019-10-29 16:15:15|1189214119834071047|RT @MickstapeShow...|
|2019-10-29 16:15:14|1189214114536665089|Ready to crush so...|
|2019-10-29 16:15:13|1189214110531149827|RT @HOCROSAINTS: ...|
|2019-10-29 16:15:13|1189214111667765248|Крис Пол хотел, ч...|
|2019-10-29 16:15:12|1189214109121814528|Get to know the p...|
|2019-10-29 16:15:11|1189214104155820038|mayra pud na wa k...|
|2019-10-29 16:15:10|1189214098359226368|RT @ChrisReading6...|
|2019-10-29 16:15:10|1189214097981804544|RT @carmelsportsc...|
|2019-10-29 16:15:09|1189214095150637056|RT @kryptonprobet...|
|2019-10-29 16:15:08|1189214089077313536|RT @mad_liberals:...|
|2019-10-29 16:15:08|1189214092248207362|Cleveland Cavalie...|
|2019-10-29 16:15:08|1189214092743061506|Oklahoma City Thu...|
|2019-10-29 16:15:07|1189214086720114689|The@BUKnights 

In [25]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyspark.sql.functions import udf

import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jovyan/nltk_data...


True

In [29]:
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(tweet):
    scores = analyzer.polarity_scores(tweet)
    
    return scores['compound']

In [31]:
sentiment = udf(get_sentiment)

In [34]:
tweet_sentiment = raw.withColumn("sentiment", sentiment(raw['text']))
tweet_sentiment.show()

+-------------------+-------------------+--------------------+---------+
|         created_at|             id_str|                text|sentiment|
+-------------------+-------------------+--------------------+---------+
|2019-10-29 16:15:15|1189214119834071047|RT @MickstapeShow...|  -0.5423|
|2019-10-29 16:15:14|1189214114536665089|Ready to crush so...|   0.2263|
|2019-10-29 16:15:13|1189214110531149827|RT @HOCROSAINTS: ...|      0.0|
|2019-10-29 16:15:13|1189214111667765248|Крис Пол хотел, ч...|      0.0|
|2019-10-29 16:15:12|1189214109121814528|Get to know the p...|      0.0|
|2019-10-29 16:15:11|1189214104155820038|mayra pud na wa k...|      0.0|
|2019-10-29 16:15:10|1189214097981804544|RT @carmelsportsc...|    0.555|
|2019-10-29 16:15:10|1189214098359226368|RT @ChrisReading6...|   0.8316|
|2019-10-29 16:15:09|1189214095150637056|RT @kryptonprobet...|   0.5622|
|2019-10-29 16:15:08|1189214089077313536|RT @mad_liberals:...|   0.4404|
|2019-10-29 16:15:08|1189214092248207362|Cleveland 

In [47]:
tweet_aggregate = (tweet_sentiment
                   .groupBy(window(tweet_sentiment.created_at, '5 minutes'))
                   .agg({'sentiment': 'mean'})
                   .alias('tweet_aggregate'))

In [48]:
tweet_aggregate_raw = spark.sql("select * from tweet_aggregate order by created_at desc")
tweet_aggregate_raw.show()

AnalysisException: 'Table or view not found: tweet_aggregate; line 1 pos 14'

In [36]:
query = (tweet_aggregate
        .writeStream
        .format("memory")
        .queryName("window_count")
        .outputMode("complete")
        .start())

AnalysisException: "'writeStream' can be called only on streaming Dataset/DataFrame;"

In [88]:
raw = spark.sql("select * from window_count order by window desc")
raw.show()

+--------------------+-----+
|              window|count|
+--------------------+-----+
|[2019-10-23 16:05...|   21|
|[2019-10-23 16:05...|   15|
|[2019-10-23 16:05...|   11|
|[2019-10-23 16:05...|   23|
|[2019-10-23 16:04...|   15|
|[2019-10-23 16:04...|   14|
|[2019-10-23 16:04...|   16|
|[2019-10-23 16:04...|   20|
|[2019-10-23 16:04...|   15|
|[2019-10-23 16:04...|   17|
|[2019-10-23 16:03...|   13|
|[2019-10-23 16:03...|   10|
|[2019-10-23 16:03...|   15|
|[2019-10-23 16:03...|   12|
|[2019-10-23 16:03...|    5|
|[2019-10-23 16:03...|   13|
|[2019-10-23 16:02...|   13|
|[2019-10-23 16:02...|   23|
|[2019-10-23 16:02...|   22|
|[2019-10-23 16:02...|   13|
+--------------------+-----+
only showing top 20 rows

