## Twitter Streaming Analytics

I have a Twitter listener running in another notebook that collects tweets about blockchain. It saves the tweets to a file every minute in the folder `/tmp/blockchain`.

In [0]:
# List the tweet files the listener has written so far.
display(dbutils.fs.ls("/tmp/blockchain"))

In [0]:
# Batch-read the JSON tweet files; the schema of this DataFrame is reused
# by the streaming readers in the cells below.
tweets = spark.read.format("json").load("/tmp/blockchain")

Create a window streaming function to show the most recent tweets.

In [0]:
# Create a streaming DataFrame over the tweet folder and persist it as Parquet.
from pyspark.sql.functions import to_timestamp

# The 'E MMM dd HH:mm:ss +0000 yyyy' pattern passed to to_timestamp in later
# cells is parsed under the legacy time-parser policy, so enable it once here.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

# Schema comes from the batch-read `tweets` DataFrame; maxFilesPerTrigger=1
# limits each micro-batch to a single input file.
tweets_streaming = spark\
  .readStream\
  .schema(tweets.schema)\
  .option("maxFilesPerTrigger", 1)\
  .json('/tmp/blockchain')

from pyspark.sql.functions import window, col

# Use a checkpoint directory dedicated to this query. The previous value,
# "/tmp/", is the shared parent of both the input folder (/tmp/blockchain) and
# the output folder, so the query's offsets/commits/metadata were written
# straight into /tmp and would collide with any other query checkpointing there.
tweets_streaming.writeStream\
  .format("parquet")\
  .option("checkpointLocation", "/tmp/blockchain_output_checkpoint")\
  .outputMode("append")\
  .start('/tmp/blockchain_output')

In [0]:
# Reduce the stream to two columns: `date`, parsed from the raw `created_at`
# string, and the tweet `text`.
tweets_streaming1 = tweets_streaming.select(
    to_timestamp('created_at', 'E MMM dd HH:mm:ss +0000 yyyy').alias('date'),
    'text',
)

In [0]:
# Show the (date, text) stream in the notebook; streamName labels the
# underlying streaming query as "tweets".
display(tweets_streaming1, streamName="tweets")

In [0]:
%sql

-- Count rows per minute-of-hour of the `date` column, ordered by that minute.
-- NOTE(review): no table or view named `example` is created in the visible
-- part of this notebook (the display stream above is named "tweets");
-- confirm which table this query is meant to read.
select minute(date) as minute, count(*)
from example
group by minute
order by minute

### Create a streaming function to show the number of tweets per 5-minute window

In [0]:
# Streaming query: number of tweets in each 5-minute window of `date`.
from pyspark.sql.functions import col, lower, to_timestamp, window

# Re-create the file stream, one input file per micro-batch.
tweets_streaming = (
    spark.readStream
    .schema(tweets.schema)
    .option("maxFilesPerTrigger", 1)
    .json('/tmp/blockchain/')
)

# Parse the raw timestamp string, then count rows per 5-minute window.
five_minute_counts = (
    tweets_streaming
    .withColumn('date', to_timestamp('created_at', 'E MMM dd HH:mm:ss +0000 yyyy'))
    .groupBy(window(col('date'), "5 minutes"))
    .count()
)

# "complete" mode rewrites the whole result table in the in-memory sink,
# which is queryable through SQL under the query name.
(
    five_minute_counts.writeStream
    .queryName("tweetsBy5Minutes")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [0]:
# Print the ten newest rows of the tweetsBy5Minutes table ten times,
# pausing 2 seconds between polls.
from time import sleep

for poll_round in range(10):
    newest_windows = spark.sql("select * from tweetsBy5Minutes order by window desc")
    newest_windows.show(n=10, truncate=False)
    sleep(2)

## Create a streaming function to show the number of tweets per 10-minute window for a particular location

In [0]:
# Create a streaming DataFrame and count tweets per 10-minute window for
# users whose profile location mentions "japan".
from pyspark.sql.functions import to_timestamp

# Schema comes from the batch-read `tweets` DataFrame; one file per micro-batch.
tweets_streaming = spark\
  .readStream\
  .schema(tweets.schema)\
  .option("maxFilesPerTrigger", 1)\
  .json('/tmp/blockchain/')

# `instr` is used in the filter below but was never imported, which raised
# NameError when this cell ran; import it alongside the other functions.
from pyspark.sql.functions import window, col, lower, instr

# instr() returns the 1-based position of the substring (0 when absent), so
# ">= 1" keeps rows whose lower-cased user.location contains 'japan'.
tweets_streaming.withColumn('date', to_timestamp('created_at','E MMM dd HH:mm:ss +0000 yyyy'))\
  .filter(instr(lower(col('user.location')), 'japan')>=1)\
  .groupBy(window(col('date'), "10 minutes")).count()\
  .writeStream\
  .queryName("tweetsByLocation")\
  .format("memory")\
  .outputMode("complete")\
  .start()

In [0]:
# Print the ten newest rows of the tweetsByLocation table ten times,
# pausing 2 seconds between polls.
from time import sleep

for poll_round in range(10):
    newest_windows = spark.sql("select * from tweetsByLocation order by window desc")
    newest_windows.show(n=10, truncate=False)
    sleep(2)

Create a sliding window function to show the number of tweets per 10-minute window, sliding every 5 minutes.

In [0]:
# Streaming query: tweet counts over 10-minute windows that slide every 5 minutes.
from pyspark.sql.functions import col, to_timestamp, window

# Re-create the file stream, one input file per micro-batch.
tweets_streaming = (
    spark.readStream
    .schema(tweets.schema)
    .option("maxFilesPerTrigger", 1)
    .json('/tmp/blockchain')
)

# Parse the raw timestamp string, then count rows per sliding window
# (window length "10 minutes", slide interval "5 minutes").
sliding_counts = (
    tweets_streaming
    .withColumn('date', to_timestamp('created_at', 'E MMM dd HH:mm:ss +0000 yyyy'))
    .groupBy(window(col('date'), "10 minutes", "5 minutes"))
    .count()
)

# Publish the full result table to the in-memory sink under the query name.
(
    sliding_counts.writeStream
    .queryName("tweetsCountsby10Minutes")
    .format("memory")
    .outputMode("complete")
    .start()
)

In [0]:
# Inspect the results: print the ten newest rows of tweetsCountsby10Minutes
# ten times, pausing 5 seconds between polls.
from time import sleep

for poll_round in range(10):
    newest_windows = spark.sql("select * from tweetsCountsby10Minutes order by window desc")
    newest_windows.show(n=10, truncate=False)
    sleep(5)