# Exercise 3.3

Install and import the tweepy library

In [None]:
import sys
!conda install --yes --prefix {sys.prefix} -c conda-forge tweepy

In [None]:
import os
import socket

import tweepy
from tweepy import OAuthHandler

Set the keys and secrets from the Twitter developer API 

In [None]:
# Read the Twitter credentials from environment variables so that real secrets
# never have to be saved inside the notebook. The '...' placeholders remain the
# fallback values when a variable is not set.
ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', '...')
ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET', '...')
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '...')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '...')

Create a stream listener

In [None]:
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that prints every raw tweet and forwards it to a socket.

    The tweets are written to the module-level ``client_socket``, which must
    be connected before the stream is started.
    """

    def on_error(self, status_code):
        """Decide whether the stream keeps running after an HTTP error status.

        Returns False for status 420 (which asks tweepy to disconnect) and
        True for every other status code.
        """
        if status_code == 420:
            return False
        return True

    def on_data(self, data):
        """Print the raw tweet and send it to the connected socket client.

        Returns True to keep the stream running, or False when the socket
        can no longer be written to, so the stream shuts down instead of
        failing with an unhandled exception.
        """
        print(data)

        # the socket reader on the pySpark side is line based, so make sure
        # every record is terminated by a newline
        if not data.endswith('\n'):
            data += '\n'

        try:
            # send the entire tweet to the socket on localhost where pySpark is listening
            client_socket.sendall(bytes(data, encoding='utf-8'))
        except OSError as error:
            # covers BrokenPipeError / ConnectionResetError when the client went away
            print("Lost connection to the socket client: " + str(error))
            return False
        return True

Connect to Twitter

In [None]:
def connect_to_twitter(track=('#AI',)):
    """Authenticate with Twitter and start a filtered tweet stream.

    Every received tweet is handed to a ``MyStreamListener`` instance.

    Args:
        track: iterable of keywords or hashtags used to filter the stream;
            defaults to the single hashtag '#AI'.
    """
    auth = OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

    # the OAuth handler can be passed to the stream directly, so no
    # intermediate tweepy.API object is needed
    my_stream = tweepy.Stream(auth=auth, listener=MyStreamListener())

    # select a (limited) tweet stream
    my_stream.filter(track=list(track))

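Open a socket and wait for the connection from pySpark. This cell blocks until a client connects and then keeps streaming tweets, so run the pySpark cells below from a separate notebook or kernel.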

In [None]:
# Serve the tweet stream on localhost:1234 to a single client (pySpark).
s = socket.socket()
client_socket = None
try:
    # allow this cell to be re-run right away without an "Address already in use" error
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s.bind(("localhost", 1234))
    print("Waiting for connection...")

    s.listen(1)  # wait for client connection, this should come from pySpark
    client_socket, address = s.accept()  # connect to the pySpark client
    print("Received request from: " + str(address))

    connect_to_twitter()  # now that we have a connection to pySpark, connect to Twitter
finally:
    # always release both sockets, also when the stream stops or the cell is interrupted
    if client_socket is not None:
        client_socket.close()
    s.close()

Install and import the pySpark libraries

In [None]:
import sys
# Install pyspark from the conda-forge channel into the prefix of the
# interpreter that runs this kernel (sys.prefix), without asking for confirmation.
!conda install --yes --prefix {sys.prefix} -c conda-forge pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, window, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType

Set up the Twitter date-time format

In [None]:
# Datetime pattern used further down to parse the 'created_at' string of a tweet:
# day name, month name, day of month, time of day, zone offset and year.
tweet_datetime_format = 'EEE MMM dd HH:mm:ss ZZZZ yyyy'

Create the schema to parse a tweet in JSON format; we only need two columns

In [None]:
# Only two nullable string fields of the tweet JSON are parsed; everything
# else in the document is ignored.
schema = (
    StructType()
    .add('created_at', StringType(), True)
    .add('text', StringType(), True)
)

Launch a Spark session

In [None]:
# Spark 3.x no longer accepts the day-of-week symbol 'E' when parsing timestamps,
# which the Twitter date-time format ('EEE ...') needs. Selecting the legacy
# parser policy keeps to_timestamp working there; older Spark versions that do
# not know the setting are expected to simply ignore it.
spark = (
    SparkSession.builder
    .appName('Packt')
    .config('spark.sql.legacy.timeParserPolicy', 'LEGACY')
    .getOrCreate()
)

Get the raw data from a local socket

In [None]:
# Streaming DataFrame fed by the socket source on localhost:1234.
raw_stream = (
    spark.readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', 1234)
    .load()
)

Parse the JSON to get separate fields

In [None]:
# Decode the JSON held in the 'value' column into a struct column named 'tweet'.
tweet_stream = raw_stream.select(
    from_json(raw_stream['value'], schema).alias('tweet'),
)

Create a timestamp by parsing the created_at field

In [None]:
# Keep the tweet text and add a 'timestamp' column parsed from 'created_at'.
timed_stream = tweet_stream.select(
    to_timestamp(
        'tweet.created_at',
        format=tweet_datetime_format,
    ).alias('timestamp'),
    'tweet.text',
)

Create a sliding window of 1 minute with a slide of 10 seconds, with a 'slack time' of 2 seconds

In [None]:
# Group the tweets into 1-minute windows that advance every 10 seconds,
# using a 2-second watermark on the 'timestamp' column.
windowed = (
    timed_stream
    .withWatermark(eventTime='timestamp', delayThreshold='2 seconds')
    .groupBy(
        window('timestamp', windowDuration='1 minute', slideDuration='10 seconds')
    )
)

Count the tweets per window

In [None]:
# Number of tweets in each window, ordered by the window column.
counts_per_window = (
    windowed
    .count()
    .orderBy('window')
)

Output the windows and counts to the console

In [None]:
# Start the streaming query that writes the complete result table to the
# console (untruncated), then wait until the query terminates.
query = (
    counts_per_window.writeStream
    .outputMode('complete')
    .format('console')
    .option("truncate", False)
    .start()
)
query.awaitTermination()