In [1]:
import findspark
findspark.init()

In [2]:
import json, requests, sys
from nltk.corpus import stopwords
from operator import add
from pyspark import SparkContext
from pyspark import StorageLevel
from pyspark.streaming import StreamingContext
from textblob import TextBlob

In [3]:
# text classification
def getSentiment(text):
    sent = TextBlob(text).sentiment.polarity
    neutral_threshold = 0.05
    
    if sent >= neutral_threshold:
        return (1, 0, 0) # positive
    elif sent > neutral_threshold:
        return (0, 1, 0) # neutral
    else:
        return (0, 0, 1) # negative

In [4]:
def getTweetsCounter(dstream_tweets_sentiment_analysed, window_length, sliding_interval):

    tweets_to_count = dstream_tweets_sentiment_analysed. \
        map(lambda x: ('count', (1, x[2])))

    tweets_count_acc_sent = tweets_to_count. \
        reduceByKeyAndWindow(lambda x, y: (x[0] + y[0], (x[1][0] + y[1][0], x[1][1] + y[1][1], x[1][2] + y[1][2])), None,
                             window_length, sliding_interval)

    total_count = tweets_count_acc_sent. \
        map(lambda x: x[1])
    
    total_count.pprint()
    return total_count
    
def sendTweetsCounter(sentiments, url):
    def takeAndSend(time, rdd):
        if not rdd.isEmpty():
            (name, (total, (pos, neutral, neg))) = rdd.first()

            json_data = {'positive': pos, 'neutral': neutral, 'negative': neg, 'total': total}
            #print(json_data)

            response = requests.post(url, data=json_data)

    sentiments.foreachRDD(takeAndSend)

In [5]:
def getTweets(kvs):
    tweets_text = kvs.map(lambda x: json.loads(x)) \
                .map(lambda json_object: (json_object["user"]["screen_name"], json_object["text"], json_object["user"]["followers_count"], json_object["id"])) \
                .transform(lambda rdd: rdd.sortBy(lambda x: x[2], ascending = False))
    
    tweets_text.pprint()
    return tweets_text
    
def sendTweets(tweets, url):
    def takeAndSend(time, rdd):
        if not rdd.isEmpty():
            tweets_data = rdd.take(10)

            users = []
            texts = []
            tweet_ids = []

            for (user, text, follower_count, tweet_id) in tweets_data:
                users.append(user)
                texts.append(text)
                tweet_ids.append(tweet_id)

            json_data = {'user': str(users), 'text': str(texts), 'id': str(tweet_ids)}
            print(json_data)

            response = requests.post(url, data=json_data)

    tweets.foreachRDD(takeAndSend)

In [6]:
def getTopWords(tweets, window_length, sliding_interval):
    lines = tweets.flatMap(lambda line: line[1].split(" "))

    ## This part does the word count
    sw = stopwords.words('english')
    sw.extend(['rt', 'https', 'http', 'coronavirus', 'covid19', 'covid-19'])
    
    counts = lines.map(lambda word: word.strip().lower()) \
                  .filter(lambda word: word not in sw) \
                  .filter(lambda word: len(word) >= 2 and word[0] != '#' and word[0] != '@') \
                  .map(lambda word: (word, 1)) \
                  .reduceByKeyAndWindow(add, None,  window_length, sliding_interval)\
                  .transform(lambda rdd: rdd.sortBy(lambda x: x[1], ascending = False))
    
    counts.pprint()
    return counts
    
def sendTopWords(counts, url):
    def takeAndSend(time, rdd):
        if not rdd.isEmpty():
            word_counts = rdd.take(10)

            words = []
            values = []

            for (word, count) in word_counts:
                words.append(word)
                values.append(count)

            json_data = {'words': str(words), 'counts': str(values)}
            print(json_data)

            response = requests.post(url, data=json_data)

    counts.foreachRDD(takeAndSend)

In [7]:
def getTopHashTags(dstream_tweets_sentiment_analysed, window_length, sliding_interval):
    hashtags = dstream_tweets_sentiment_analysed.\
            map(lambda x: ((x[0], x[2]), x[1])).\
            flatMapValues(lambda text: text.split(" ")).\
            filter(lambda x: len(x[1]) > 1 and x[1][0] == '#'). \
            map(lambda x: (x[1], (1, x[0][1])))
    
    hashtags_count_acc_sent = hashtags. \
        reduceByKeyAndWindow(lambda x, y: (x[0] + y[0], (x[1][0] + y[1][0], x[1][1] + y[1][1], x[1][2] + y[1][2])), None,
                             window_length, sliding_interval)
    
    sorted_hashtags_count = hashtags_count_acc_sent. \
        map(lambda x: (x[1][0], (x[0], x[1][1]))). \
        transform(lambda rdd: rdd.sortByKey(False)). \
        map(lambda x: (x[1][0], (x[0], x[1][1])))

    
    sorted_hashtags_count.pprint()
    return sorted_hashtags_count

In [8]:
def getTopMentioned(dstream_tweets_sentiment_analysed, window_length, sliding_interval):
    mentioned = dstream_tweets_sentiment_analysed.\
            map(lambda x: ((x[0], x[2]), x[1])).\
            flatMapValues(lambda text: text.split(" ")).\
            filter(lambda x: len(x[1]) > 1 and x[1][0] == '@'). \
            map(lambda x: (x[1], (1, x[0][1])))
    
    mentioned_count_acc_sent = mentioned. \
        reduceByKeyAndWindow(lambda x, y: (x[0] + y[0], (x[1][0] + y[1][0], x[1][1] + y[1][1], x[1][2] + y[1][2])), None,
                             window_length, sliding_interval)
    
    sorted_mentioned_count = mentioned_count_acc_sent. \
        map(lambda x: (x[1][0], (x[0], x[1][1]))). \
        transform(lambda rdd: rdd.sortByKey(False)). \
        map(lambda x: (x[1][0], (x[0], x[1][1])))


    sorted_mentioned_count.pprint()
    return sorted_mentioned_count

In [14]:
def getTopActive(dstream_tweets_sentiment_analysed, window_length, sliding_interval):
    active=dstream_tweets_sentiment_analysed. \
        map(lambda x: (x[0], (1, x[2])))
    
    user_count_acc_sent = active. \
        reduceByKeyAndWindow(lambda x, y: (x[0] + y[0], (x[1][0] + y[1][0], x[1][1] + y[1][1], x[1][2] + y[1][2])), None,
                             window_length, sliding_interval)
    
    sorted_users_count = user_count_acc_sent. \
        map(lambda x: (x[1][0], (x[0], x[1][1]))). \
        transform(lambda rdd: rdd.sortByKey(False)). \
        map(lambda x: (x[1][0], (x[0], x[1][1])))
    
    sorted_users_count.pprint()
    return sorted_users_count

In [None]:
def send_top_to_dashboard(dstream_tweets_sentiment_analysed, url):

    num = 10

    def take_and_send(time, rdd):
        if not rdd.isEmpty():
            taken = rdd.take(num)

            labels = []
            negative = []
            neutral = []
            positive = []
            for (name, (count, (pos, neu, neg))) in taken:
                labels.append(name)
                negative.append(neg)
                neutral.append(neu)
                positive.append(pos)

            request_data = {'label': str(labels), 'negative': str(negative), 'neutral': str(neutral), 'positive': str(positive)}
            response = requests.post(url, data=request_data)

    dstream_tweets_sentiment_analysed.foreachRDD(take_and_send)

In [10]:
sc = SparkContext(appName="tweetStream")
# Create a local StreamingContext with batch interval of 2 second
batch_interval = 2
window_length = 15*60
sliding_interval = 6

ssc = StreamingContext(sc, batch_interval)
ssc.checkpoint("twittercheckpt")

# Create a DStream that conencts to hostname:port
tweetStream = ssc.socketTextStream("0.0.0.0", 5555)

In [11]:
tweets = tweetStream. \
        map(lambda  x: json.loads(x)). \
        map(lambda json_object: (json_object["user"]["screen_name"], json_object["text"]))

tweets_sentiment_analysed = tweets. \
        map(lambda x: (x[0], x[1], getSentiment(x[1])))

In [12]:
tweets_sentiment_analysed.persist(StorageLevel.MEMORY_AND_DISK)

<pyspark.streaming.dstream.TransformedDStream at 0x7fbe010d6c40>

In [None]:
server = 'http://localhost:5000/'

tweet_counters = getTweetsCounter(tweets_sentiment_analysed, window_length, sliding_interval)
sendTweetsCounter(tweet_counters,  server +'update_sentiments')

tweet_text= getTweets(tweetStream)
sendTweets(tweet_text, server + 'update_tweets')

key_words=getTopWords(tweets, window_length, sliding_interval)
sendTopWords(key_words, server + 'update_counts')

hashtag=getTopHashTags(tweets_sentiment_analysed, window_length, sliding_interval)
mention=getTopMentioned(tweets_sentiment_analysed, window_length, sliding_interval)
activeuser=getTopActive(tweets_sentiment_analysed, window_length, sliding_interval)

send_top_to_dashboard(hashtag)
send_top_to_dashboard(mention)
send_top_to_dashboard(activeuser)

In [15]:
getTopActive(tweets_sentiment_analysed, window_length, sliding_interval)



<pyspark.streaming.dstream.TransformedDStream at 0x7fbe010d80a0>

In [13]:
getTopMentioned(tweets_sentiment_analysed, window_length, sliding_interval)

<pyspark.streaming.dstream.TransformedDStream at 0x7f8bb7ed5d30>

In [16]:
# Start computing
ssc.start()        
# Wait for termination
ssc.awaitTermination()
ssc.stop(stopGraceFully = True)

-------------------------------------------
Time: 2021-05-26 16:37:24
-------------------------------------------
('Shambles151', (1, (0, 0, 1)))
('LincsLimpet', (1, (0, 0, 1)))
('bryspeed1e', (1, (0, 0, 1)))
('RiteshY15653204', (1, (1, 0, 0)))
('Sakthim08061171', (1, (0, 0, 1)))

-------------------------------------------
Time: 2021-05-26 16:37:30
-------------------------------------------
('Shambles151', (1, (0, 0, 1)))
('Sangfro76889793', (1, (1, 0, 0)))
('ASTRALlife_', (1, (0, 0, 1)))
('Hairyloon', (1, (1, 0, 0)))
('LincsLimpet', (1, (0, 0, 1)))
('AmberL_77', (1, (0, 0, 1)))
('marciaj64', (1, (0, 0, 1)))
('MonicaChangFury', (1, (1, 0, 0)))
('pdr212004', (1, (0, 0, 1)))
('slganesh1', (1, (0, 0, 1)))
...

-------------------------------------------
Time: 2021-05-26 16:37:36
-------------------------------------------
('RiteshY15653204', (3, (2, 0, 1)))
('LincsLimpet', (2, (1, 0, 1)))
('IamPunithGowda', (1, (0, 0, 1)))
('Shambles151', (1, (0, 0, 1)))
('Sangfro76889793', (1, (1, 0, 0

KeyboardInterrupt: 

In [5]:
server = 'http://localhost:5000/'
geodata_path='/Users/shawvin/Desktop/Big data project/geo_tweets.txt'

sendGeoData(geodata_path, server + 'update_geodata')
sendTweetsFromStream(kvs, server + 'update_tweets')
sendTopHashtagsFromStream(kvs, server + 'update_hashtagcounts')
sendTopWordsFromStream(kvs, server + 'update_counts')
sendTweetSentimentsFromStream(kvs, server + 'update_sentiments')