In [1]:
import findspark
findspark.init()

In [2]:
import json, requests, sys, re
from nltk.corpus import stopwords
from operator import add
from pyspark import SparkContext
from pyspark import StorageLevel
from pyspark.streaming import StreamingContext
from textblob import TextBlob

# Search term(s) this notebook is about; getTopWords adds them to its stopword
# list so they do not show up in the top-words ranking.
keyword=['myanmar']

In [3]:
# Sentiment classification of a single tweet text
def getSentiment(text):
    """Classify ``text`` as positive, neutral or negative.

    The polarity score reported by TextBlob is compared against a fixed
    neutral band of +/-0.05.

    Returns
    -------
    tuple
        One-hot triple ``(positive, neutral, negative)``.
    """
    band = 0.05
    polarity = TextBlob(text).sentiment.polarity

    if polarity >= band:
        return (1, 0, 0)  # positive
    if polarity > -band:
        return (0, 1, 0)  # neutral
    return (0, 0, 1)  # negative

In [4]:
def getTweetsCounter(dstream_tweets_sentiment_analysed, window_length, sliding_interval):
    """Aggregate the tweet total and per-sentiment totals over a sliding window.

    Each incoming record must carry its one-hot sentiment triple at index 2.
    Every record is re-keyed to the single key ``'count'`` so the windowed
    reduce yields one record shaped
    ``('count', (total, (positive, neutral, negative)))``.

    The resulting stream is registered for ``pprint`` and returned.
    """
    def as_counter(record):
        # One tweet contributes 1 to the total plus its sentiment triple.
        return ('count', (1, record[2]))

    def merge(left, right):
        # Element-wise sum of two (total, (pos, neutral, neg)) accumulators.
        left_sent, right_sent = left[1], right[1]
        merged_sent = (left_sent[0] + right_sent[0],
                       left_sent[1] + right_sent[1],
                       left_sent[2] + right_sent[2])
        return (left[0] + right[0], merged_sent)

    keyed = dstream_tweets_sentiment_analysed.map(as_counter)

    # No inverse function is supplied (second argument is None).
    total_count = keyed.reduceByKeyAndWindow(merge, None, window_length, sliding_interval)

    total_count.pprint()
    return total_count
    
def sendTweetsCounter(sentiments, url, timeout=5):
    """POST the windowed tweet/sentiment totals to ``url`` for every non-empty batch.

    Parameters
    ----------
    sentiments
        Stream whose RDDs hold a record shaped
        ``(name, (total, (positive, neutral, negative)))``, the shape produced
        by getTweetsCounter. Only the first record of each RDD is sent.
    url : str
        Endpoint receiving the form-encoded fields ``positive``, ``neutral``,
        ``negative`` and ``total``.
    timeout : float, optional
        Seconds to wait for the server before giving up on one update.
        Without a timeout a stalled server would block the callback
        indefinitely.
    """
    def takeAndSend(time, rdd):
        if rdd.isEmpty():
            return

        (name, (total, (pos, neutral, neg))) = rdd.first()
        json_data = {'positive': pos, 'neutral': neutral, 'negative': neg, 'total': total}

        try:
            requests.post(url, data=json_data, timeout=timeout)
        except requests.RequestException as err:
            # Best-effort push: losing one dashboard update is preferable to
            # letting the error propagate out of the streaming callback.
            print('POST to {} failed: {}'.format(url, err))

    sentiments.foreachRDD(takeAndSend)

In [5]:
def getTweets(kvs, sliding_interval):
    """Build a stream of tweets per window, ordered by author follower count.

    ``kvs`` must yield raw JSON strings. Each one is parsed and reduced to
    ``(screen_name, text, followers_count, id)``. Records are grouped into
    windows whose length and slide both equal ``sliding_interval``, and each
    window is sorted by follower count, highest first.

    The resulting stream is registered for ``pprint`` and returned.
    """
    def to_record(raw):
        tweet = json.loads(raw)
        author = tweet["user"]
        return (author["screen_name"], tweet["text"], author["followers_count"], tweet["id"])

    def most_followed_first(rdd):
        return rdd.sortBy(lambda record: record[2], ascending=False)

    windowed = kvs.map(to_record).window(sliding_interval, sliding_interval)
    tweets_text = windowed.transform(most_followed_first)

    tweets_text.pprint()
    return tweets_text
    
def sendTweets(tweets, url, timeout=5):
    """POST the first ten tweets of every non-empty batch to ``url``.

    Parameters
    ----------
    tweets
        Stream whose records are ``(user, text, follower_count, tweet_id)``,
        the shape produced by getTweets.
    url : str
        Endpoint receiving the form-encoded fields ``user``, ``text`` and
        ``id``; each value is the ``str()`` of a Python list.
    timeout : float, optional
        Seconds to wait for the server before giving up on one update.
        Without a timeout a stalled server would block the callback
        indefinitely.
    """
    def takeAndSend(time, rdd):
        if rdd.isEmpty():
            return

        tweets_data = rdd.take(10)

        users = [record[0] for record in tweets_data]
        texts = [record[1] for record in tweets_data]
        tweet_ids = [record[3] for record in tweets_data]

        json_data = {'user': str(users), 'text': str(texts), 'id': str(tweet_ids)}

        try:
            requests.post(url, data=json_data, timeout=timeout)
        except requests.RequestException as err:
            # Best-effort push: losing one dashboard update is preferable to
            # letting the error propagate out of the streaming callback.
            print('POST to {} failed: {}'.format(url, err))

    tweets.foreachRDD(takeAndSend)

In [6]:
def getTopWords(tweets, window_length, sliding_interval):
    """Count plain words in tweet texts over a sliding window, most frequent first.

    Parameters
    ----------
    tweets
        Stream whose records carry the tweet text at index 1.
    window_length, sliding_interval
        Window and slide durations handed to ``reduceByKeyAndWindow``.

    Returns
    -------
    Stream of ``(word, count)`` pairs sorted by count, descending. The stream
    is also registered for ``pprint``.

    Words are lower-cased; English stopwords, ``'rt'``, the module-level
    ``keyword`` entries, tokens shorter than two characters, hashtags and
    mentions are excluded.
    """
    # Remove URLs and the literal "[link]" placeholder, then tokenise.
    # str.replace removes the token itself; str.strip('[link]') would instead
    # strip any of the characters [, l, i, n, k, ] from both ends of the text
    # and eat real letters (e.g. "think" -> "th").
    words = tweets.map(lambda line: re.sub(r'http\S+', '', line[1])) \
                  .map(lambda line: re.sub(r'bit\.ly/\S+', '', line)) \
                  .map(lambda line: line.replace('[link]', '')) \
                  .flatMap(lambda line: re.split(r"[\n;,\.\s]", line))

    ## This part does the word count
    # A set gives constant-time membership tests in the per-word filter.
    sw = set(stopwords.words('english'))
    sw.update(['rt'] + keyword)

    counts = words.map(lambda word: word.strip().lower()) \
                  .filter(lambda word: word not in sw) \
                  .filter(lambda word: len(word) >= 2 and word[0] != '#' and word[0] != '@') \
                  .map(lambda word: (word, 1)) \
                  .reduceByKeyAndWindow(add, None, window_length, sliding_interval) \
                  .transform(lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))

    counts.pprint()
    return counts

In [7]:
def getTopHashTags(tweets, window_length, sliding_interval):
    """Count hashtags in tweet texts over a sliding window, most frequent first.

    Parameters
    ----------
    tweets
        Stream whose records carry the tweet text at index 1.
    window_length, sliding_interval
        Window and slide durations handed to ``reduceByKeyAndWindow``.

    Returns
    -------
    Stream of ``(hashtag, count)`` pairs sorted by count, descending. The
    stream is also registered for ``pprint``. Hashtags are lower-cased and
    keep their leading ``#``.
    """
    # Remove URLs and the literal "[link]" placeholder, then tokenise.
    # str.replace removes the token itself; str.strip('[link]') would instead
    # strip any of the characters [, l, i, n, k, ] from both ends of the text,
    # truncating e.g. a trailing "#berlin" to "#ber".
    words = tweets.map(lambda line: re.sub(r'http\S+', '', line[1])) \
                  .map(lambda line: re.sub(r'bit\.ly/\S+', '', line)) \
                  .map(lambda line: line.replace('[link]', '')) \
                  .flatMap(lambda line: re.split(r"[\n;,\.\s]", line))

    hashtags = words.map(lambda word: word.strip().lower()) \
            .filter(lambda word: len(word) >= 2 and word[0] == '#') \
            .map(lambda word: (word, 1)) \
            .reduceByKeyAndWindow(add, None, window_length, sliding_interval) \
            .transform(lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))

    hashtags.pprint()
    return hashtags

In [8]:
def sendTopWords(counts, url, num, timeout=5):
    """POST the first ``num`` (word, count) pairs of every non-empty batch to ``url``.

    Parameters
    ----------
    counts
        Stream of ``(word, count)`` pairs, e.g. the result of getTopWords or
        getTopHashTags.
    url : str
        Endpoint receiving the form-encoded fields ``words`` and ``counts``;
        each value is the ``str()`` of a Python list.
    num : int
        Number of leading pairs to take from each RDD.
    timeout : float, optional
        Seconds to wait for the server before giving up on one update.
        Without a timeout a stalled server would block the callback
        indefinitely.
    """
    def takeAndSend(time, rdd):
        if rdd.isEmpty():
            return

        word_counts = rdd.take(num)

        words = [word for (word, count) in word_counts]
        values = [count for (word, count) in word_counts]

        json_data = {'words': str(words), 'counts': str(values)}
        print(json_data)

        try:
            requests.post(url, data=json_data, timeout=timeout)
        except requests.RequestException as err:
            # Best-effort push: losing one dashboard update is preferable to
            # letting the error propagate out of the streaming callback.
            print('POST to {} failed: {}'.format(url, err))

    counts.foreachRDD(takeAndSend)

In [9]:
def sendGeoData(path, url, timeout=30):
    """Read stored tweets, extract their coordinates and POST them to ``url`` once.

    Parameters
    ----------
    path : str
        Path or glob passed to ``sc.textFile``. Each line must be a JSON
        document with a ``data`` list of tweet objects.
    url : str
        Endpoint receiving the form-encoded fields ``longitude`` and
        ``latitude``; each value is the ``str()`` of a Python list.
    timeout : float, optional
        Seconds to wait for the server; pass ``None`` to wait indefinitely.

    Raises
    ------
    requests.RequestException
        If the POST fails or times out. This is a one-off call, so the error
        is left visible to the caller.

    Notes
    -----
    Relies on the module-level SparkContext ``sc``. Tweets whose ``geo`` field
    is absent, null or lacks ``coordinates`` are skipped.
    """
    geodata = sc.textFile(path).map(lambda x: x.encode("ascii","ignore"))\
                .map(lambda x:json.loads(x))\
                .flatMap(lambda x: x['data'])\
                .map(lambda tweet: tweet.get('geo') or {})\
                .filter(lambda geo: 'coordinates' in geo)\
                .map(lambda geo: geo["coordinates"]["coordinates"])\
                .collect()

    # Index 0 of each collected pair goes to longitude, index 1 to latitude.
    longitudes = [point[0] for point in geodata]
    latitudes = [point[1] for point in geodata]

    json_data = {'longitude': str(longitudes), 'latitude': str(latitudes)}
    requests.post(url, data=json_data, timeout=timeout)

In [10]:
sc = SparkContext(appName="tweetStream")
# Create a local StreamingContext with a batch interval of 2 seconds.
# window_length and sliding_interval are handed to the windowed operations
# defined above; both are multiples of batch_interval.
batch_interval = 2
window_length = 15*60
sliding_interval = 6

ssc = StreamingContext(sc, batch_interval)
# Checkpoint directory for the streaming context.
ssc.checkpoint("twittercheckpt")

# Create a DStream that connects to hostname:port
tweetStream = ssc.socketTextStream("0.0.0.0", 5555)

In [11]:
# Parse every raw line as JSON and keep only (screen_name, text).
tweets = tweetStream. \
        map(lambda  x: json.loads(x)). \
        map(lambda json_object: (json_object["user"]["screen_name"], json_object["text"]))

# Append the one-hot sentiment triple: (screen_name, text, (pos, neutral, neg)).
tweets_sentiment_analysed = tweets. \
        map(lambda x: (x[0], x[1], getSentiment(x[1])))

In [12]:
#tweets_sentiment_analysed.persist(StorageLevel.MEMORY_AND_DISK)

In [14]:
# Base URL of the server that receives all updates below.
server = 'http://localhost:5000/'

# Windowed tweet total and per-sentiment totals.
tweet_counters = getTweetsCounter(tweets_sentiment_analysed, window_length, sliding_interval)
sendTweetsCounter(tweet_counters,  server +'update_sentiments')

# getTweets takes the raw stream because it parses the JSON itself and needs
# fields (followers_count, id) that `tweets` does not carry.
tweet_text= getTweets(tweetStream, sliding_interval)
sendTweets(tweet_text, server + 'update_tweets')

# Top 10 plain words.
key_words=getTopWords(tweets, window_length, sliding_interval)
sendTopWords(key_words, server + 'update_counts', 10)

# Top 30 hashtags; reuses sendTopWords since the record shape is the same.
hashtag=getTopHashTags(tweets, window_length, sliding_interval)
sendTopWords(hashtag, server + 'update_hashtagcounts', 30)

# Geo data is read from files and posted once, right here, not per batch.
geodata_path='geoData/tweet_geo*.json'
sendGeoData(geodata_path, server + 'update_geodata')

In [None]:
#getTweets(tweetStream, sliding_interval)

In [None]:
#getTopWords(tweets, window_length, sliding_interval)

In [None]:
#getTopHashTags(tweets_sentiment_analysed, window_length, sliding_interval)

In [None]:
# Start computing
ssc.start()
try:
    # Wait for termination
    ssc.awaitTermination()
finally:
    # Runs even when the wait is interrupted (e.g. kernel interrupt) or raises,
    # so the streaming context is not left running behind the notebook.
    ssc.stop(stopGraceFully = True)

-------------------------------------------
Time: 2021-05-28 14:35:22
-------------------------------------------
('count', (99, (7, 41, 51)))

-------------------------------------------
Time: 2021-05-28 14:35:22
-------------------------------------------
('merrieslili', 'Shamless @ASEAN wants @UN to drop a call to suspend #GlobalArmsEmbargo to Myanmar Junta who shot and killed 831 civ… https://t.co/rANaQ17m2o', 994, 1398165948192878598)
('KMThein4', 'Inviting MAL without a proper plan. \n\nNo follow up on 5-point consensus. \n\nUndermining Myanmar’s democracy in UN Ge… https://t.co/6jASTjeDvZ', 885, 1398165952684912641)
('Eisan43308395', 'RT @AllianceMilkTea: So the @ASEAN summit was held just to pat MAL on the back?\n\nhttps://t.co/5INljse66Q', 858, 1398165957621604358)
('AelniHH', 'The existence of @ASEAN is a bit questioning now, for being useless or for even being abhorrent. They become a thre… https://t.co/XNRRdv71qm', 642, 1398165959505068032)
('illa_1005', '@ASEAN is useless.

-------------------------------------------
Time: 2021-05-28 14:35:28
-------------------------------------------
('asean', 66)
('arms', 58)
('embargo', 54)
('even', 51)
('resolution', 47)
('letter', 39)
('united', 37)
('nations', 36)
('"abolish', 36)
('sends', 36)
...

{'words': '[\'asean\', \'arms\', \'embargo\', \'even\', \'resolution\', \'letter\', \'united\', \'nations\', \'"abolish\', \'sends\']', 'counts': '[66, 58, 54, 51, 47, 39, 37, 36, 36, 36]'}
-------------------------------------------
Time: 2021-05-28 14:35:28
-------------------------------------------
('#globalarmsembargo', 31)
('#whatshappeninginmyanmar', 13)
('#aseanimposejuntaembargo', 9)
('#aseanrejectsac', 9)
('#may28coup…', 8)
('#myanmar', 6)
('#may28coup', 5)
('#whatshappenin…', 4)
('#shameonyouasean', 4)
('#whatshappeninginmyanmar…', 2)
...

{'words': '[\'#globalarmsembargo\', \'#whatshappeninginmyanmar\', \'#aseanimposejuntaembargo\', \'#aseanrejectsac\', \'#may28coup…\', \'#myanmar\', \'#may28coup\', \'#whats

-------------------------------------------
Time: 2021-05-28 14:35:34
-------------------------------------------
('count', (393, (47, 177, 169)))

-------------------------------------------
Time: 2021-05-28 14:35:34
-------------------------------------------
('spring_2021', "Even though ASEAN has shown little sign of respect towards the 'consensus' ASEAN states, they are trying to avert a… https://t.co/52PYIbcSC8", 4237, 1398166007953330176)
('mrattphyo12', "Even though ASEAN has shown little sign of respect towards the 'consensus' ASEAN states, they are trying to avert a… https://t.co/hH9h2fEu0w", 1903, 1398165988252680196)
('Waitun05056243', 'Shamless @ASEAN wants @UN to drop a call to suspend #GlobalArmsEmbargo to Myanmar Junta who shot and killed 831 civ… https://t.co/uU8JinHpGO', 1725, 1398166016589398017)
('_sutheint', '@ASEAN sends letter to United Nations to "Abolish the clause on the arms embargo in the @UN Resolution on Myanmar".… https://t.co/t4aiVVqSW8', 1670, 1398166001