In [None]:
import json
import re  # regex for cleaning the tweets
import socket
import sys

import pandas as pd
import requests
import requests_oauthlib
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark import StorageLevel
from pyspark.sql import SQLContext
from pyspark.sql.types import StringType
from pyspark.sql.types import StructField
from pyspark.sql.types import StructType
from pyspark.streaming import StreamingContext
from textblob import TextBlob

import twitter_credentials


In [None]:
# Tunable settings for the local Spark job, kept together so a reader can
# adjust them in one place.
SPARK_MASTER = "local[2]"        # run Spark locally with two threads
APP_NAME = "Twitter Dem"
BATCH_INTERVAL_SECONDS = 3       # length of each streaming micro-batch
CHECKPOINT_DIR = "./checkpoint"  # required by updateStateByKey further down

# getOrCreate is called on the class with an explicit conf: instantiating
# SparkContext first (as `SparkContext(...).getOrCreate()` did) always tries to
# build a brand-new context and fails when one is already active, e.g. when
# this cell is re-run in the same kernel.
spark_conf = SparkConf().setMaster(SPARK_MASTER).setAppName(APP_NAME)
sc = SparkContext.getOrCreate(spark_conf)
ssc = StreamingContext(sc, BATCH_INTERVAL_SECONDS)
ssc.checkpoint(CHECKPOINT_DIR)
sqlContext = SQLContext(sc)
# Empty pandas frame with the (HashTag, Sentiment) columns; nothing in the
# visible cells fills it yet.
df = pd.DataFrame(columns=['HashTag', 'Sentiment'])

In [None]:
class TweetAnalyzer:
    """Clean raw tweet text and classify its sentiment with TextBlob."""

    # Retweet marker, matched only as a standalone token so that words which
    # merely contain "RT" (e.g. "SMART", "PARTY") are left intact.
    _RETWEET_RE = re.compile(r'\bRT\b')
    # @mentions, including any punctuation glued to them (e.g. "@user:").
    _MENTION_RE = re.compile(r'@[^\s]+')
    # Links that start with http://, https:// or www. — ordinary "word.word"
    # text such as "love.it" is NOT treated as a link.
    _URL_RE = re.compile(r'\b(?:https?://|www\.)\S*')
    # Whatever is left that is not a letter, digit, space or tab (plus any
    # remaining mention / scheme://link) is replaced by a space.
    _RESIDUE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    def clean_tweet(self, tweet):
        """Return ``tweet`` reduced to space-separated alphanumeric words.

        Removes the retweet marker, @mentions and links, replaces every other
        non-alphanumeric character with a space and collapses runs of
        whitespace.

        Args:
            tweet: Raw tweet text.

        Returns:
            The cleaned text; an empty string if nothing is left.
        """
        tweet = self._RETWEET_RE.sub("", tweet)
        tweet = self._MENTION_RE.sub("", tweet)
        tweet = self._URL_RE.sub("", tweet)
        return ' '.join(self._RESIDUE_RE.sub(" ", tweet).split())

    def analyze_sentiment(self, tweet):
        """Classify the sentiment of ``tweet`` from its TextBlob polarity.

        Args:
            tweet: Raw tweet text; it is cleaned with ``clean_tweet`` first.

        Returns:
            1 for positive polarity, 0 for exactly neutral, -1 otherwise.
        """
        polarity = TextBlob(self.clean_tweet(tweet)).sentiment.polarity

        if polarity > 0:
            return 1
        elif polarity == 0:
            return 0
        else:
            return -1

In [None]:
def process_rdd(time, rdd):
    """Print a header for the batch followed by a sample of its records.

    Note: a later cell in this notebook defines another ``process_rdd``; once
    that cell runs, it replaces this definition.

    Args:
        time: Batch timestamp; only its ``str()`` form is printed.
        rdd: RDD holding the records of the batch.
    """
    print("----------- %s -----------" % str(time))
    if not rdd.isEmpty():
        # RDDs have no ``show()`` (that is a DataFrame method), so print a
        # bounded sample of the records instead.
        for record in rdd.take(10):
            print(record)

tweeter_analyzer = TweetAnalyzer()
pat = re.compile(r"(#\w+)")  # a hashtag: '#' followed by word characters

# Tweet text arrives line by line over the configured socket.
tweets = ssc.socketTextStream(twitter_credentials.HOST, twitter_credentials.PORT, StorageLevel.MEMORY_AND_DISK)

# Keep only tweets that contain a '#'.
tweetsWithHT = tweets.filter(lambda t: "#" in t)

# Pair each tweet's hashtags with its sentiment label (1, 0 or -1).
hashtags_with_sentiments = tweetsWithHT.map(lambda x: (pat.findall(x), tweeter_analyzer.analyze_sentiment(x)))
# Turn every (hashtags, sentiment) tuple into a list. This name was printed
# below without ever being defined, which raised NameError.
hashtags_with_sentiments_lists = hashtags_with_sentiments.map(lambda elem: list(elem))

hashtags_with_sentiments_lists.pprint()
# TODO: an earlier draft also ran
#   hashtags_with_sentiments_lists.foreachRDD(add_to_df)
# but `add_to_df` is not defined in the visible part of this notebook.

# NOTE(review): the hashtag-count cell further down registers more streams on
# the same `ssc`. Spark does not allow adding computations to a context after
# start(), so presumably only one of the two pipelines should be run per
# kernel session — confirm.
ssc.start()
ssc.awaitTermination()

In [None]:
def aggregate_tags_count(new_values, total_sum):
    """Add this batch's values to the running total for one key.

    Args:
        new_values: Iterable of numbers observed in the current batch.
        total_sum: Total accumulated so far; a falsy value (such as ``None``
            or 0) counts as zero.

    Returns:
        The sum of ``new_values`` plus the previous total.
    """
    batch_total = sum(new_values)
    previous_total = total_sum if total_sum else 0
    return batch_total + previous_total


def get_sql_context_instance(spark_context):
    """Return the module-wide SQLContext, creating it on first use.

    The instance is stored in this module's globals under the key
    'sqlContextSingletonInstance'. When that key already exists the stored
    object is returned as-is and ``spark_context`` is not used.

    Args:
        spark_context: Context passed to ``SQLContext`` when the singleton
            has to be created.

    Returns:
        The cached SQLContext instance.
    """
    module_globals = globals()
    if 'sqlContextSingletonInstance' in module_globals:
        return module_globals['sqlContextSingletonInstance']
    instance = SQLContext(spark_context)
    module_globals['sqlContextSingletonInstance'] = instance
    return module_globals['sqlContextSingletonInstance']


def process_rdd(time, rdd):
    """Show the ten highest hashtag counts of a batch and push them to the dashboard.

    The RDD is converted to a DataFrame with columns ``hashtag`` and
    ``hashtag_count``, registered as the temporary view ``hashtags``, queried
    for the ten rows with the largest ``hashtag_count``, displayed, and handed
    to ``send_df_to_dashboard``.

    Errors raised while processing a batch are reported and swallowed, so one
    failing batch does not end the stream. KeyboardInterrupt and SystemExit
    are not caught and still propagate.

    Args:
        time: Batch timestamp; only its ``str()`` form is printed.
        rdd: RDD of the batch; assumes two-field records (hashtag, count) —
            TODO confirm against the caller.
    """
    print("----------- %s -----------" % str(time))
    if rdd.isEmpty():
        return

    top_tags_query = "select hashtag, hashtag_count from hashtags order by hashtag_count desc limit 10"
    try:
        # Reuse the singleton SQL context bound to this RDD's Spark context.
        sql_context = get_sql_context_instance(rdd.context)
        hashtags_df = sql_context.createDataFrame(rdd).toDF("hashtag", "hashtag_count")
        # createOrReplaceTempView replaces the deprecated registerTempTable.
        hashtags_df.createOrReplaceTempView("hashtags")
        hashtag_counts_df = sql_context.sql(top_tags_query)
        hashtag_counts_df.show()
        send_df_to_dashboard(hashtag_counts_df)
    except Exception as exc:
        # Best effort: report the failure with its message and carry on with
        # the next batch.
        print("Error: %s: %s" % (type(exc).__name__, exc))
        

def send_df_to_dashboard(df, url='http://localhost:5001/updateData', timeout=5):
    """POST the hashtags and counts held in ``df`` to the dashboard endpoint.

    Args:
        df: Spark DataFrame with ``hashtag`` and ``hashtag_count`` columns.
        url: Endpoint that receives the update.
        timeout: Seconds to wait for the dashboard before giving up, so that
            an unresponsive dashboard cannot block the caller indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).
    """
    # Collect once, so labels and counts come from the same rows in the same
    # order and the query is not executed twice.
    rows = df.select("hashtag", "hashtag_count").collect()
    top_tags = [str(row.hashtag) for row in rows]
    tags_count = [row.hashtag_count for row in rows]
    # The endpoint receives both lists as their Python string representation.
    request_data = {'label': str(top_tags), 'data': str(tags_count)}
    response = requests.post(url, data=request_data, timeout=timeout)
    if not response.ok:
        # Report a rejected update instead of ignoring it silently.
        print("Dashboard update failed: HTTP %s" % response.status_code)

In [None]:

def _split_into_words(line):
    """Split one line of stream text on single spaces."""
    return line.split(" ")


def _contains_hash(word):
    """Tell whether the token contains a '#' character."""
    return '#' in word


def _with_initial_count(tag):
    """Pair a token with a count of 1."""
    return (tag, 1)


# Text stream read from the configured socket.
dataStream = ssc.socketTextStream(
    twitter_credentials.HOST,
    twitter_credentials.PORT,
    StorageLevel.MEMORY_AND_DISK,
)

# Break every line into space-separated tokens.
words = dataStream.flatMap(_split_into_words)
# Keep the tokens that contain '#', each as a (token, 1) pair.
hashtags = words.filter(_contains_hash).map(_with_initial_count)
# Fold each batch's pairs into a running total per token.
tags_totals = hashtags.updateStateByKey(aggregate_tags_count)
tags_totals.pprint()
# Hand every batch of totals to process_rdd.
tags_totals.foreachRDD(process_rdd)
# Launch the streaming job, then block until it terminates.
ssc.start()
ssc.awaitTermination()