In [1]:
# Dependencies
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from datetime import datetime
from json import loads

In [2]:
# Setup Kafka and Spark Streaming
KAFKA_TOPIC = "social_media_topic"
BOOTSTRAP_SERVER = "localhost:9092"

ssc = StreamingContext(sc, 60) #stream each one minute
ssc.checkpoint("./checkpoint")
lines = KafkaUtils.createDirectStream(ssc, [KAFKA_TOPIC],
                                      {"metadata.broker.list": BOOTSTRAP_SERVER})

In [3]:
def calculate_data(lines):
    def convert_timestamp(ts, socmed_type):
        result = ts
        if socmed_type == "twitter":
            result = datetime.strptime(ts, "%a %b %d %H:%M:%S %z %Y")
        elif socmed_type == "youtube":
            result = datetime.strptime(ts, "%Y-%m-%dT%H:%M:%SZ")
        elif socmed_type == "instagram":
            result = datetime.fromtimestamp(int(result))
        elif socmed_type == "facebook":
            result = datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S%z")

        result = result.replace(second=0, microsecond=0)
        result = datetime.strftime(result, "%Y-%m-%dT%H:%M:%S")
        return result
    
    def map_elements(line):
        el = loads(line[1])
        data = {}
        socmed_type = el["crawler_target"]["specific_resource_type"]
        timestamp = None
        
        if socmed_type == "twitter":
            timestamp = convert_timestamp(el["created_at"], socmed_type)
            data["user_count"] = 1
            data["stream_count"] = 1
            data["user_ids"] = [el["user_id_str"]]
        elif socmed_type == "youtube":
            timestamp = convert_timestamp(el["snippet"]["publishedAt"], socmed_type)
            data["user_count"] = 1
            data["stream_count"] = 1
            if (el["snippet"].get("channelId")):
                data["user_ids"] = [el["snippet"]["channelId"]]
            else :
                data["user_ids"] = ["no_user_id_provided"]
        elif socmed_type == "instagram":
            timestamp = convert_timestamp(el["created_time"], socmed_type)
            data["user_count"] = 1
            data["stream_count"] = 1
            data["user_ids"] = [el["user"]["id"]]
        elif socmed_type == "facebook":
            timestamp = convert_timestamp(el["created_time"], socmed_type)
            data["user_count"] = 1
            data["stream_count"] = 1
            data["user_ids"] = [el["from"]["id"]]

        return ((socmed_type, timestamp), data)
    
    def reducer(a,b):
        # User Ids
        new_user_ids = list(set(a['user_ids'] + b['user_ids']))
        
        # User Count
        new_user_count = len(new_user_ids)
        
        # Stream Count
        new_stream_count = a['stream_count'] + b['stream_count']
        
        return {
            "user_count": new_user_count,
            "stream_count": new_stream_count,
            "user_ids": new_user_ids
        }
    
    def updateFunction(a,b):
        if b is None:
            b = {
                "user_count": 0,
                "stream_count": 0,
                "user_ids": []
            }
        
        if a == []:
            a = [{
                "user_count": 0,
                "stream_count": 0,
                "user_ids": []
            }]
        
        # User Ids
        new_user_ids = list(set(a[0]['user_ids'] + b['user_ids']))
        
        # User Count
        new_user_count = len(new_user_ids)
        
        # Stream Count
        new_stream_count = a[0]['stream_count'] + b['stream_count']
        
        # Write to SQL
        
        return {
            "user_count": new_user_count,
            "stream_count": new_stream_count,
            "user_ids": new_user_ids
        }
    
    # Streaming Main
    process_result = lines.map(map_elements)
    
    # Reduce Process
    reduced = process_result.reduceByKey(reducer)
    
    # Update Process
    result = reduced.updateStateByKey(updateFunction)
    
    return result
 
social_media = calculate_data(lines)
social_media.pprint()

ssc.start()
ssc.awaitTermination()

-------------------------------------------
Time: 2022-04-25 17:17:18
-------------------------------------------
(('youtube', '2022-04-25T17:17:00'), {'user_count': 1, 'stream_count': 3, 'user_ids': ['no_user_id_provided']})
(('youtube', '2022-04-25T17:15:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['EBTDHTIUNNONELSOPXMK']})
(('youtube', '2022-04-25T16:56:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['QOJTTHNBUIDZCXQWFTPW']})
(('instagram', '2022-04-25T17:16:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['ZSCSCPNERYESQDHAUBPS']})
(('instagram', '2022-04-25T17:17:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['QWEKCNZWSCDRKYNBIRQF']})
(('youtube', '2022-04-25T17:16:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['no_user_id_provided']})
(('twitter', '2022-04-25T17:16:00'), {'user_count': 2, 'stream_count': 2, 'user_ids': ['ZSCSCPNERYESQDHAUBPS', 'UKREMMAYSKNZKHAFOZDJ']})
(('facebook', '2022-04-25T16:58:00'), {'user_count': 1, 'stream_count

-------------------------------------------
Time: 2022-04-25 17:17:30
-------------------------------------------
(('youtube', '2022-04-25T17:17:00'), {'user_count': 3, 'stream_count': 12, 'user_ids': ['ZQLEJBRQEMZUXBXPNFKL', 'no_user_id_provided', 'JDVFVVDPJEJWLUUCMMOK']})
(('youtube', '2022-04-25T17:15:00'), {'user_count': 2, 'stream_count': 2, 'user_ids': ['no_user_id_provided', 'EBTDHTIUNNONELSOPXMK']})
(('youtube', '2022-04-25T16:56:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['QOJTTHNBUIDZCXQWFTPW']})
(('facebook', '2022-04-25T17:17:00'), {'user_count': 6, 'stream_count': 8, 'user_ids': ['ZMGXOEFZMLNPEMLEPUDB', 'OEGBPJMYOLJPTEWRPPPG', 'IWYBTBREQNIVQBKEQFSI', 'FQUZVJABECFNNTAYLKHE', 'ZWXHQBINVOBYFGSENDUU', 'DRVHJMCAIYMQQZAKZEBW']})
(('youtube', '2022-04-25T16:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['ISFYGAOQXSGMLQGWSSBD']})
(('instagram', '2022-04-25T16:53:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['PIQQWBFGJDHZNLLWFKGY']})
(('twitter', '

KeyboardInterrupt: 