In [1]:
# Dependencies
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from datetime import datetime
from json import loads

In [2]:
# Setup Kafka and Spark Streaming
KAFKA_TOPIC = "social_media_topic"
BOOTSTRAP_SERVER = "localhost:9092"

ssc = StreamingContext(sc, 1) #stream each one second
ssc.checkpoint("./checkpoint")
lines = KafkaUtils.createDirectStream(ssc, [KAFKA_TOPIC],
                                      {"metadata.broker.list": BOOTSTRAP_SERVER})

In [3]:
def calculate_data(lines, window=60, sliding=60):
    def convert_timestamp(ts, socmed_type):
        result = ts
        if socmed_type == "twitter":
            result = datetime.strptime(ts, "%a %b %d %H:%M:%S %z %Y")
        elif socmed_type == "youtube":
            result = datetime.strptime(ts, "%Y-%m-%dT%H:%M:%SZ")
        elif socmed_type == "instagram":
            result = datetime.fromtimestamp(int(result))
        elif socmed_type == "facebook":
            result = datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S%z")

        result = result.replace(second=0, microsecond=0)
        result = datetime.strftime(result, "%Y-%m-%dT%H:%M:%S")
        return result
    
    def map_elements(line):
        el = loads(line[1])
        data = {}
        socmed_type = el["crawler_target"]["specific_resource_type"]
        timestamp = None
        
        if socmed_type == "twitter":
            timestamp = convert_timestamp(el["created_at"], socmed_type)
            data["user_count"] = 1
            data["stream_count"] = 1
            data["user_ids"] = [el["user_id_str"]]
        elif socmed_type == "youtube":
            timestamp = convert_timestamp(el["snippet"]["publishedAt"], socmed_type)
            data["user_count"] = 1
            data["stream_count"] = 1
            if (el["snippet"].get("channelId")):
                data["user_ids"] = [el["snippet"]["channelId"]]
            else :
                data["user_ids"] = ["no_user_id_provided"]
        elif socmed_type == "instagram":
            timestamp = convert_timestamp(el["created_time"], socmed_type)
            data["user_count"] = 1
            data["stream_count"] = 1
            data["user_ids"] = [el["user"]["id"]]
        elif socmed_type == "facebook":
            timestamp = convert_timestamp(el["created_time"], socmed_type)
            data["user_count"] = 1
            data["stream_count"] = 1
            data["user_ids"] = [el["from"]["id"]]

        return ((socmed_type, timestamp), data)
    
    def reducer(a,b):
        # User Ids
        new_user_ids = list(set(a['user_ids'] + b['user_ids']))
        
        # User Count
        new_user_count = len(new_user_ids)
        
        # Stream Count
        new_stream_count = a['stream_count'] + b['stream_count']
        
        return {
            "user_count": new_user_count,
            "stream_count": new_stream_count,
            "user_ids": new_user_ids
        }
    
    def updateFunction(a,b):
        if b is None:
            b = {
                "user_count": 0,
                "stream_count": 0,
                "user_ids": []
            }
        
        # User Ids
        new_user_ids = list(set(a['user_ids'] + b['user_ids']))
        
        # User Count
        new_user_count = len(new_user_ids)
        
        # Stream Count
        new_stream_count = a[0]['stream_count'] + b['stream_count']
        
        return {
            "user_count": new_user_count,
            "stream_count": new_stream_count,
            "user_ids": new_user_ids
        }
    
    # Streaming Main
    process_result = lines.map(map_elements)
    
    # Reduce Process
    reduced = process_result.reduceByKeyAndWindow(reducer, window, sliding)
    
    # Update Process
    result = reduced.updateStateByKey(updateFunction)
    
    return reduced
 
social_media = calculate_data(lines)
social_media.pprint()

ssc.start()
ssc.awaitTermination()

aloha
-------------------------------------------
Time: 2022-04-25 16:11:58
-------------------------------------------
(('youtube', '2022-04-25T15:58:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['no_user_id_provided']})
(('instagram', '2022-04-25T15:56:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['WQPTGPHFUBDVRRNVWWRZ']})
(('twitter', '2022-04-25T15:53:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['GNTNFCJZGKRXKEGAIINE']})
(('twitter', '2022-04-25T16:11:00'), {'user_count': 2, 'stream_count': 4, 'user_ids': ['DITFDVBBNBFPUPDNOQAD', 'XLCHWBWVQBHZZTDCOFAM']})
(('youtube', '2022-04-25T16:11:00'), {'user_count': 3, 'stream_count': 5, 'user_ids': ['GYZKIMAVVHSJZJUIEZBZ', 'WQPTGPHFUBDVRRNVWWRZ', 'no_user_id_provided']})
(('instagram', '2022-04-25T16:11:00'), {'user_count': 5, 'stream_count': 7, 'user_ids': ['LVLQUULNCZEZZNZAEVLK', 'WTISGSJXOVWBOCSJRJJP', 'WQPTGPHFUBDVRRNVWWRZ', 'XLCHWBWVQBHZZTDCOFAM', 'UFPXZAJVHPRLHWNIWMCW']})
(('facebook', '2022-04-25T15:40:

-------------------------------------------
Time: 2022-04-25 16:12:04
-------------------------------------------
(('youtube', '2022-04-25T15:58:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['no_user_id_provided']})
(('instagram', '2022-04-25T15:56:00'), {'user_count': 2, 'stream_count': 2, 'user_ids': ['WQPTGPHFUBDVRRNVWWRZ', 'MCLZBAXZWWJYFTGDSMAO']})
(('twitter', '2022-04-25T15:53:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['GNTNFCJZGKRXKEGAIINE']})
(('twitter', '2022-04-25T16:11:00'), {'user_count': 7, 'stream_count': 10, 'user_ids': ['PCYNFJDHSENSAGXYYRBD', 'TMSMBOAEDWRFYMUCPEZT', 'BJEEAWMZQHEXPPTAMOLI', 'GASACYZXYWXOLNGRXDDG', 'XLCHWBWVQBHZZTDCOFAM', 'DITFDVBBNBFPUPDNOQAD', 'SABQJDZCLCHUAZBSAHLH']})
(('youtube', '2022-04-25T16:06:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['SABQJDZCLCHUAZBSAHLH']})
(('youtube', '2022-04-25T15:57:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['no_user_id_provided']})
(('youtube', '2022-04-25T16:02:00'), {'

KeyboardInterrupt: 