In [1]:
# Dependencies
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from datetime import datetime
from json import loads

In [2]:
# Kafka source and Spark Streaming context configuration
KAFKA_TOPIC = "social_media_topic"
BOOTSTRAP_SERVER = "localhost:9092"
BATCH_INTERVAL_SECONDS = 1  # one micro-batch per second
CHECKPOINT_DIR = "./checkpoint"

# NOTE(review): `sc` is not defined anywhere in this notebook; presumably the
# kernel provides it as a ready-made SparkContext -- confirm before running.
ssc = StreamingContext(sc, BATCH_INTERVAL_SECONDS)
ssc.checkpoint(CHECKPOINT_DIR)

# Direct stream over the single configured topic.
kafka_params = {"metadata.broker.list": BOOTSTRAP_SERVER}
lines = KafkaUtils.createDirectStream(ssc, [KAFKA_TOPIC], kafka_params)

In [3]:
def calculate_data(lines, window=60, sliding=60):
    """Turn every raw stream record into a ``((platform, minute), counters)`` pair.

    Args:
        lines: Stream-like object exposing ``map``. Each element is indexed
            with ``[1]`` to obtain a JSON string describing one post.
        window: Accepted for the caller's convenience; not used here.
        sliding: Accepted for the caller's convenience; not used here.

    Returns:
        The result of ``lines.map`` applied with the per-record mapper. Each
        mapped element is ``((platform, minute_string_or_None), dict)``.
    """
    # One row per supported platform:
    # (platform name, key path to the timestamp, key path to the author id).
    # A ``None`` author path means no ``user_ids`` entry is produced.
    field_paths = (
        ("twitter", ("created_at",), ("user_id_str",)),
        ("youtube", ("snippet", "publishedAt"), None),
        ("instagram", ("created_time",), ("user", "id")),
        ("facebook", ("created_time",), ("from", "id")),
    )

    # Platforms whose timestamps are parsed with ``strptime``.
    strptime_formats = {
        "twitter": "%a %b %d %H:%M:%S %z %Y",
        "youtube": "%Y-%m-%dT%H:%M:%SZ",
        "facebook": "%Y-%m-%dT%H:%M:%S%z",
    }

    def dig(record, path):
        """Follow ``path`` key by key into the nested ``record``."""
        for key in path:
            record = record[key]
        return record

    def convert_timestamp(ts, socmed_type):
        """Parse a platform timestamp and return it truncated to the minute."""
        if socmed_type in strptime_formats:
            moment = datetime.strptime(ts, strptime_formats[socmed_type])
        elif socmed_type == "instagram":
            # Instagram supplies epoch seconds; fromtimestamp() yields local time.
            moment = datetime.fromtimestamp(int(ts))
        else:
            moment = ts

        moment = moment.replace(second=0, microsecond=0)
        return moment.strftime("%Y-%m-%dT%H:%M:%S")

    def map_elements(line):
        """Build the keyed counter record for a single stream element."""
        post = loads(line[1])
        platform = post["crawler_target"]["specific_resource_type"]
        minute_bucket = None
        counters = {}

        for name, ts_path, user_path in field_paths:
            if platform != name:
                continue
            # The timestamp is resolved before the counters are filled in, so
            # a missing timestamp field is reported ahead of a missing user id.
            minute_bucket = convert_timestamp(dig(post, ts_path), name)
            counters["user_count"] = 1
            counters["stream_count"] = 1
            if user_path is not None:
                counters["user_ids"] = [dig(post, user_path)]
            break

        # Unrecognised platforms fall through with a None minute and no counters.
        return ((platform, minute_bucket), counters)

    # Streaming Main
    return lines.map(map_elements)
 
# Build the per-record stream and print a sample of every micro-batch.
social_media = calculate_data(lines)
social_media.pprint()

ssc.start()
try:
    # Blocks until the stream terminates or the cell is interrupted.
    ssc.awaitTermination()
finally:
    # Without this, interrupting the cell leaves the StreamingContext started.
    # Only the streaming side is stopped; the SparkContext is kept alive.
    # A stopped StreamingContext cannot be started again, so re-run the setup
    # cell before starting a new stream.
    ssc.stop(stopSparkContext=False)

-------------------------------------------
Time: 2022-04-25 14:54:15
-------------------------------------------

-------------------------------------------
Time: 2022-04-25 14:54:16
-------------------------------------------

-------------------------------------------
Time: 2022-04-25 14:54:17
-------------------------------------------

-------------------------------------------
Time: 2022-04-25 14:54:18
-------------------------------------------

-------------------------------------------
Time: 2022-04-25 14:54:19
-------------------------------------------

-------------------------------------------
Time: 2022-04-25 14:54:20
-------------------------------------------

-------------------------------------------
Time: 2022-04-25 14:54:21
-------------------------------------------

-------------------------------------------
Time: 2022-04-25 14:54:22
-------------------------------------------

-------------------------------------------
Time: 2022-04-25 14:54:23
----------

-------------------------------------------
Time: 2022-04-25 14:54:55
-------------------------------------------
(('instagram', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['IHBOJYTFOXEKJTTNOLUF']})
(('instagram', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['IHBOJYTFOXEKJTTNOLUF']})
(('instagram', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['IHBOJYTFOXEKJTTNOLUF']})
(('twitter', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['IHBOJYTFOXEKJTTNOLUF']})
(('instagram', '2022-04-25T14:39:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['APTKYXTEOQEEHLYBKDCQ']})
(('instagram', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['OBHDZNEESTTDPDUPBCPL']})
(('instagram', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['FYWPKERXJJMEQGJMVHLS']})
(('twitter', '2022-04-25T14:18:00'), {'user_count': 1, 'stream_count': 1, 'user_ids

-------------------------------------------
Time: 2022-04-25 14:55:03
-------------------------------------------
(('youtube', '2022-04-25T14:52:00'), {'user_count': 1, 'stream_count': 1})
(('twitter', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['ILZCALMIOYOSLVAKMVGB']})
(('twitter', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['ILZCALMIOYOSLVAKMVGB']})
(('instagram', '2022-04-25T14:20:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['ZGDADDYZWCOVJPCHMOMI']})
(('instagram', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['RSIDDOMBXMKLPEUGUWFO']})
(('instagram', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['JMIUAPSPQNTNFNTRYSZV']})
(('twitter', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['DUBLFSJISKAKZGNLCOJW']})
(('instagram', '2022-04-25T14:54:00'), {'user_count': 1, 'stream_count': 1, 'user_ids': ['NSAAZEHULBYPRBULRBLA']})
(('youtube',

KeyboardInterrupt: 