In [1]:
# Dependencies
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from json import loads

In [2]:
# Kafka source settings and Spark Streaming context.
KAFKA_TOPIC = "social_media_topic"
BOOTSTRAP_SERVER = "localhost:9092"

# One-second batch interval; checkpoint data is written to ./checkpoint.
# NOTE(review): `sc` is not defined in this notebook -- presumably the
# SparkContext injected by the PySpark kernel; confirm before running.
ssc = StreamingContext(sc, batchDuration=1)
ssc.checkpoint("./checkpoint")

# Direct stream over KAFKA_TOPIC, reached through the broker at BOOTSTRAP_SERVER.
lines = KafkaUtils.createDirectStream(
    ssc,
    topics=[KAFKA_TOPIC],
    kafkaParams={"metadata.broker.list": BOOTSTRAP_SERVER},
)

In [3]:
def calculate_data(lines, window=60, sliding=60):
    """Map every record of ``lines`` to a ``((platform, timestamp), counters)`` pair.

    Args:
        lines: Object exposing ``map(fn)``. Each element must be indexable,
            with a JSON document (string) at index 1.
            NOTE(review): presumably the (key, value) pairs of the Kafka
            direct stream -- confirm against the caller.
        window: Accepted for interface compatibility; not referenced in the body.
        sliding: Accepted for interface compatibility; not referenced in the body.

    Returns:
        Whatever ``lines.map`` returns for the per-record mapper. Each mapped
        element is ``((platform, timestamp), {"user_count": 1, "stream_count": 1})``
        for twitter / youtube / instagram / facebook records, and
        ``((platform, None), {})`` for any other platform value.
    """
    # Where each supported platform keeps its publication timestamp, expressed
    # as the sequence of keys to follow from the top of the decoded document.
    timestamp_paths = (
        ("twitter", ("created_at",)),
        ("youtube", ("snippet", "publishedAt")),
        ("instagram", ("created_time",)),
        ("facebook", ("created_time",)),
    )

    def to_keyed_counters(record):
        payload = loads(record[1])
        platform = payload["crawler_target"]["specific_resource_type"]

        for known_platform, key_path in timestamp_paths:
            if platform == known_platform:
                # Walk the key path down to the timestamp field.
                posted_at = payload
                for key in key_path:
                    posted_at = posted_at[key]
                counters = {"user_count": 1, "stream_count": 1}
                return ((platform, posted_at), counters)

        # Unsupported platform: no timestamp and no counters.
        return ((platform, None), {})

    # Streaming Main
    return lines.map(to_keyed_counters)
 
# Build the per-record mapping and print the first elements of every batch.
social_media = calculate_data(lines)
social_media.pprint()

# Run until the job terminates or the cell is interrupted. The `finally`
# ensures the streaming context is stopped in either case, so an interrupted
# cell does not leave the stream running in the kernel.
# stopSparkContext=False keeps the underlying SparkContext usable afterwards.
ssc.start()
try:
    ssc.awaitTermination()
finally:
    ssc.stop(stopSparkContext=False)

-------------------------------------------
Time: 2022-04-25 10:00:57
-------------------------------------------
(('instagram', '1650855566'), {'user_count': 1, 'stream_count': 1})
(('twitter', 'Mon Apr 25 09:28:36 +0700 2022'), {'user_count': 1, 'stream_count': 1})
(('instagram', '1650854605'), {'user_count': 1, 'stream_count': 1})
(('youtube', '2022-04-25T10:00:54Z'), {'user_count': 1, 'stream_count': 1})
(('facebook', '2022-04-25T10:00:53+0700'), {'user_count': 1, 'stream_count': 1})
(('instagram', '1650855652'), {'user_count': 1, 'stream_count': 1})
(('twitter', 'Mon Apr 25 10:00:37 +0700 2022'), {'user_count': 1, 'stream_count': 1})
(('youtube', '2022-04-25T09:56:29Z'), {'user_count': 1, 'stream_count': 1})
(('youtube', '2022-04-25T09:28:49Z'), {'user_count': 1, 'stream_count': 1})
(('facebook', '2022-04-25T09:33:13+0700'), {'user_count': 1, 'stream_count': 1})
...

-------------------------------------------
Time: 2022-04-25 10:00:58
-------------------------------------------
(

-------------------------------------------
Time: 2022-04-25 10:01:07
-------------------------------------------
(('twitter', 'Mon Apr 25 09:58:44 +0700 2022'), {'user_count': 1, 'stream_count': 1})
(('twitter', 'Mon Apr 25 09:21:11 +0700 2022'), {'user_count': 1, 'stream_count': 1})
(('instagram', '1650855663'), {'user_count': 1, 'stream_count': 1})
(('instagram', '1650855228'), {'user_count': 1, 'stream_count': 1})
(('youtube', '2022-04-25T10:01:03Z'), {'user_count': 1, 'stream_count': 1})
(('twitter', 'Mon Apr 25 10:01:05 +0700 2022'), {'user_count': 1, 'stream_count': 1})
(('facebook', '2022-04-25T09:54:38+0700'), {'user_count': 1, 'stream_count': 1})
(('twitter', 'Mon Apr 25 09:54:24 +0700 2022'), {'user_count': 1, 'stream_count': 1})
(('twitter', 'Mon Apr 25 10:01:02 +0700 2022'), {'user_count': 1, 'stream_count': 1})
(('facebook', '2022-04-25T09:23:45+0700'), {'user_count': 1, 'stream_count': 1})
...

-------------------------------------------
Time: 2022-04-25 10:01:08
-------

KeyboardInterrupt: 