# Notebook to play around with the Python Kafka Modules

In [6]:
# !pip install kafka-python
# ! pip install lorem_text names

In [33]:
from kafka import KafkaProducer
from kafka import KafkaConsumer

import json
import os
import uuid
from datetime import datetime
import time
from lorem_text import lorem
import names
import random

KAFKA_SERVER = "kafka.kafka.svc.cluster.local:9092"
KAFKA_TOPIC = "twitter-raw"
GROUP_ID=str(uuid.uuid1())

In [23]:
# Initialize producer variable
producer = KafkaProducer(bootstrap_servers=KAFKA_SERVER,value_serializer=lambda x: json.dumps(x).encode('utf-8')) 



## Random tweet generator

In [10]:
# create random data
# #################################

# country list
country_list=[
              {"country": "Germany","language": "DE"},
              {"country": "USA","language": "EN"},
              {"country": "United Kingdom","language": "EN"},
              {"country": "France","language": "FR"},
              {"country": "India","language": "EN"},
              {"country": "Spain","language": "ES"},
              {"country": "Brasil","language": "PT"}
             ]

# create name list of 50 names
names_list=set()
for n in range(50):
    names_list.add(names.get_first_name())

# create unique set of name, country, language
user_country_list=[]
for name in names_list:
    random_country=random.choice(country_list)
    user_country_list.append({
        "user_name": name, 
        "user_location": random_country["country"], 
        "language": random_country["language"],
        "user_follower_count": random.randint(100, 5000),
        "user_friends_count": random.randint(1000, 50000)
    })
   

# hasthag list
hastag_list=["DataScience","ML","AI","Data","DataEngineering","machinelearning", "iot","analytics","dataanlytics","cloud"]


In [11]:
def create_random_message(counter):
    random_user=random.choice(user_country_list)
    message={
        "Id": str(uuid.uuid1()),
        "Counter": str(counter),
        "CreatedAt": time.time(),
        "Text": lorem.sentence(),
        "User": {
            "ScreenName": random_user["user_name"],
            "Location": random_user["user_location"],
            "FollowersCount": random_user["user_follower_count"],
            "FriendsCount": random_user["user_friends_count"]
        },
        "Lang": random_user["language"],
        "HashtagEntities": random.choices(hastag_list, k=random.randint(1, 5)),
        "RetweetCount": random.randint(0, 1000)
    }

    return message

In [20]:
starttime = time.time()
counter=0
looptime=1.0
while True:
    message=create_random_message(counter)
    looptime=random.randint(1, 8)
    counter=counter+1
    print("++ message nr: " + str(counter)+ "+++++++++++++++++++++++++++++++++++++++++++")
    print(json.dumps(message))
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    #producer.send(KAFKA_TOPIC,key=message.get("Id"),value=message)
    producer.send(KAFKA_TOPIC,value=message)
    time.sleep(looptime - ((time.time() - starttime) % looptime))
    
    

++ message nr: 1+++++++++++++++++++++++++++++++++++++++++++
{"Id": "78839fd8-c8b0-11ed-ae74-5ecbfa752d46", "Counter": "0", "CreatedAt": 1679489578.0045042, "Text": "Eos fugiat nobis tempora eaque ipsam dolores eum voluptate consectetur, molestiae debitis consequuntur et possimus fuga quibusdam earum commodi ipsam distinctio esse, repellat laborum quaerat accusamus ratione minus distinctio voluptates sunt, veniam optio dicta fugit magnam delectus deserunt?", "User": {"ScreenName": "Bree", "Location": "France", "FollowersCount": 4428, "FriendsCount": 1682}, "Lang": "FR", "HashtagEntities": ["DataScience", "iot"], "RetweetCount": 14}
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++ message nr: 2+++++++++++++++++++++++++++++++++++++++++++
{"Id": "7ae68cae-c8b0-11ed-ae74-5ecbfa752d46", "Counter": "1", "CreatedAt": 1679489582.0082948, "Text": "Neque dolore nobis numquam magni consectetur eius earum pariatur, amet nesciunt ab error sunt voluptate assumenda plac

KeyboardInterrupt: 

## Tweet Stream Converter

In [24]:
KAFKA_SOURCE_TOPIC = "twitter-raw"
KAFKA_TARGET_TOPIC = "twitter-table"
GROUP_ID=str(uuid.uuid1())

# Initialize consumer variable
consumer = KafkaConsumer (KAFKA_SOURCE_TOPIC, group_id =GROUP_ID,bootstrap_servers = KAFKA_SERVER, value_deserializer=lambda m: json.loads(m.decode('utf-8')))


In [35]:
for message in consumer:
    print(message.value)

{'Id': '32572e12-c8ec-11ed-b911-4ab75a7cc0ea', 'Counter': '198', 'CreatedAt': 1679515230.0765338, 'Text': 'Saepe repellendus commodi facere non laborum iure labore illo ab corporis nulla, dolores amet sint ea?', 'User': {'ScreenName': 'Robert', 'Location': 'Brasil', 'FollowersCount': 2020, 'FriendsCount': 36419}, 'Lang': 'PT', 'HashtagEntities': ['cloud', 'iot', 'DataScience', 'Data', 'iot'], 'RetweetCount': 122}
{'Id': '35ebb5f2-c8ec-11ed-b911-4ab75a7cc0ea', 'Counter': '199', 'CreatedAt': 1679515236.0831115, 'Text': 'Commodi fuga soluta tempora odio vel dolorem aliquid saepe sapiente neque, architecto esse debitis adipisci consequuntur qui quisquam voluptatum explicabo suscipit et voluptas, tempora est eius optio quos veritatis eligendi, ipsa quo soluta numquam consequatur eveniet atque quaerat?', 'User': {'ScreenName': 'Charles', 'Location': 'France', 'FollowersCount': 1229, 'FriendsCount': 1778}, 'Lang': 'FR', 'HashtagEntities': ['ML', 'machinelearning'], 'RetweetCount': 817}
{'Id':

KeyboardInterrupt: 

In [36]:
print("##########################################################")
print("+++ starting stream conversion...")
counter=0
for message in consumer:
    counter=counter+1
    result={
        "tweet_id": str(message.value["Id"]),
        "created_at": datetime.fromtimestamp(int(message.value["CreatedAt"])/1000).strftime('%Y-%m-%d %H:%M:%S'),
        "tweet_message": message.value["Text"],
        "user_name": message.value["User"]["ScreenName"],
        "user_location": message.value["User"]["Location"],
        "user_follower_count": int(message.value["User"]["FollowersCount"]),
        "user_friends_count": int(message.value["User"]["FriendsCount"]),
        "retweet_count": int(message.value["RetweetCount"]),
        "language": message.value["Lang"],
        "hashtags": message.value["HashtagEntities"]
    }
    print("++ total: " + str(counter)+ "+++++++++++++++++++++++++++++++++++++++++++")
    print(json.dumps(result))
    #producer.send(KAFKA_TARGET_TOPIC,value=result)

##########################################################
+++ starting stream conversion...
++ total: 1+++++++++++++++++++++++++++++++++++++++++++
{"tweet_id": "7a8a1082-c8ec-11ed-b911-4ab75a7cc0ea", "created_at": "1970-01-20 10:31:55", "tweet_message": "At suscipit iure, explicabo commodi recusandae exercitationem molestiae doloribus est nesciunt tenetur nulla porro, in modi odio, quas illum vero reprehenderit similique in, officia commodi quos deserunt blanditiis aut repellendus quam?", "user_name": "Leah", "user_location": "Germany", "user_follower_count": 2408, "user_friends_count": 41788, "retweet_count": 26, "language": "DE", "hashtags": ["ML", "dataanlytics"]}
++ total: 2+++++++++++++++++++++++++++++++++++++++++++
{"tweet_id": "7ced3764-c8ec-11ed-b911-4ab75a7cc0ea", "created_at": "1970-01-20 10:31:55", "tweet_message": "Laudantium cumque rerum adipisci ullam quisquam tempora culpa harum libero nobis, amet odit et eligendi nostrum doloremque possimus deleniti mollitia quod unde,

KeyboardInterrupt: 