<a href="https://colab.research.google.com/github/WKhisa/Apache-Spark-DataFrames-Project/blob/main/Starter_Code_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sample Starter Code: Basic Stream Processing with Spark Streaming

Here's some sample Python code that uses the kafka-python package to generate random network traffic records and publish them to the Kafka topics `network-traffic` and `processed-data`.

In [None]:
from kafka import KafkaProducer, KafkaAdminClient
from kafka.admin import NewTopic
import time
import random


# Broker address and topic list used by the producer and the admin client
bootstrap_servers = "broker:29092"
topic_names = ["network-traffic", "processed-data"]

# Producer used to publish the generated records; only the broker address is
# passed explicitly
producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

def create_topics_if_not_exist(admin_client, topic_names):
    """Create every topic in ``topic_names`` that the broker does not list yet.

    The existing topics are fetched from the broker once, up front, rather
    than once per requested name. Each missing topic is created with a single
    partition and a replication factor of 1. Creation is best-effort: a
    failure for one topic is printed and the remaining topics are still
    attempted.

    Args:
        admin_client: Object exposing ``list_topics()`` and
            ``create_topics(new_topics)`` (a ``KafkaAdminClient`` in this file).
        topic_names: Iterable of topic name strings to ensure exist.

    Returns:
        None. Progress and failures are reported via ``print``.
    """
    # One metadata request instead of one per requested topic.
    existing_topics = set(admin_client.list_topics())

    for topic_name in topic_names:
        if topic_name in existing_topics:
            continue
        try:
            topic = NewTopic(name=topic_name, num_partitions=1, replication_factor=1)
            admin_client.create_topics([topic])
        except Exception as e:
            # Deliberately broad: topic setup must not abort the script.
            print(f"Failed to create topic {topic_name}: {e}")
        else:
            print(f"Created topic: {topic_name}")
            # Remember the new topic so a duplicate name is not created twice.
            existing_topics.add(topic_name)

# Create an admin client; it is only needed for the one-off topic setup below
admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_servers)

try:
    # Create network-traffic and processed-data topics if they do not exist
    create_topics_if_not_exist(admin_client, topic_names)
finally:
    # Release the admin connection once topic setup is done (or has failed)
    # instead of keeping it open for the rest of the script
    admin_client.close()


# Generate and publish network traffic data to every topic in topic_names.
# The loop runs until the script/cell is interrupted; the finally clause then
# closes the producer so it is not left open with unsent records.
try:
    while True:
        # Generate random network traffic data
        source_ip = '.'.join(str(random.randint(0, 255)) for _ in range(4))
        destination_ip = '.'.join(str(random.randint(0, 255)) for _ in range(4))
        bytes_sent = random.randint(1000, 100000)

        # Encode the comma-separated record once; the same bytes go to each topic
        payload = f"{source_ip},{destination_ip},{bytes_sent}".encode('utf-8')

        # Publish network traffic data to the Kafka topics
        for topic_name in topic_names:
            producer.send(topic_name, payload)
            print(f"Published data to {topic_name}")

        # Wait for 1 second before generating the next network traffic data
        time.sleep(1)
finally:
    # Bounded wait so an unreachable broker cannot hang shutdown indefinitely
    producer.close(timeout=10)