<a href="https://colab.research.google.com/github/ShovalBenjer/Bigdata_Pyspark_Spark_Hadoop_Apache/blob/main/319037404_209017755.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download Apache Spark
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

# Verify the download
!ls -l spark-3.5.0-bin-hadoop3.tgz

# Extract the downloaded file
!tar xf spark-3.5.0-bin-hadoop3.tgz

# Move Spark to /usr/local/spark
!mv spark-3.5.0-bin-hadoop3 /usr/local/spark

# Download Apache Kafka
!wget -q https://archive.apache.org/dist/kafka/3.5.1/kafka_2.13-3.5.1.tgz

# Verify the download
!ls -l kafka_2.13-3.5.1.tgz

# Extract the downloaded file
!tar xf kafka_2.13-3.5.1.tgz

# Move Kafka to /usr/local/kafka
!mv kafka_2.13-3.5.1 /usr/local/kafka

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/usr/local/spark"
os.environ["PATH"] += ":/usr/local/spark/bin"

# Set Kafka environment variable
os.environ["PATH"] += ":/usr/local/kafka/bin"

# Install pyspark
!pip install pyspark kafka-python

im feeling fantastic!-rw-r--r-- 1 root root 400395283 Sep  9  2023 spark-3.5.0-bin-hadoop3.tgz
mv: cannot move 'spark-3.5.0-bin-hadoop3' to '/usr/local/spark/spark-3.5.0-bin-hadoop3': Directory not empty
-rw-r--r-- 1 root root 106748875 Jul 21  2023 kafka_2.13-3.5.1.tgz
mv: cannot move 'kafka_2.13-3.5.1' to '/usr/local/kafka/kafka_2.13-3.5.1': Directory not empty


In [None]:
# Start Zookeeper
!nohup /usr/local/kafka/bin/zookeeper-server-start.sh /usr/local/kafka/config/zookeeper.properties &

# Start Kafka Broker
!nohup /usr/local/kafka/bin/kafka-server-start.sh /usr/local/kafka/config/server.properties &

nohup: appending output to 'nohup.out'
nohup: appending output to 'nohup.out'


In [None]:
# Create 'sentiments' topic
!kafka-topics.sh --create --topic sentiments --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1

# Create 'text' topic
!kafka-topics.sh --create --topic text --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1


Error while executing topic command : Topic 'sentiments' already exists.
[2024-12-21 17:02:10,940] ERROR org.apache.kafka.common.errors.TopicExistsException: Topic 'sentiments' already exists.
 (kafka.admin.TopicCommand$)
Error while executing topic command : Topic 'text' already exists.
[2024-12-21 17:02:13,587] ERROR org.apache.kafka.common.errors.TopicExistsException: Topic 'text' already exists.
 (kafka.admin.TopicCommand$)


In [None]:
from kafka import KafkaProducer, KafkaConsumer
from pyspark.sql import SparkSession
import threading
import time
import json

def producer_sentiments(file_path, topic, bootstrap_servers='localhost:9092'):
    """
    Reads sentiment data from a file and sends it to the specified Kafka topic.

    The file is expected to contain one tab-separated ``word<TAB>score`` pair
    per line. Lines that do not have at least two tab-separated fields, or
    whose score is not an integer, are skipped. The entries are sent in
    batches of up to 100, one batch every 2 seconds, cycling through the file
    endlessly; the function only returns by raising.

    Args:
        file_path (str): Path to the sentiment file (e.g., 'AFINN-111.txt').
        topic (str): Kafka topic to send the data to.
        bootstrap_servers (str): Kafka server address. Default is 'localhost:9092'.

    Raises:
        OSError: If the sentiment file cannot be opened.
        ValueError: If the file contains no usable ``word<TAB>score`` lines.
    """
    # Parse the file before connecting to Kafka so that a missing or empty
    # file does not leave an open producer behind.
    sentiment_data = []
    # NOTE(review): assumes the sentiment file is UTF-8 encoded — confirm.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split('\t')
            if len(fields) < 2:
                continue  # blank or malformed line
            try:
                sentiment_data.append((fields[0], int(fields[1])))
            except ValueError:
                continue  # score is not an integer

    if not sentiment_data:
        # Without any entries the loop below would spin forever sending nothing.
        raise ValueError(f"No sentiment entries found in {file_path!r}")

    producer = KafkaProducer(
        bootstrap_servers=bootstrap_servers,
        value_serializer=lambda v: json.dumps(v).encode('utf-8')
    )
    try:
        while True:
            batch = sentiment_data[:100]
            for word, sentiment in batch:
                producer.send(topic, {'word': word, 'sentiment': sentiment})
            sentiment_data = sentiment_data[100:] + batch  # Rotate data
            time.sleep(2)
    finally:
        # Reached only when the loop is interrupted by an exception.
        producer.close()

def producer_text(topic, bootstrap_servers='localhost:9092'):
    """
    Reads user input from the console and sends it to the specified Kafka topic.

    Each line entered is sent as a UTF-8 encoded string and flushed before
    "Sent: ..." is printed, so the confirmation is only shown once the
    producer has handed the message off. The loop ends when the input stream
    is closed (EOF); the producer is closed on the way out.

    Args:
        topic (str): Kafka topic to send the data to.
        bootstrap_servers (str): Kafka server address. Default is 'localhost:9092'.
    """
    producer = KafkaProducer(
        bootstrap_servers=bootstrap_servers,
        value_serializer=lambda v: v.encode('utf-8')
    )
    try:
        while True:
            try:
                user_input = input("Enter a sentence to analyze: ")
            except EOFError:
                # No more input available; stop instead of dying with a traceback.
                break
            producer.send(topic, user_input)
            # send() only queues the record; flush() blocks until it is delivered.
            producer.flush()
            print(f"Sent: {user_input}")
    finally:
        producer.close()

def spark_kafka_consumer(bootstrap_servers='localhost:9092', sentiment_topic='sentiments', text_topic='text', stop_after=10):
    """
    Consumes messages from two Kafka topics and prints a sentiment level for text messages.

    Messages from ``sentiment_topic`` are JSON objects with 'word' and
    'sentiment' keys and are accumulated into an in-memory dictionary.
    Messages from ``text_topic`` are split on whitespace; the printed TSL is
    the mean of the sentiment values of the words found in that dictionary.
    A Spark session is obtained at start-up, but the calculation itself is
    done in plain Python.

    Args:
        bootstrap_servers (str): Kafka server address. Default is 'localhost:9092'.
        sentiment_topic (str): Kafka topic for sentiment data.
        text_topic (str): Kafka topic for text data.
        stop_after (int): Number of messages to process before stopping.
            Messages from both topics count towards this limit, including
            ones that failed to process. Default is 10.
    """
    from kafka import KafkaConsumer
    from pyspark.sql import SparkSession

    # Start (or reuse) the Spark session.
    # NOTE(review): the session is not used by the computation below.
    spark = SparkSession.builder.appName("KafkaSparkConsumer").getOrCreate()

    # Create Kafka consumer subscribed to both topics
    consumer = KafkaConsumer(
        sentiment_topic,
        text_topic,
        bootstrap_servers=bootstrap_servers,
        value_deserializer=lambda v: v.decode('utf-8'),  # Deserialize as string
    )

    # Dictionary to store sentiments (word -> sentiment value)
    sentiment_dict = {}
    message_count = 0

    try:
        for message in consumer:
            try:
                topic = message.topic
                value = message.value
                print(f"Received message from topic '{topic}': {value}")

                if topic == sentiment_topic:
                    # Parse JSON for sentiment data
                    sentiment_data = json.loads(value)
                    sentiment_dict[sentiment_data['word']] = sentiment_data['sentiment']
                elif topic == text_topic:
                    # Process text data: look up every whitespace-separated word
                    words = value.split()
                    known_sentiments = [sentiment_dict[word] for word in words if word in sentiment_dict]
                    if known_sentiments:
                        tsl = sum(known_sentiments) / len(known_sentiments)
                        print(f"TSL for '{value}': {tsl}")
                    else:
                        print(f"TSL for '{value}': No known words in sentiment dictionary")
            except json.JSONDecodeError:
                print("Error decoding JSON, skipping message:", message.value)
            except Exception as e:
                # Best effort: one bad message must not stop the consumer.
                print(f"Error processing message: {e}")

            message_count += 1
            if message_count >= stop_after:
                print("Processed maximum messages. Stopping consumer...")
                break
    finally:
        # Release the broker connection whether the loop ended by the limit
        # or by an exception.
        consumer.close()

In [None]:
import threading
import time

# Main entry point: start both producers in background threads, run the
# consumer in the foreground, then report the state of the producer threads.
if __name__ == "__main__":
    # Path to the sentiment file
    sentiment_file = "AFINN-111.txt"

    # Start the Kafka producer for 'sentiments' topic
    producer_sentiments_thread = threading.Thread(
        target=producer_sentiments,
        args=(sentiment_file, "sentiments"),
        name="producer_sentiments",
        daemon=True,
    )
    producer_sentiments_thread.start()

    # Start the Kafka producer for 'text' topic
    producer_text_thread = threading.Thread(
        target=producer_text,
        args=("text",),
        name="producer_text",
        daemon=True,
    )
    producer_text_thread.start()

    # Run the Kafka consumer for a limited number of messages
    spark_kafka_consumer(stop_after=10)

    # Give the producer threads a short grace period. join() with a timeout
    # only waits; it does not stop a thread, so report what is really the case
    # instead of unconditionally claiming the producers stopped.
    print("Stopping producer threads...")
    producer_threads = (producer_sentiments_thread, producer_text_thread)
    for producer_thread in producer_threads:
        producer_thread.join(timeout=5)

    still_running = [t.name for t in producer_threads if t.is_alive()]
    if still_running:
        print(
            "Producer threads still running as daemon threads: "
            + ", ".join(still_running)
        )
    else:
        print("Producers stopped.")




Received message from topic 'sentiments': {"word": "boosts", "sentiment": 1}
Received message from topic 'sentiments': {"word": "bore", "sentiment": -2}
Received message from topic 'sentiments': {"word": "bored", "sentiment": -2}
Received message from topic 'sentiments': {"word": "boring", "sentiment": -3}
Received message from topic 'sentiments': {"word": "bother", "sentiment": -2}
Received message from topic 'sentiments': {"word": "bothered", "sentiment": -2}
Received message from topic 'sentiments': {"word": "bothers", "sentiment": -2}
Received message from topic 'sentiments': {"word": "bothersome", "sentiment": -2}
Received message from topic 'sentiments': {"word": "boycott", "sentiment": -2}
Received message from topic 'sentiments': {"word": "boycotted", "sentiment": -2}
Processed maximum messages. Stopping consumer...
Stopping producer threads...
Producers stopped.
