In [1]:
import os
import time 

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window, length
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType


# PostgreSQL connection settings. Each value is read from the environment when
# the variable is set; the fallbacks are the values this notebook used before,
# so behavior is unchanged when nothing is exported.
jdbc_url = os.environ.get("JDBC_URL", "jdbc:postgresql://postgres:5432/postgres")
connection_properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver"
}


# Schema of the JSON payload carried in each Kafka message value.
# "timestamp" arrives as a string and is cast to a timestamp downstream.
schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("content", StringType(), True),
    StructField("timestamp", StringType(), True),
    StructField("favourites", LongType(), True),
    StructField("reblogs", LongType(), True),
    StructField("hashtags", ArrayType(StringType()), True)
])

# Local Spark session with the Kafka source and the PostgreSQL JDBC driver
# pulled in through spark.jars.packages.
# NOTE(review): the log4j path below looks like a placeholder - confirm the
# file exists, or drop the option.
spark = (
    SparkSession.builder
    .appName("MastodonStreamProcessor")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.shuffle.partitions", "8")
    .config("spark.jars.packages",
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3,"
            "org.apache.kafka:kafka-clients:3.3.1,"
            "org.postgresql:postgresql:42.2.18")
    .config("spark.sql.streaming.checkpointLocation", "/tmp/spark_checkpoint")
    .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:/correct/path/to/log4j.properties")
    .getOrCreate()
)


    

In [2]:
def _jdbc_batch_writer(table):
    """Return a ``foreachBatch`` callback that appends each micro-batch to ``table``."""
    def _write_batch(batch_df, epoch_id):
        # NOTE(review): the queries using this callback run in "complete"
        # output mode, so every micro-batch carries the full result and
        # mode="append" will accumulate repeated rows - confirm this is
        # intended (otherwise consider mode="overwrite" or an upsert).
        batch_df.write.jdbc(
            url=jdbc_url,
            table=table,
            mode="append",
            properties=connection_properties)
    return _write_batch


def setup_stream():
    """Start the Mastodon streaming queries and block until one of them stops.

    Reads JSON messages from the ``mastodonStream`` Kafka topic, keeps the
    rows whose ``content`` contains "AI", and maintains two aggregations:

    * the number of matching toots per 1-hour window of ``timestamp``;
    * the average ``content`` length per ``user_id``.

    Both aggregations are printed to the console and written to PostgreSQL
    (tables ``toot_window_counts`` and ``avg_toot_length``).

    Uses the module-level ``spark``, ``schema``, ``jdbc_url`` and
    ``connection_properties``.

    This call blocks. It returns when any started query terminates and
    raises if that query terminated with an exception.
    """
    # Read from Kafka topic
    kafka_df = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "kafka:9092")
        .option("subscribe", "mastodonStream")
        .load()
    )

    # Parse the Kafka message value as JSON and flatten it into columns
    parsed_df = (
        kafka_df.selectExpr("CAST(value AS STRING)")
        .select(from_json(col("value"), schema).alias("data"))
        .select("data.*")
    )

    # Filter for content containing "AI"
    keyword_filtered_df = parsed_df.filter(col("content").contains("AI"))

    # Count matching toots per 1-hour window, then expose the window bounds
    # as plain columns (the struct column cannot be written as-is over JDBC).
    windowed_df = (
        keyword_filtered_df
        .withColumn("timestamp", col("timestamp").cast("timestamp"))
        .groupBy(window(col("timestamp"), "1 hour"))
        .count()
        .withColumn("window_start", col("window.start"))
        .withColumn("window_end", col("window.end"))
        .drop("window")
    )

    # Calculate average toot length per user
    avg_toot_length_df = (
        keyword_filtered_df
        .withColumn("toot_length", length(col("content")))
        .groupBy("user_id")
        .agg({"toot_length": "avg"})
        .withColumnRenamed("avg(toot_length)", "avg_toot_length")
    )

    # Print every micro-batch of the windowed counts from the driver.
    # (The comment used to sit after a line-continuation backslash, which is
    # a syntax error.)
    windowed_df.writeStream \
        .outputMode("complete") \
        .foreachBatch(lambda df, epochId: df.show()) \
        .start()

    # Write windowed aggregation to PostgreSQL
    windowed_df.writeStream \
        .outputMode("complete") \
        .foreachBatch(_jdbc_batch_writer("toot_window_counts")) \
        .start()

    # Write average toot length to PostgreSQL
    avg_toot_length_df.writeStream \
        .outputMode("complete") \
        .foreachBatch(_jdbc_batch_writer("avg_toot_length")) \
        .start()

    # Console sink for windowed aggregation
    windowed_df.writeStream \
        .outputMode("complete") \
        .format("console") \
        .start()

    # Console sink for average toot length
    avg_toot_length_df.writeStream \
        .outputMode("complete") \
        .format("console") \
        .start()

    # Wait on all started queries at once, so a failure in any sink (including
    # the JDBC writers) is surfaced instead of being silently ignored.
    spark.streams.awaitAnyTermination()


In [3]:
# Start the streaming queries. setup_stream() ends by waiting for query
# termination, so this cell blocks until the stream stops or the kernel is
# interrupted; cells below will not run in the meantime.
setup_stream()

In [4]:
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


class TweetSentimentAnalysis:
    """Bag-of-words logistic-regression sentiment classifier over a tweet CSV.

    The CSV is read without a header and its columns ``_c0`` .. ``_c5`` are
    renamed to ``target``, ``ids``, ``date``, ``flag``, ``user`` and ``text``.
    Uses the module-level ``spark`` session.

    Typical use: ``train_model()`` then ``evaluate_model()``.
    ``preprocess_data()`` is an optional preview step and is not required
    before training.
    """

    # Names given to the header-less CSV columns _c0 .. _c5, in order.
    COLUMN_NAMES = ("target", "ids", "date", "flag", "user", "text")

    def __init__(self, data_path="/home/jovyan/work/data.csv"):
        """Load the dataset and create the (unfitted) feature stages.

        Args:
            data_path: Location of the header-less CSV file to load.
        """
        data = spark.read.csv(data_path, header=False, inferSchema=True)
        for position, name in enumerate(self.COLUMN_NAMES):
            data = data.withColumnRenamed(f"_c{position}", name)
        self.data = data

        # Built here rather than in preprocess_data() so that train_model()
        # does not depend on preprocess_data() having been called first.
        self.tokenizer = Tokenizer(inputCol="text", outputCol="words")
        self.vectorizer = CountVectorizer(inputCol="words", outputCol="features")

        self.vectorized_data = None  # set by preprocess_data()
        self.test_data = None        # set by train_model()
        self.model = None            # set by train_model()

    def show_dataset(self):
        """Print the first 5 rows of the loaded dataset."""
        self.data.show(5)

    def preprocess_data(self):
        """Tokenize and vectorize the full dataset and print a 5-row preview.

        The result is stored in ``self.vectorized_data``. The vectorizer
        fitted here is used only for this preview; ``train_model()`` fits its
        own pipeline on the training split.
        """
        # Tokenize the tweet text
        tokenized_data = self.tokenizer.transform(self.data)

        # Convert words into feature vectors
        self.vectorized_data = self.vectorizer.fit(tokenized_data).transform(tokenized_data)

        # Show the transformed data
        self.vectorized_data.select("text", "words", "features").show(5)

    def train_model(self):
        """Fit tokenizer -> CountVectorizer -> LogisticRegression on an 80% split.

        Adds an integer ``label`` column to ``self.data`` computed as
        ``target / 4`` cast to int (0 -> 0, 4 -> 1), stores the held-out 20%
        in ``self.test_data`` and the fitted pipeline in ``self.model``.
        """
        # Convert target labels (0 -> negative, 4 -> positive)
        self.data = self.data.withColumn("label", (self.data.target / 4).cast("int"))

        # Split the dataset into training and testing sets (fixed seed)
        train_data, self.test_data = self.data.randomSplit([0.8, 0.2], seed=42)

        # Logistic Regression model
        lr = LogisticRegression(featuresCol="features", labelCol="label")

        # Create pipeline
        pipeline = Pipeline(stages=[self.tokenizer, self.vectorizer, lr])

        # Train the model
        self.model = pipeline.fit(train_data)

    def evaluate_model(self):
        """Print the accuracy of the trained model on the held-out split.

        Raises:
            RuntimeError: if ``train_model()`` has not been called yet.
        """
        if self.model is None or self.test_data is None:
            raise RuntimeError("train_model() must be called before evaluate_model()")

        # Make predictions
        predictions = self.model.transform(self.test_data)

        evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)

        print(f"Test Accuracy: {accuracy}")



In [5]:
# Load the tweet CSV and name its columns (done in TweetSentimentAnalysis.__init__).
tsa = TweetSentimentAnalysis()

In [6]:
# Preview the first 5 rows of the loaded dataset.
tsa.show_dataset()

+------+----------+--------------------+--------+---------------+--------------------+
|target|       ids|                date|    flag|           user|                text|
+------+----------+--------------------+--------+---------------+--------------------+
|     0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|     0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|     0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|     0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+------+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [7]:
# Tokenize the text, build count-vector features, and preview 5 rows.
tsa.preprocess_data()

+--------------------+--------------------+--------------------+
|                text|               words|            features|
+--------------------+--------------------+--------------------+
|@switchfoot http:...|[@switchfoot, htt...|(262144,[1,2,4,7,...|
|is upset that he ...|[is, upset, that,...|(262144,[1,4,6,8,...|
|@Kenichan I dived...|[@kenichan, i, di...|(262144,[0,1,2,3,...|
|my whole body fee...|[my, whole, body,...|(262144,[5,6,13,3...|
|@nationwideclass ...|[@nationwideclass...|(262144,[0,7,18,2...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [8]:
# Fit the tokenizer -> CountVectorizer -> LogisticRegression pipeline on an 80% split.
tsa.train_model()

In [9]:
# Print accuracy on the held-out 20% split.
tsa.evaluate_model()

Test Accuracy: 0.7658921470784228


In [10]:
def batch_sentiment_analysis(data, model=None, table="sentiment_analysis"):
    """Score ``data`` and append the (text, prediction) rows to PostgreSQL.

    Args:
        data: DataFrame with the input columns the model expects; it must
            contain a ``text`` column.
        model: Fitted model exposing ``transform``. Defaults to the
            notebook-level ``tsa.model``, looked up only when omitted.
        table: Destination table name; rows are appended.

    Uses the module-level ``jdbc_url`` and ``connection_properties``.
    """
    if model is None:
        # Fall back to the notebook's trained model, as before.
        model = tsa.model

    data_predictions = model.transform(data)
    data_predictions.select("text", "prediction").write \
        .jdbc(url=jdbc_url, table=table, mode="append", properties=connection_properties)

In [11]:
# Load the header-less CSV and name its six columns in a single chained expression.
batch_data = (
    spark.read.csv("/home/jovyan/work/data.csv", header=False, inferSchema=True)
    .withColumnRenamed("_c0", "target")
    .withColumnRenamed("_c1", "ids")
    .withColumnRenamed("_c2", "date")
    .withColumnRenamed("_c3", "flag")
    .withColumnRenamed("_c4", "user")
    .withColumnRenamed("_c5", "text")
)

In [12]:
# Score batch_data with the trained model and append (text, prediction) rows
# to the sentiment_analysis table.
batch_sentiment_analysis(batch_data)

In [1]:
!pip install streamlit



In [None]:
# Launch the Streamlit app in app.py.
# NOTE(review): this appears to keep the cell busy for as long as the server
# runs - interrupt the kernel to stop it.
!streamlit run app.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.32.6:8501[0m
[34m  External URL: [0m[1mhttp://185.62.227.171:8501[0m
[0m
