In [1]:
import os

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the notebook-wide Spark session in local mode, pulling the
# Kafka and PostgreSQL JDBC artifacts via spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("MastodonStreamProcessor")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.shuffle.partitions", "8")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3,"
        "org.apache.kafka:kafka-clients:3.3.1,"
        "org.postgresql:postgresql:42.2.18",
    )
    .config("spark.sql.streaming.checkpointLocation", "/tmp/spark_checkpoint")
    # NOTE(review): the log4j path below looks like a placeholder — confirm the
    # file exists and that this Spark version honours -Dlog4j.configuration.
    .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:/correct/path/to/log4j.properties")
    .getOrCreate()
)


In [5]:
class TweetSentimentAnalysis:
    """Train and evaluate a bag-of-words sentiment classifier on a headerless CSV.

    Typical use: construct (loads the CSV), optionally call ``preprocess_data``
    to preview the features, then ``train_model`` followed by ``evaluate_model``.

    Relies on the notebook-level ``spark`` session being defined.
    """

    # Names applied, in order, to Spark's default ``_c0`` ... ``_c5`` columns.
    COLUMN_NAMES = ("target", "ids", "date", "flag", "user", "text")

    def __init__(self, data_path="/home/jovyan/work/data/data.csv"):
        """Load the dataset and create the feature-extraction stages.

        Args:
            data_path: Location of the headerless CSV. Defaults to the path
                this notebook originally hard-coded.
        """
        loaded = spark.read.csv(data_path, header=False, inferSchema=True)
        for position, column_name in enumerate(self.COLUMN_NAMES):
            loaded = loaded.withColumnRenamed(f"_c{position}", column_name)
        self.data = loaded

        # Created here rather than in preprocess_data so that train_model no
        # longer fails with AttributeError when the preview step was skipped.
        self.tokenizer = Tokenizer(inputCol="text", outputCol="words")
        self.vectorizer = CountVectorizer(inputCol="words", outputCol="features")

    def show_dataset(self):
        """Print the first five rows of the loaded dataset."""
        self.data.show(5)

    def preprocess_data(self):
        """Tokenize and count-vectorize the full dataset and preview five rows.

        The result is stored in ``self.vectorized_data``. The vocabulary fitted
        here is for inspection only: ``train_model`` refits the vectorizer as
        part of its pipeline on the training split.
        """
        tokenized_data = self.tokenizer.transform(self.data)
        self.vectorized_data = self.vectorizer.fit(tokenized_data).transform(tokenized_data)

        self.vectorized_data.select("text", "words", "features").show(5)

    def train_model(self):
        """Fit the tokenizer -> count-vectorizer -> logistic-regression pipeline.

        Adds an integer ``label`` column to ``self.data``, splits it 80/20 with
        a fixed seed, keeps the 20% part in ``self.test_data`` and stores the
        fitted pipeline in ``self.model``.
        """
        # Integer cast of target / 4, so target 0 -> label 0 and target 4 -> label 1.
        # withColumn replaces an existing "label" column, so re-running is safe.
        self.data = self.data.withColumn("label", (self.data.target / 4).cast("int"))

        train_data, self.test_data = self.data.randomSplit([0.8, 0.2], seed=42)

        lr = LogisticRegression(featuresCol="features", labelCol="label")
        pipeline = Pipeline(stages=[self.tokenizer, self.vectorizer, lr])

        self.model = pipeline.fit(train_data)

    def evaluate_model(self):
        """Print the accuracy of ``self.model`` on ``self.test_data``.

        Requires ``train_model`` to have been called first, since that is where
        both attributes are set.
        """
        predictions = self.model.transform(self.test_data)

        evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)

        print(f"Test Accuracy: {accuracy}")



In [6]:
# Constructing the analysis object reads the CSV and renames its columns.
tsa = TweetSentimentAnalysis()

In [7]:
# Sanity-check the load: print the first five rows with the renamed columns.
tsa.show_dataset()

+------+----------+--------------------+--------+---------------+--------------------+
|target|       ids|                date|    flag|           user|                text|
+------+----------+--------------------+--------+---------------+--------------------+
|     0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|     0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|     0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|     0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
+------+----------+--------------------+--------+---------------+--------------------+
only showing top 5 rows



In [8]:
# Preview the tokenized words and count-vector features for five rows.
tsa.preprocess_data()

+--------------------+--------------------+--------------------+
|                text|               words|            features|
+--------------------+--------------------+--------------------+
|@switchfoot http:...|[@switchfoot, htt...|(262144,[1,2,4,7,...|
|is upset that he ...|[is, upset, that,...|(262144,[1,4,6,8,...|
|@Kenichan I dived...|[@kenichan, i, di...|(262144,[0,1,2,3,...|
|my whole body fee...|[my, whole, body,...|(262144,[5,6,13,3...|
|@nationwideclass ...|[@nationwideclass...|(262144,[0,7,18,2...|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [9]:
# Fit the tokenizer -> CountVectorizer -> logistic-regression pipeline on the 80% split.
tsa.train_model()

In [10]:
# Score the held-out 20% split and print its accuracy.
tsa.evaluate_model()

Test Accuracy: 0.7658045482560013


In [17]:
# PostgreSQL settings for the JDBC writer. Each value can be overridden through
# an environment variable so real credentials never have to be saved in the
# notebook; the fallbacks are the original local-development values.
jdbc_url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://postgres:5432/postgres")
connection_properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
}


In [18]:
def batch_sentiment_analysis(data):
    """Score ``data`` with the trained pipeline and append the results to Postgres.

    Uses the notebook-level ``tsa.model``, ``jdbc_url`` and
    ``connection_properties``. Only the ``text`` and ``prediction`` columns are
    written, in append mode, to the ``sentiment_analysis`` table.

    Args:
        data: DataFrame to pass to ``tsa.model.transform``.
    """
    scored = tsa.model.transform(data)
    writer = scored.select("text", "prediction").write
    writer.jdbc(
        url=jdbc_url,
        table="sentiment_analysis",
        mode="append",
        properties=connection_properties,
    )

In [19]:
# Read the headerless CSV again for batch scoring and give Spark's default
# _c0 ... _c5 columns their descriptive names.
batch_data = (
    spark.read
    .options(header=False, inferSchema=True)
    .csv("/home/jovyan/work/data/data.csv")
    .withColumnRenamed("_c0", "target")
    .withColumnRenamed("_c1", "ids")
    .withColumnRenamed("_c2", "date")
    .withColumnRenamed("_c3", "flag")
    .withColumnRenamed("_c4", "user")
    .withColumnRenamed("_c5", "text")
)

In [20]:
# Score the reloaded dataset and append (text, prediction) rows to the sentiment_analysis table.
batch_sentiment_analysis(batch_data)