In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, when, isnan, isnull
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import Word2Vec
from pyspark.ml.classification import RandomForestClassifier

# Start (or attach to) the Spark session used by this notebook.
# NOTE: getOrCreate() reuses an already-running session; in that case Spark
# warns that only runtime SQL configurations take effect, so the memory
# settings below are only honoured when this call actually creates the session.
spark = (
    SparkSession.builder
    .appName("SentimentAnalysisBaseline")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "32g")
    .getOrCreate()
)

# Load the raw CSV with a header row and an inferred schema; multiLine lets a
# quoted field span several physical lines.
df = spark.read.csv("../ggg_sg.csv", header=True, inferSchema=True, multiLine=True, escape='"')

# Rows without text cannot be tokenized, so drop them up front.
df = df.filter(df.ContextualText.isNotNull())

# Cast BEFORE filtering nulls: a value that cannot be parsed as a float turns
# into null during the cast (non-ANSI mode), so filtering first would let
# those nulls through to the labelling step.
df = df.withColumn("DocTone", df["DocTone"].cast(FloatType()))
df = df.filter(df.DocTone.isNotNull())
# Create sentiment label: Positive (2), Neutral (1), Negative (0)
def sentiment_label(score):
    """Map a tone score to a three-class sentiment label.

    Args:
        score: Numeric tone score, or ``None`` for a missing value.

    Returns:
        ``2`` (positive) when ``score`` is strictly greater than 1.9910,
        ``0`` (negative) when it is strictly less than -2.0202,
        ``1`` (neutral) otherwise, and ``None`` when ``score`` is ``None``.
    """
    # A missing score has no meaningful label; propagate the null instead of
    # raising TypeError on the comparison below (which would abort the job
    # when this function runs as a Spark UDF).
    if score is None:
        return None
    if score > 1.9910:
        return 2
    if score < -2.0202:
        return 0
    return 1
    
# Expose the Python labelling function to Spark as a UDF that yields integers.
sentiment_udf = udf(sentiment_label, returnType=IntegerType())

# Add the integer class label, computed row by row from the tone score.
df = df.withColumn(
    "label",
    sentiment_udf(col("DocTone")),
)
# Text preprocessing: tokenize the raw text, then strip stop words.
tokenizer = Tokenizer(
    inputCol="ContextualText",
    outputCol="words",
)
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)

# Feature extraction (Word2Vec): 100-dimensional vectors built from the
# stop-word-filtered tokens, written to the "features" column.
word2Vec = Word2Vec(
    inputCol="filtered_words",
    outputCol="features",
    vectorSize=100,
    minCount=5,
)

# Chain the stages so they are fitted and applied in this order.
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        word2Vec,
    ]
)
# 80/20 train/test split; the fixed seed keeps the split reproducible.
trainingData, testData = df.randomSplit([0.8, 0.2], seed=42)

# Fit the feature pipeline on the training split only, then apply the fitted
# stages to both splits so they share the same learned transformation.
pipelineModel = pipeline.fit(trainingData)
trainingData, testData = (
    pipelineModel.transform(trainingData),
    pipelineModel.transform(testData),
)

# Random Forest model: 50 trees, depth capped at 10, seeded for repeatability.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=50,
    maxDepth=10,
    seed=42,
)

# Train on the feature-transformed training split.
rfModel = rf.fit(trainingData)

24/09/27 14:25:29 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
[Stage 5:>                                                          (0 + 1) / 1]

In [None]:
# Score the held-out split with the trained forest.
predictions = rfModel.transform(testData)

# Evaluate model: overall accuracy of "prediction" against "label".
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = %g " % accuracy)

# Detailed evaluation
def _metric_evaluator(metric_name):
    """Return an evaluator for ``metric_name`` over the label/prediction columns."""
    return MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name,
    )


# Weighted precision.
evaluator_precision = _metric_evaluator("weightedPrecision")
precision = evaluator_precision.evaluate(predictions)

# Weighted recall.
evaluator_recall = _metric_evaluator("weightedRecall")
recall = evaluator_recall.evaluate(predictions)

# F1 score.
evaluator_f1 = _metric_evaluator("f1")
f1 = evaluator_f1.evaluate(predictions)

# Report the three metrics in the same order they were computed.
for report_line in (
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
):
    print(report_line)

# Show a sample of texts (truncated to 50 characters) with true and predicted labels.
sample_columns = predictions.select("ContextualText", "label", "prediction")
sample_columns.show(10, truncate=50)