In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, when, isnan, isnull
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import Word2Vec
from pyspark.ml.classification import RandomForestClassifier

spark = SparkSession.builder.appName(
            "SentimentAnalysisBaseline"
        ).config(
            "spark.executor.memory", "16g"
        ).config(
            "spark.driver.memory", "32g"
        ).getOrCreate()

df = spark.read.csv("../ggg_sg.csv", header=True, inferSchema=True, multiLine=True, escape='"')
df = df.filter(df.ContextualText.isNotNull())
df = df.filter(df.DocTone.isNotNull())
df = df.withColumn("DocTone", df["DocTone"].cast(FloatType()))
# Create sentiment label: Positive (2), Neutral (1), Negative (0)
def sentiment_label(score):
    if score > 1.9910:
        return 2
    elif score < -2.0202:
        return 0
    else:
        return 1
    
sentiment_udf = udf(sentiment_label, IntegerType())

df = df.withColumn("label", sentiment_udf(col("DocTone")))
tokenizer = Tokenizer(inputCol="ContextualText", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
# Feature extraction (Word2Vec)
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol="filtered_words", outputCol="features")

pipeline = Pipeline(stages=[tokenizer, remover, word2Vec])
(trainingData, testData) = df.randomSplit([0.8, 0.2], seed=42)

pipelineModel = pipeline.fit(trainingData)
trainingData = pipelineModel.transform(trainingData)
testData = pipelineModel.transform(testData)

# Random Forest model
rf = RandomForestClassifier(featuresCol='features', labelCol='label', numTrees=50, maxDepth=10, seed=42)

rfModel = rf.fit(trainingData)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/27 14:39:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/27 14:49:27 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/09/27 16:21:52 WARN DAGScheduler: Broadcasting large task binary with size 1176.7 KiB
24/09/27 16:23:28 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
24/09/27 16:25:13 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB
24/09/27 16:27:34 WARN DAGScheduler: Broadcasting large task binary with size 1241.4 KiB
24/09/27 16:27:35 WARN DAGScheduler: Broadcasting large task binary with size 8.3 MiB
24/09/27 16:30:37 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
                                                                                

In [2]:
predictions = rfModel.transform(testData)
# Evaluate model
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = %g " % accuracy)

# Detailed evaluation
evaluator_precision = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
precision = evaluator_precision.evaluate(predictions)

evaluator_recall = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator_recall.evaluate(predictions)

evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1 = evaluator_f1.evaluate(predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

predictions.select("ContextualText", "label", "prediction").show(10, truncate=50)

24/09/27 16:30:40 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/09/27 16:32:36 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB


Test Accuracy = 0.679253 


24/09/27 16:34:33 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/09/27 16:36:30 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
24/09/27 16:38:30 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB


Precision: 0.7093272180121379
Recall: 0.6792528027519265
F1 Score: 0.6667760421736545


[Stage 42:>                                                         (0 + 1) / 1]

+--------------------------------------------------+-----+----------+
|                                    ContextualText|label|prediction|
+--------------------------------------------------+-----+----------+
|blink asiaone forum work crime woman gets 7 yrs...|    0|       1.0|
|1 fetch object currentpost scrollintoview true ...|    0|       1.0|
|scrollintoview true vbimageresize blink asiaone...|    0|       0.0|
|b c re letters government must step up enforcem...|    0|       0.0|
|in september re city harvest church appeal case...|    2|       1.0|
|1 fetch object currentpost scrollintoview true ...|    1|       1.0|
|last edited by isingapore yesterday at 11 14 pm...|    0|       1.0|
|air denies forcing job applicants to strip down...|    0|       0.0|
|career a current affair ben mccormack built a r...|    0|       0.0|
|big start small act fast civil servants told by...|    1|       1.0|
+--------------------------------------------------+-----+----------+
only showing top 10 

                                                                                