In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, FloatType, ArrayType
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, StopWordsCleaner, RoBertaEmbeddings, EmbeddingsFinisher

# The Spark NLP jar loaded into the JVM should be the same release as the
# installed Python package, so the Maven coordinate is derived from the package
# instead of being pinned to a literal that can silently drift out of step.
spark_nlp_package = "com.johnsnowlabs.nlp:spark-nlp_2.12:" + sparknlp.version()

# Kryo serialization with a large buffer is the configuration Spark NLP
# documents for manually built sessions; pretrained transformer models are
# large objects and are shipped to the executors.
spark = (
    SparkSession.builder
    .appName("SentimentAnalysisRoBERTaRF")
    .config("spark.jars.packages", spark_nlp_package)
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "32g")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .getOrCreate()
)

print("Spark NLP version: ", sparknlp.version())

In [None]:
# Read the raw CSV; multiLine/escape allow quoted text fields that contain
# newlines and embedded double quotes.
raw_df = spark.read.csv("../ggg_sg.csv", header=True, inferSchema=True, multiLine=True, escape='"')

# Cast DocTone BEFORE dropping nulls: a cast turns any value that cannot be
# parsed as a number into null, so filtering first would let those nulls
# through to the Python labelling UDF, where comparing None with a float fails.
df = (
    raw_df
    .withColumn("DocTone", col("DocTone").cast(FloatType()))
    .filter(col("ContextualText").isNotNull() & col("DocTone").isNotNull())
)

def sentiment_label(score, pos_threshold=1.9910, neg_threshold=-2.0202):
    """Map a tone score to a three-way sentiment class.

    Args:
        score: Numeric tone score, or None (Spark hands SQL nulls to Python
            UDFs as None).
        pos_threshold: Scores strictly above this value are labelled 2.
        neg_threshold: Scores strictly below this value are labelled 0.

    Returns:
        2 for positive, 0 for negative, 1 for everything in between
        (both thresholds inclusive), or None when ``score`` is None.
    """
    # Null in, null out: comparing None with a float raises TypeError in
    # Python 3, which would abort the whole Spark job.
    if score is None:
        return None
    if score > pos_threshold:
        return 2
    if score < neg_threshold:
        return 0
    return 1

# Expose the Python labelling function to Spark as an integer-returning UDF
# and derive the "label" column from the DocTone score.
sentiment_udf = udf(sentiment_label, returnType=IntegerType())
df = df.withColumn("label", sentiment_udf(df["DocTone"]))

# Print how many rows fall into each class, ordered by label value.
label_counts = (
    df.groupBy("label")
    .count()
    .orderBy("label")
)
label_counts.show()

In [None]:
# Create a Spark NLP pipeline
document_assembler = DocumentAssembler() \
    .setInputCol("ContextualText") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(["token"]) \
    .setOutputCol("cleanTokens") \
    .setCaseSensitive(False)

# Use RoBERTa for embeddings
roberta_embeddings = RoBertaEmbeddings.pretrained("roberta_base", "en") \
    .setInputCols(["document", "cleanTokens"]) \
    .setOutputCol("embeddings") \
    .setPoolingStrategy("mean")

# Finisher will output the embeddings to a list
embeddings_finisher = EmbeddingsFinisher() \
    .setInputCols(["embeddings"]) \
    .setOutputCols(["finished_embeddings"]) \
    .setOutputAsVector(True) \
    .setCleanAnnotations(False)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Create a feature vector
vector_assembler = VectorAssembler(inputCols=["finished_embeddings"], outputCol="features")

pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    stopwords_cleaner,
    roberta_embeddings,
    embeddings_finisher,
    vector_assembler
])

(trainingData, testData) = df.randomSplit([0.8, 0.2], seed=42)

pipelineModel = pipeline.fit(trainingData)
trainingData = pipelineModel.transform(trainingData)
testData = pipelineModel.transform(testData)

rf = RandomForestClassifier(featuresCol='features', labelCol='label', numTrees=50, maxDepth=10)

rfModel = rf.fit(trainingData)


In [None]:
# Score the held-out split with the fitted random forest model.
predictions = rfModel.transform(dataset=testData)

In [None]:
# Every evaluate()/show() call below is a separate Spark action. Without a
# cache each one would re-run the whole upstream lineage of `predictions`, so
# materialise just the three columns that are actually read here, once.
scored = predictions.select("ContextualText", "label", "prediction").cache()


def _metric_evaluator(metric_name):
    """Build a label-vs-prediction evaluator for the given multiclass metric name."""
    return MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName=metric_name
    )


evaluator = _metric_evaluator("accuracy")
accuracy = evaluator.evaluate(scored)
print("Test Accuracy = %g " % accuracy)

precision_evaluator = _metric_evaluator("weightedPrecision")
recall_evaluator = _metric_evaluator("weightedRecall")
f1_evaluator = _metric_evaluator("f1")

precision = precision_evaluator.evaluate(scored)
recall = recall_evaluator.evaluate(scored)
f1 = f1_evaluator.evaluate(scored)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Preview a few texts next to their true and predicted labels.
scored.show(10, truncate=50)

# Release the cached blocks, then shut the session down.
scored.unpersist()
spark.stop()