In [1]:
# Colab-only: mount Google Drive into the runtime so that files under
# /content/drive (e.g. the CSV read further down) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyspark



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, when, isnan, isnull
from pyspark.sql.types import IntegerType, StringType, FloatType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import Word2Vec
from pyspark.ml.classification import RandomForestClassifier

# Start (or reuse) the Spark session.
# The driver heap is raised above the default because the Word2Vec fit in this
# notebook previously killed the JVM gateway (ConnectionRefusedError), which
# presumably means the driver ran out of memory. Tune "8g" to the RAM of the
# runtime. NOTE: this only takes effect when no Spark JVM is running yet --
# restart the runtime if a session already exists.
spark = (
    SparkSession.builder
    .appName("SentimentAnalysisBaseline")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Load the raw CSV; multiLine/escape are needed because text fields may contain
# embedded newlines and quoted quotes.
df = spark.read.csv("/content/drive/MyDrive/ggg_sg.csv", header=True, inferSchema=True, multiLine=True, escape='"')

# Drop rows without text.
df = df.filter(df.ContextualText.isNotNull())

# Cast the tone score first and filter afterwards: a value that cannot be
# parsed as a float becomes null only *after* the cast, so filtering before
# the cast would let such rows through to the labelling UDF.
df = df.withColumn("DocTone", df["DocTone"].cast(FloatType()))
df = df.filter(col("DocTone").isNotNull() & ~isnan(col("DocTone")))
# Create sentiment label: Positive (2), Neutral (1), Negative (0)
def sentiment_label(score, pos_threshold=1.9910, neg_threshold=-2.0202):
    """Map a tone score to a 3-class sentiment label.

    Args:
        score: Numeric tone score, or None / NaN when the score is missing.
        pos_threshold: Scores strictly above this value are positive.
        neg_threshold: Scores strictly below this value are negative.

    Returns:
        2 for positive, 0 for negative, 1 for neutral (thresholds inclusive),
        or None when the score is missing, so the row surfaces as a null label
        instead of crashing the UDF or being silently counted as neutral.
    """
    # `score != score` is True only for NaN; checked without importing math so
    # the function stays self-contained when shipped to Spark workers.
    if score is None or score != score:
        return None
    if score > pos_threshold:
        return 2
    if score < neg_threshold:
        return 0
    return 1

# Wrap the labelling function as a Spark UDF and attach the integer label.
sentiment_udf = udf(sentiment_label, IntegerType())
df = df.withColumn("label", sentiment_udf(col("DocTone")))

# Text preprocessing: split on whitespace, then drop stop words.
tokenizer = Tokenizer(inputCol="ContextualText", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# Feature extraction (Word2Vec): each document becomes the average of its word
# vectors. The seed is fixed explicitly so the embeddings are reproducible
# from one session to the next.
word2Vec = Word2Vec(vectorSize=100, minCount=5, seed=42, inputCol="filtered_words", outputCol="features")

pipeline = Pipeline(stages=[tokenizer, remover, word2Vec])

# Split before fitting so Word2Vec is trained on the training rows only.
trainingData, testData = df.randomSplit([0.8, 0.2], seed=42)

# Fit the feature pipeline on the training split and apply it to both splits.
pipelineModel = pipeline.fit(trainingData)
trainingData = pipelineModel.transform(trainingData)
testData = pipelineModel.transform(testData)

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 49342)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-b98a781f5ff3>", line 39, in <cell line: 39>
    pipelineModel = pipeline.fit(trainingData)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/ml/base.py", line 205, in fit
    return self._fit(dataset)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/ml/pipeline.py", line 134, in _fit
    model = stage.fit(dataset)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/ml/base.py", line 205, in fit
    return self._fit(dataset)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/ml/wrapper.py", line 381, in _fit
    java_model = self._fit_java(dataset)
  File "/usr/local/lib/python3.10/dist-packag

ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Random forest classifier over the Word2Vec document vectors.
rf = RandomForestClassifier(featuresCol='features', labelCol='label', seed=42)

# Hyperparameter grid: 3 x 3 x 2 = 18 candidate settings.
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [20, 50, 100]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .addGrid(rf.maxBins, [32, 64]) \
    .build()

# Model selection metric.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# 5-fold cross-validation over the grid (18 settings x 5 folds = 90 fits, plus
# one final refit of the best setting). The seed fixes the fold assignment so
# the selected model is reproducible between runs.
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5,
                          seed=42)

# Train: this is the expensive step of the notebook.
cvModel = crossval.fit(trainingData)

# Score the held-out test set with the best model found by cross-validation.
# Cached because the predictions are scanned once per metric below and once
# more by show(); without it every action re-runs the whole transform lineage.
predictions = cvModel.transform(testData).cache()

# Report the winning hyperparameters.
bestModel = cvModel.bestModel
print("Best Model's Parameters:")

# NOTE: on a fitted tree-ensemble model `getNumTrees` is a property, not a
# method, so it is intentionally accessed without parentheses.
print(f"Number of Trees: {bestModel.getNumTrees}")
print(f"Max Depth: {bestModel.getMaxDepth()}")
print(f"Max Bins: {bestModel.getMaxBins()}")

# Headline metric (uses the accuracy evaluator defined for model selection).
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = %g" % accuracy)


def _metric_evaluator(metric_name):
    """Return an evaluator for `metric_name` on the label/prediction columns."""
    return MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName=metric_name
    )


# Detailed metrics; in Spark "f1" is the class-weighted F1 score.
evaluator_precision = _metric_evaluator("weightedPrecision")
precision = evaluator_precision.evaluate(predictions)

evaluator_recall = _metric_evaluator("weightedRecall")
recall = evaluator_recall.evaluate(predictions)

evaluator_f1 = _metric_evaluator("f1")
f1 = evaluator_f1.evaluate(predictions)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Show a sample of predictions next to the true labels.
predictions.select("ContextualText", "label", "prediction").show(10, truncate=50)