In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [None]:
# 1. Spark session: getOrCreate() returns the active session if there is one,
#    otherwise it starts a new one under this application name.
spark = (
    SparkSession.builder
    .appName("ReviewScoreClassification_NB")
    .getOrCreate()
)


def _read_reviews(csv_path):
    """Read a headered CSV with inferred column types and add 'combined_text'.

    'combined_text' is the 'Summary' column followed by the 'Text' column,
    joined by a single space.
    """
    # NOTE(review): default CSV parsing is used (no multiLine / escape options);
    # confirm the review text contains no embedded newlines or "" escapes.
    reviews = spark.read.csv(csv_path, header=True, inferSchema=True)
    return reviews.withColumn("combined_text", concat_ws(" ", "Summary", "Text"))


# 2-3. Load both splits and build the combined text column.
# NOTE(review): the directory is spelled "date/"; confirm it is not meant to be "data/".
train_df = _read_reviews("date/train.csv")
test_df = _read_reviews("date/test.csv")

# 4. NLP preprocessing: split into words, drop stop words, hash to term counts.
tokenizer = Tokenizer(
    inputCol="combined_text",
    outputCol="words",
)
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)
hashingTF = HashingTF(
    inputCol="filtered",
    outputCol="rawFeatures",
    numFeatures=10000,
)

# IDF is deliberately left out of the pipeline: the classifier below consumes
# the hashed term counts in 'rawFeatures' directly rather than TF-IDF weights.

# 5. Label encoding: map 'Score' to a numeric 'label' column.
#    handleInvalid="skip" drops rows whose Score the fitted indexer cannot map.
label_indexer = StringIndexer(
    inputCol="Score",
    outputCol="label",
    handleInvalid="skip",
)

# 6. Classifier: multinomial Naive Bayes over the raw term counts.
nb = NaiveBayes(
    labelCol="label",
    featuresCol="rawFeatures",
    modelType="multinomial",
    smoothing=0.9,
)

# 7. Pipeline: stages run in list order, so each inputCol already exists
#    by the time its stage is reached.
pipeline_stages = [tokenizer, remover, hashingTF, label_indexer, nb]
pipeline = Pipeline(stages=pipeline_stages)

# 8. Train model on the training split.
model = pipeline.fit(train_df)

# 9. Predictions: apply the fitted pipeline to the test split.
predictions = model.transform(test_df)

# 10. Evaluation
# Spark DataFrames are lazy, so every evaluate() call is an action that would
# re-run the entire plan behind `predictions`. Keep only the two columns these
# metrics read and cache them, so the upstream work is done once rather than
# four times. `predictions` itself is left untouched.
scored_labels = predictions.select("label", "prediction").cache()

# All four evaluators compare the same pair of columns; only the metric differs.
evaluator_columns = {"labelCol": "label", "predictionCol": "prediction"}
evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy", **evaluator_columns)
evaluator_precision = MulticlassClassificationEvaluator(metricName="weightedPrecision", **evaluator_columns)
evaluator_recall = MulticlassClassificationEvaluator(metricName="weightedRecall", **evaluator_columns)
evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1", **evaluator_columns)

try:
    # The first call materialises the cache; the remaining three reuse it.
    accuracy = evaluator_accuracy.evaluate(scored_labels)
    precision = evaluator_precision.evaluate(scored_labels)
    # Weighted recall (per-class recall weighted by class frequency) is
    # mathematically equal to accuracy, so these two values always match.
    recall = evaluator_recall.evaluate(scored_labels)
    f1 = evaluator_f1.evaluate(scored_labels)
finally:
    # Release the cached columns even if an evaluation fails.
    scored_labels.unpersist()

print("=== Evaluation Metrics (Naive Bayes) ===")
print(f"Accuracy       : {accuracy:.4f}")
print(f"Precision      : {precision:.4f}")
print(f"Recall         : {recall:.4f}")
print(f"F1 Score       : {f1:.4f}")

=== Evaluation Metrics (Naive Bayes) ===
Accuracy       : 0.6908
Precision      : 0.6847
Recall         : 0.6908
F1 Score       : 0.6872
