In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import concat_ws
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# 1. Initialize the Spark session
# getOrCreate() hands back the already-running session when one exists,
# otherwise it starts a new one under the application name given here.
spark = (
    SparkSession.builder
    .appName("ReviewScoreClassification")
    .getOrCreate()
)

In [3]:
# 2. Read the CSV data
# Both files are loaded with the same reader settings: the first row supplies
# the column names and the column types are inferred from the data.
# NOTE(review): if the review text contains embedded quotes or line breaks,
# the reader may also need quote/escape/multiLine options — confirm against
# the actual files.
train_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("date/train.csv")
)
test_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("date/test.csv")
)

In [4]:
# 3. Merge 'Summary' and 'Text' into a single column
# The new "combined_text" column holds the summary followed by the review
# text, joined with a single space. The same step is applied to both splits.
train_df = train_df.withColumn(
    "combined_text",
    concat_ws(" ", col("Summary"), col("Text")),
)
test_df = test_df.withColumn(
    "combined_text",
    concat_ws(" ", col("Summary"), col("Text")),
)

In [5]:
# 4. NLP preprocessing
# Each stage reads the column written by the stage before it:
#   combined_text -> words -> filtered -> rawFeatures -> features
tokenizer = Tokenizer(
    inputCol="combined_text",
    outputCol="words",
)
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)
# Term frequencies are hashed into a fixed-size vector of 10000 entries.
hashingTF = HashingTF(
    numFeatures=10000,
    inputCol="filtered",
    outputCol="rawFeatures",
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

In [6]:
# 5. Label preparation and model
# The "Score" column is indexed into the numeric "label" column; the indexer
# is configured with handleInvalid="skip".
label_indexer = StringIndexer(
    inputCol="Score",
    outputCol="label",
    handleInvalid="skip",
)
# No feature/label column names are passed, so the estimator's defaults apply.
# NOTE(review): presumably those defaults are "features" and "label", matching
# the columns produced by the earlier stages — confirm.
lr = LogisticRegression(
    maxIter=25,
    regParam=0.001,
    elasticNetParam=0.7,
)

In [7]:
# 6. Full pipeline
# Stages are listed in execution order: text feature extraction first,
# then label indexing, then the classifier.
pipeline = Pipeline(
    stages=[
        tokenizer,
        remover,
        hashingTF,
        idf,
        label_indexer,
        lr,
    ]
)

In [8]:
# 7. Train the model
# Fits the whole pipeline on the training DataFrame.
model = pipeline.fit(dataset=train_df)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\User\AppData\Local\Programs\Python\Python312\Lib\socket.py", line 720, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# 8. Predictions on the test set
# Applies the fitted pipeline to the test DataFrame.
predictions = model.transform(dataset=test_df)

In [None]:
# 9. Evaluation
# One evaluator per metric; each compares the "prediction" column with "label".
# The values printed as "Precision" and "Recall" are the weighted variants
# (weightedPrecision / weightedRecall).
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

# Each evaluate() call runs its own Spark job. Keep only the two columns the
# evaluators are configured to read and cache them, so the pipeline is applied
# to the test set once instead of once per metric.
label_and_prediction = predictions.select("label", "prediction").cache()
try:
    accuracy = evaluator_accuracy.evaluate(label_and_prediction)
    precision = evaluator_precision.evaluate(label_and_prediction)
    recall = evaluator_recall.evaluate(label_and_prediction)
    f1 = evaluator_f1.evaluate(label_and_prediction)
finally:
    # Release the cached data whether or not evaluation succeeded.
    label_and_prediction.unpersist()

print("=== Evaluation Metrics ===")
print(f"Accuracy       : {accuracy:.4f}")
print(f"Precision      : {precision:.4f}")
print(f"Recall         : {recall:.4f}")
print(f"F1 Score       : {f1:.4f}")

=== Evaluation Metrics ===
Accuracy       : 0.7377
Precision      : 0.6957
Recall         : 0.7377
F1 Score       : 0.6964


In [None]:
# Shut down the Spark session now that the notebook no longer needs it.
spark.stop()