In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import broadcast
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, FloatType, ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, NGram
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LinearSVC
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sparknlp

# Create (or reuse) the notebook's SparkSession.
# The builder chain is wrapped in parentheses so each setting sits on its own
# line without backslash continuations.
spark = (
    SparkSession.builder
    .appName("Amazon_Sentiment_Analysis")
    .master("spark://master:7077")
    # Memory sizing for the driver and the executors.
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.memoryOverhead", "2g")
    # Partition counts used for shuffles and for default parallelism.
    .config("spark.sql.shuffle.partitions", "400")
    .config("spark.default.parallelism", "400")
    # Timeouts: the heartbeat interval is kept well below the network timeout.
    .config("spark.network.timeout", "800s")
    .config("spark.executor.heartbeatInterval", "100s")
    .config("spark.memory.fraction", "0.6")
    # Arrow-backed conversion between Spark and pandas DataFrames.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

In [None]:
# CONFIGURATION
PARQUET_PATH = "reviews_final_parquet"
NUM_PARTITIONS = 400   # partition count of the balanced training set
NUM_FEATURES = 2**16   # size of the HashingTF feature vector
MAX_ITER = 20          # LinearSVC maxIter
REG_PARAM = 0.1        # LinearSVC regParam
MODEL_PATH = "models/amazon_sentiment_svm_big"  # where the fitted pipeline is written

df = spark.read.parquet(PARQUET_PATH)

# PREPROCESSING
# Reviews rated exactly 3 are dropped; a rating >= 4 becomes label 1.0 and
# everything else becomes 0.0. Only the `lemmatized_tokens` column read from
# the parquet and the derived label are kept, so no raw-text cleaning is done
# in this cell.
df = (
    df.filter("overall != 3")
    .withColumn("label", F.when(F.col("overall") >= 4, 1.0).otherwise(0.0))
    .select("lemmatized_tokens", "label")
)

print(">> Load and filtering done.")

# PIPELINE: hashed term frequencies -> IDF weighting -> linear SVM
hashingTF = HashingTF(inputCol="lemmatized_tokens", outputCol="rawFeatures", numFeatures=NUM_FEATURES)
idf = IDF(inputCol="rawFeatures", outputCol="features")
svm = LinearSVC(maxIter=MAX_ITER, regParam=REG_PARAM, labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[hashingTF, idf, svm])

train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Count both classes with a single aggregation instead of two separate
# filter+count jobs over the training split.
label_counts = {row["label"]: row["count"] for row in train_df.groupBy("label").count().collect()}
num_negatives = label_counts.get(0.0, 0)
num_positives = label_counts.get(1.0, 0)

print(f"Pozitive: {num_positives}, Negative: {num_negatives}")

# DOWNSAMPLING
# A binary classifier needs examples of both classes; fail early with a clear
# message instead of a ZeroDivisionError (no positives) or an empty/one-class
# training set (no negatives).
if num_positives == 0 or num_negatives == 0:
    raise ValueError(
        f"Training split must contain both classes "
        f"(positives={num_positives}, negatives={num_negatives})."
    )

# Positives are sampled down to roughly the number of negatives. The fraction
# is clamped to 1.0 because sampling without replacement cannot take more than
# the whole class; in that case all positives are kept.
fraction = min(1.0, num_negatives / num_positives)
pos_df = train_df.filter(F.col("label") == 1.0).sample(withReplacement=False, fraction=fraction, seed=42)
neg_df = train_df.filter(F.col("label") == 0.0)

# UNION
balanced_train_df = pos_df.union(neg_df).repartition(NUM_PARTITIONS)

print(f"Noua dimensiune antrenare: {balanced_train_df.count()}")

print(">>> Începe antrenarea modelului...")
spark.catalog.clearCache()
model = pipeline.fit(balanced_train_df)
print(">>> S-a terminat antrenarea modelului.")

# EVALUATION
# NOTE(review): only the training split is balanced; `test_df` keeps the
# original class ratio, so plain accuracy should be read against the
# majority-class share of the test set.
predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Acuratețe finală: {accuracy:.4f}")

print(f">>> Se salvează modelul în: {MODEL_PATH}")

# SAVE
# overwrite() replaces any model already stored at MODEL_PATH.
model.write().overwrite().save(MODEL_PATH)

print("Salvare reușită!")

In [None]:
"""
Pozitive: 50048793, Negative: 5063240
                                                                                
Noua dimensiune antrenare: 10126555

Acuratețe finală: 0.9058
"""

In [None]:
# Reload the saved pipeline and run it on a few hand-written examples.
#
# PipelineModel is imported here because nothing else in the notebook brings
# it into scope; without it the load call below raises NameError.
from pyspark.ml import PipelineModel
from pyspark.ml import Pipeline as SparkNLP_Pipeline
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from sparknlp.base import DocumentAssembler
# NOTE: this rebinds `Tokenizer` from the pyspark.ml.feature class imported at
# the top of the notebook to the Spark NLP annotator of the same name.
from sparknlp.annotator import Tokenizer, LemmatizerModel

# MODEL_PATH is defined in the training cell above.
loaded_model = PipelineModel.load(MODEL_PATH)

# The saved pipeline starts at HashingTF, so it expects an already tokenized
# and lemmatized `lemmatized_tokens` column; `originalText` is carried along
# only for readability.
schema = StructType([
    StructField("originalText", StringType(), True),
    StructField("lemmatized_tokens", ArrayType(StringType()), True)
])

data = [
    ("I absolutely hated this book...", ["absolutely", "hate", "book", "plot", "be", "non", "existent", "character", "be", "flat"]),
    ("What an amazing journey!...", ["amazing", "journey", "not", "can", "put", "down", "last", "page"]),
    ("I hate it! It's awful", ["hate", "awful"])
]

# NOTE(review): this rebinds `test_df`, replacing the evaluation split created
# in the training cell with these three rows — re-running the evaluation after
# this cell would score the model on the samples below.
test_df = spark.createDataFrame(data, schema)

results = loaded_model.transform(test_df)

results.select("lemmatized_tokens", "prediction").show(truncate=False)

In [None]:
"""
+----------------------------------------------------------------------+----------+
|lemmatized_tokens                                                     |prediction|
+----------------------------------------------------------------------+----------+
|[absolutely, hate, book, plot, be, non, existent, character, be, flat]|0.0       |
|[amazing, journey, not, can, put, down, last, page]                   |1.0       |
|[hate, awful]                                                         |0.0       |
+----------------------------------------------------------------------+----------+
"""

In [None]:
"""
Results on the Books dataset (20+ GB) for the previous model version (the "Yes Man Problem": the clearly negative first review is still predicted as positive):

+-------------------------------------------------------------------------------------+----------+
|reviewText                                                                           |prediction|
+-------------------------------------------------------------------------------------+----------+
|I absolutely hated this book, the plot was non-existent and the characters were flat.|1.0       |
|What an amazing journey! I couldn't put it down until the last page.                 |1.0       |
|I hate it! It's awful                                                                |0.0       |
+-------------------------------------------------------------------------------------+----------+
"""