In [None]:
# !pip install recbole
# !pip install ray
# !pip install "numpy<2.0"
# !pip install kmeans_pytorch
# !pip install "scipy<1.12"
# !pip install pandas
# !pip install matplotlib
# !pip install seaborn
# !pip install pyarrow
# !pip install tensorflow
# !pip install spark-nlp
# !pip install emoji
# !pip install contractions

In [None]:
"""
Results: SVM on video-games:

SVM Accuracy: 0.9487
SVM F1-score: 0.9396
"""

In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import broadcast
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, FloatType, ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, NGram
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LinearSVC
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sparknlp

spark = SparkSession.builder \
    .appName("Amazon_Sentiment_Analysis") \
    .master("spark://master:7077") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.memoryOverhead", "2g") \
    .config("spark.sql.shuffle.partitions", "400") \
    .config("spark.default.parallelism", "400") \
    .config("spark.network.timeout", "800s") \
    .config("spark.executor.heartbeatInterval", "100s") \
    .config("spark.memory.fraction", "0.6") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

BASE_DIR = '/home/ubuntu/data/aaucr/'
file_path = os.path.join(BASE_DIR, 'Books_5.json')

df = spark.read.json(file_path)
df.select("asin", "reviewerID", "reviewText", "overall").write.mode("overwrite").parquet("data_amazon.parquet")

# CONFIGURATION FOR 2 WORKERS x 12GB
PARQUET_PATH = "data_amazon.parquet"
NUM_PARTITIONS = 400  
NUM_FEATURES = 2**15  
MAX_ITER = 20         
REG_PARAM = 0.1

df = spark.read.parquet(PARQUET_PATH).repartition(NUM_PARTITIONS)

item_counts = df.groupBy("asin").count().filter("count >= 5").select("asin")
user_counts = df.groupBy("reviewerID").count().filter("count >= 5").select("reviewerID")

df = df.join(broadcast(item_counts), "asin") \
       .join(broadcast(user_counts), "reviewerID") \
       .select("reviewText", "overall") 

df = df.filter("overall != 3") \
       .withColumn("label", F.when(F.col("overall") >= 4, 1.0).otherwise(0.0)) \
       .withColumn("reviewText", F.lower(F.col("reviewText"))) \
       .withColumn("reviewText", F.regexp_replace("reviewText", r"http\S+|www\S+|[^a-z\s]", " ")) \
       .withColumn("reviewText", F.trim(F.regexp_replace("reviewText", r"\s+", " "))) \
       .filter(F.length("reviewText") > 10) \
       .select("reviewText", "label")

print(">> Load and filtering done.")

# PIPELINE
tokenizer = Tokenizer(inputCol="reviewText", outputCol="tokens")
remover = StopWordsRemover(inputCol="tokens", outputCol="tokens_clean")
hashingTF = HashingTF(inputCol="tokens_clean", outputCol="rawFeatures", numFeatures=NUM_FEATURES)
idf = IDF(inputCol="rawFeatures", outputCol="features")
svm = LinearSVC(maxIter=MAX_ITER, regParam=REG_PARAM, labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, svm])

# TRAIN AND SPLIT
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

print(">>> Începe antrenarea modelului...")
spark.catalog.clearCache()
model = pipeline.fit(train_df)
print(">>> S-a terminat antrenarea modelului.")

# EVALUATION
predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Acuratețe finală: {accuracy:.4f}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/14 17:04:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

>> Load and filtering done.
>>> Începe antrenarea modelului...


26/01/14 17:38:15 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

>>> S-a terminat antrenarea modelului.




Acuratețe finală: 0.9460


                                                                                

In [None]:
MODEL_PATH = "models/amazon_sentiment_svm_pipeline_books"

print(f">>> Se salvează modelul în: {MODEL_PATH}")

model.write().overwrite().save(MODEL_PATH)

print("Salvare reușită!")

>>> Se salvează modelul în: models/amazon_sentiment_svm_pipeline_books


                                                                                

Salvare reușită!


In [5]:
from pyspark.ml import PipelineModel

loaded_model = PipelineModel.load("models/amazon_sentiment_svm_pipeline_books")

test_sentences = spark.createDataFrame([
    ("I absolutely hated this book, the plot was non-existent and the characters were flat.",),
    ("What an amazing journey! I couldn't put it down until the last page.",),
    ("I hate it! It's awful",)
], ["reviewText"])

results = loaded_model.transform(test_sentences)
results.select("reviewText", "prediction").show(truncate=False)



+-------------------------------------------------------------------------------------+----------+
|reviewText                                                                           |prediction|
+-------------------------------------------------------------------------------------+----------+
|I absolutely hated this book, the plot was non-existent and the characters were flat.|1.0       |
|What an amazing journey! I couldn't put it down until the last page.                 |1.0       |
|I hate it!                                                                           |1.0       |
+-------------------------------------------------------------------------------------+----------+



                                                                                

In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import broadcast
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, FloatType, ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, NGram
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LinearSVC
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sparknlp

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Amazon_Sentiment_Analysis") \
    .master("spark://master:7077") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.memoryOverhead", "2g") \
    .config("spark.sql.shuffle.partitions", "400") \
    .config("spark.default.parallelism", "400") \
    .config("spark.network.timeout", "800s") \
    .config("spark.executor.heartbeatInterval", "100s") \
    .config("spark.memory.fraction", "0.6") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

# BASE_DIR = '/home/ubuntu/data/aaucr/'
# file_path = os.path.join(BASE_DIR, 'Books_5.json')

#df = spark.read.json(file_path)
#df.select("asin", "reviewerID", "reviewText", "overall").write.mode("overwrite").parquet("data_amazon.parquet")

# CONFIGURATION
PARQUET_PATH = "reviews_final_parquet"
NUM_PARTITIONS = 400  
NUM_FEATURES = 2**16  
MAX_ITER = 20         
REG_PARAM = 0.1

df = spark.read.parquet(PARQUET_PATH) #.repartition(NUM_PARTITIONS)

# PREPROCESSING
df = df.filter("overall != 3") \
       .withColumn("label", F.when(F.col("overall") >= 4, 1.0).otherwise(0.0)) \
       .select("lemmatized_tokens", "label")

print(">> Load and filtering done.")

hashingTF = HashingTF(inputCol="lemmatized_tokens", outputCol="rawFeatures", numFeatures=NUM_FEATURES)
idf = IDF(inputCol="rawFeatures", outputCol="features")
svm = LinearSVC(maxIter=MAX_ITER, regParam=REG_PARAM, labelCol="label", featuresCol="features")

pipeline = Pipeline(stages=[hashingTF, idf, svm])

train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

num_negatives = train_df.filter(F.col("label") == 0.0).count()
num_positives = train_df.filter(F.col("label") == 1.0).count()

print(f"Pozitive: {num_positives}, Negative: {num_negatives}")

# DOWNSAMPLING
fraction = num_negatives / num_positives
pos_df = train_df.filter(F.col("label") == 1.0).sample(withReplacement=False, fraction=fraction, seed=42)
neg_df = train_df.filter(F.col("label") == 0.0)

# UNION
balanced_train_df = pos_df.union(neg_df).repartition(NUM_PARTITIONS)

print(f"Noua dimensiune antrenare: {balanced_train_df.count()}")

print(">>> Începe antrenarea modelului...")
spark.catalog.clearCache()
model = pipeline.fit(balanced_train_df)
print(">>> S-a terminat antrenarea modelului.")

# EVALUATION
predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Acuratețe finală: {accuracy:.4f}")

MODEL_PATH = "models/amazon_sentiment_svm_big"

print(f">>> Se salvează modelul în: {MODEL_PATH}")

# SAVE
model.write().overwrite().save(MODEL_PATH)

print("Salvare reușită!")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/15 20:39:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/15 20:39:49 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
                                                                                

>> Load and filtering done.


                                                                                

Pozitive: 50048793, Negative: 5063240


                                                                                

Noua dimensiune antrenare: 10126555
>>> Începe antrenarea modelului...


26/01/15 21:14:38 WARN DAGScheduler: Broadcasting large task binary with size 1114.8 KiB
26/01/15 21:16:43 WARN DAGScheduler: Broadcasting large task binary with size 1115.9 KiB
26/01/15 21:16:44 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
26/01/15 21:16:44 WARN DAGScheduler: Broadcasting large task binary with size 1115.3 KiB
26/01/15 21:18:43 WARN DAGScheduler: Broadcasting large task binary with size 1116.5 KiB
26/01/15 21:18:43 WARN DAGScheduler: Broadcasting large task binary with size 1115.3 KiB
26/01/15 21:18:45 WARN DAGScheduler: Broadcasting large task binary with size 1116.5 KiB
26/01/15 21:18:46 WARN DAGScheduler: Broadcasting large task binary with size 1115.3 KiB
26/01/15 21:18:47 WARN DAGScheduler: Broadcasting large task binary with size 1116.5 KiB
26/01/15 21:18:47 WARN DAGScheduler: Broadcasting large task binary with size 1115.3 KiB
26/01/15 21:18:49 WARN DAGScheduler: Broadcasting large task binary with size 1116.5 KiB
26/

>>> S-a terminat antrenarea modelului.


26/01/15 21:32:41 WARN DAGScheduler: Broadcasting large task binary with size 1604.7 KiB
                                                                                

Acuratețe finală: 0.9058
>>> Se salvează modelul în: models/amazon_sentiment_svm_big


26/01/15 21:37:13 WARN TaskSetManager: Stage 152 contains a task of very large size (1052 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Salvare reușită!


In [None]:
from pyspark.ml import PipelineModel

loaded_model = PipelineModel.load(MODEL_PATH)

test_sentences = spark.createDataFrame([
    ("I absolutely hated this book, the plot was non-existent and the characters were flat.",),
    ("What an amazing journey! I couldn't put it down until the last page.",),
    ("I hate it! It's awful",)
], ["reviewText"])

results = loaded_model.transform(test_sentences)
results.select("reviewText", "prediction").show(truncate=False)

In [7]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, LemmatizerModel
from pyspark.ml import Pipeline as SparkNLP_Pipeline

loaded_model = PipelineModel.load(MODEL_PATH)

from pyspark.sql.types import StructType, StructField, StringType, ArrayType

schema = StructType([
    StructField("originalText", StringType(), True),
    StructField("lemmatized_tokens", ArrayType(StringType()), True)
])

data = [
    ("I absolutely hated this book...", ["absolutely", "hate", "book", "plot", "be", "non", "existent", "character", "be", "flat"]),
    ("What an amazing journey!...", ["amazing", "journey", "not", "can", "put", "down", "last", "page"]),
    ("I hate it! It's awful", ["hate", "awful"])
]

test_df = spark.createDataFrame(data, schema)

results = loaded_model.transform(test_df)

results.select("lemmatized_tokens", "prediction").show(truncate=False)

26/01/15 21:58:04 WARN DAGScheduler: Broadcasting large task binary with size 1581.9 KiB
26/01/15 21:58:04 WARN DAGScheduler: Broadcasting large task binary with size 1581.9 KiB
26/01/15 21:58:05 WARN DAGScheduler: Broadcasting large task binary with size 1581.9 KiB
26/01/15 21:58:05 WARN DAGScheduler: Broadcasting large task binary with size 1581.9 KiB
26/01/15 21:58:07 WARN DAGScheduler: Broadcasting large task binary with size 1581.9 KiB
                                                                                

+----------------------------------------------------------------------+----------+
|lemmatized_tokens                                                     |prediction|
+----------------------------------------------------------------------+----------+
|[absolutely, hate, book, plot, be, non, existent, character, be, flat]|0.0       |
|[amazing, journey, not, can, put, down, last, page]                   |1.0       |
|[hate, awful]                                                         |0.0       |
+----------------------------------------------------------------------+----------+



In [None]:
"""
Results on Books dataset (20+ GB) for the previous version (The "Yes Man Problem"):

+-------------------------------------------------------------------------------------+----------+
|reviewText                                                                           |prediction|
+-------------------------------------------------------------------------------------+----------+
|I absolutely hated this book, the plot was non-existent and the characters were flat.|1.0       |
|What an amazing journey! I couldn't put it down until the last page.                 |1.0       |
|I hate it! It's awful                                                                |0.0       |
+-------------------------------------------------------------------------------------+----------+
"""