In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, NGram, SQLTransformer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.storagelevel import StorageLevel

# Create (or reuse, via getOrCreate) the SparkSession shared by every cell below.
# All values are passed as strings, exactly as Spark's configuration API expects.
spark = (
    SparkSession.builder
    .appName("Amazon_Reviews_Recommender_Optimized")
    # --- driver sizing ---
    .config("spark.driver.memory", "8g")
    .config("spark.driver.maxResultSize", "4g")
    # --- executor sizing (heap + overhead + off-heap region) ---
    .config("spark.executor.memory", "8g")
    .config("spark.executor.memoryOverhead", "2g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "2g")
    # --- parallelism: partitions for SQL shuffles and for RDD operations ---
    .config("spark.sql.shuffle.partitions", "2000")
    .config("spark.default.parallelism", "2000")
    # --- unified memory manager share of the heap ---
    .config("spark.memory.fraction", "0.6")
    # --- Arrow-based transfer between the JVM and Python ---
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    # --- Kryo serialization with an enlarged maximum buffer ---
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    .getOrCreate()
)

In [None]:
# Sentiment classifier: unigram + bigram TF-IDF features fed into a multinomial
# logistic regression. The cell loads the reviews, derives a 3-class label from
# the `overall` rating, fits the pipeline, reports test metrics and saves the model.
PARQUET_PATH = "reviews_final_parquet"
df = spark.read.parquet(PARQUET_PATH)

# Bucket the rating into three classes:
#   0.0 -> overall in [1, 2.5]
#   1.0 -> overall in (2.5, 3.5]
#   2.0 -> overall > 3.5
# The top bucket is an explicit condition rather than `.otherwise(2.0)`: a
# catch-all would silently label rows with a NULL rating (or a rating below 1)
# as class 2.0. Rows matching no bucket get a NULL label and are dropped.
df_spark = df.withColumn(
    "label",
    F.when((F.col("overall") >= 1) & (F.col("overall") <= 2.5), 0.0)
     .when((F.col("overall") > 2.5) & (F.col("overall") <= 3.5), 1.0)
     .when(F.col("overall") > 3.5, 2.0)
).dropna(subset=["label"])

# Bigrams built from the `lemmatized_tokens` column.
ngram = NGram(n=2, inputCol="lemmatized_tokens", outputCol="bigrams")

# Concatenate the unigram and bigram arrays into one token array (`all_tokens`).
combiner = SQLTransformer(
    statement="SELECT *, concat(lemmatized_tokens, bigrams) AS all_tokens FROM __THIS__"
)

# Hash tokens into a fixed 2**16-dimensional term-frequency vector, then
# re-weight it by inverse document frequency.
hashingTF = HashingTF(inputCol="all_tokens", outputCol="rawFeatures", numFeatures=2**16)

idf = IDF(inputCol="rawFeatures", outputCol="features")

# Multiclass Logistic Regression
# elasticNetParam=1.0 selects a pure L1 penalty in Spark's elastic-net mixing.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=10,
    regParam=0.01,
    elasticNetParam=1.0,
    family="multinomial"
)

pipeline = Pipeline(stages=[ngram, combiner, hashingTF, idf, lr])

# 90/10 train/test split; the fixed seed is passed so the split is repeatable.
train_df, test_df = df_spark.randomSplit([0.9, 0.1], seed=42)

model = pipeline.fit(train_df)

predictions = model.transform(test_df)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Each evaluate() call is a separate Spark action. Without caching, the whole
# feature pipeline would be re-run on the test set once per metric. Persist only
# the two columns the evaluator reads, and release them when done.
scored = predictions.select("label", "prediction").persist(StorageLevel.MEMORY_AND_DISK)
try:
    accuracy = evaluator.evaluate(scored, {evaluator.metricName: "accuracy"})
    f1 = evaluator.evaluate(scored, {evaluator.metricName: "f1"})
finally:
    scored.unpersist()

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1-score: {f1:.4f}")

model_path = "logreg_sentiment_pipeline_v2"
print(f"--- Saving PipelineModel to {model_path} ---")

# overwrite(): a plain save() raises if `model_path` already exists, which
# would make re-running this notebook fail here, after the training has run.
model.write().overwrite().save(model_path)
print("Success: Model saved.")

In [None]:
"""
Test Accuracy: 0.8377
Test F1-score: 0.7740
"""