In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.storagelevel import StorageLevel

# --- Spark session ----------------------------------------------------------
# All tuning knobs live in one mapping and are applied to the builder in a
# loop; the resulting session configuration is the same key/value set as a
# hand-written .config(...) chain.
spark_settings = {
    "spark.driver.memory": "8g",
    "spark.driver.maxResultSize": "4g",
    "spark.executor.memory": "8g",
    "spark.executor.memoryOverhead": "2g",
    "spark.memory.offHeap.enabled": "true",
    "spark.memory.offHeap.size": "2g",
    "spark.sql.shuffle.partitions": "2000",
    "spark.default.parallelism": "2000",
    "spark.memory.fraction": "0.6",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryoserializer.buffer.max": "1024m",
}
session_builder = SparkSession.builder.appName("Amazon_Reviews_Recommender_Optimized")
for conf_key, conf_value in spark_settings.items():
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

# --- Data and label ---------------------------------------------------------
PARQUET_PATH = "reviews_final_parquet"
df = spark.read.parquet(PARQUET_PATH)

# Three-way label derived from the `overall` column:
#   1 <= overall <= 2.5   -> 0.0
#   2.5 < overall <= 3.5  -> 1.0
#   every other row       -> 2.0
# NOTE(review): rows with a missing `overall` would presumably also fall
# through to 2.0 -- confirm the column has no nulls.
rating = F.col("overall")
label_expr = (
    F.when(rating.between(1, 2.5), 0.0)
    .when((rating > 2.5) & (rating <= 3.5), 1.0)
    .otherwise(2.0)
)
df_spark = df.withColumn("label", label_expr)

# --- Pipeline: hashed term frequencies -> IDF -> multinomial logistic reg. ---
hashingTF = HashingTF(
    numFeatures=1000,
    inputCol="lemmatized_tokens",
    outputCol="rawFeatures",
)
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    family="multinomial",
    maxIter=10,
    regParam=0.01,
    elasticNetParam=1.0,
)
pipeline = Pipeline(stages=[hashingTF, idf, lr])

# --- Train / evaluate -------------------------------------------------------
# 90/10 split with a fixed seed.
train_df, test_df = df_spark.randomSplit([0.9, 0.1], seed=42)
model = pipeline.fit(train_df)
predictions = model.transform(test_df)

# One evaluator, with the metric overridden per call through a param map.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy, f1 = (
    evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("accuracy", "f1")
)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1-score: {f1:.4f}")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/16 02:30:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/16 02:58:24 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS

Test Accuracy: 0.8322
Test F1-score: 0.7607


                                                                                

In [None]:
from pyspark.ml import PipelineModel

# Persist the fitted pipeline, reload it from disk, and check that the
# reloaded copy scores the same on the held-out split as the in-memory model.
# Reads `model`, `test_df` and `accuracy` produced by the training cell.
model_path = "logreg_sentiment_pipeline_v1"
print(f"--- Saving PipelineModel to {model_path} ---")

# overwrite() keeps the cell re-runnable when the directory already exists.
model.write().overwrite().save(model_path)
print("Success: Model saved.")

print(f"--- Loading PipelineModel from {model_path} ---")
loaded_model = PipelineModel.load(model_path)
print("Success: Model loaded.")

print("--- Verifying Loaded Model on Test Data ---")

predictions_loaded = loaded_model.transform(test_df)

evaluator_verify = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
acc_verify = evaluator_verify.evaluate(predictions_loaded, {evaluator_verify.metricName: "accuracy"})

print(f"Original Accuracy: {accuracy:.4f}")
print(f"Loaded Model Accuracy: {acc_verify:.4f}")

# Both metrics are floats, so compare within a small absolute tolerance
# rather than with `==`.
ACCURACY_TOLERANCE = 1e-9
if abs(accuracy - acc_verify) <= ACCURACY_TOLERANCE:
    print("Verification Passed: Loaded model behaves exactly like the original.")
else:
    print("Verification Warning: Metrics differ slightly (this is rare).")

# Peek at a few scored rows, with full probability vectors.
predictions_loaded.select("label", "prediction", "probability").show(5, truncate=False)

--- Saving PipelineModel to logreg_sentiment_pipeline_v1 ---


                                                                                

Success: Model saved.
--- Loading PipelineModel from logreg_sentiment_pipeline_v1 ---


                                                                                

Success: Model loaded.
--- Verifying Loaded Model on Test Data ---


                                                                                

Original Accuracy: 0.8322
Loaded Model Accuracy: 0.8322
Verification Passed: Loaded model behaves exactly like the original.




+-----+----------+-------------------------------------------------------------+
|label|prediction|probability                                                  |
+-----+----------+-------------------------------------------------------------+
|2.0  |2.0       |[0.046960655081238194,0.05170117521505912,0.9013381697037026]|
|2.0  |2.0       |[0.08299363148734924,0.07983364357356251,0.8371727249390882] |
|2.0  |2.0       |[0.045329722850059324,0.07387914078544117,0.8807911363644996]|
|2.0  |2.0       |[0.032056267809961465,0.03575619654839257,0.9321875356416459]|
|2.0  |2.0       |[0.10927388228380479,0.09844390638973831,0.7922822113264569] |
+-----+----------+-------------------------------------------------------------+
only showing top 5 rows



                                                                                

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Load the saved v1 pipeline and score a few hand-written, pre-lemmatized
# reviews as a smoke test.

# PipelineModel is not part of this cell's import block; import it here so the
# cell does not depend on an earlier cell having been executed first.
from pyspark.ml import PipelineModel

spark = (
    SparkSession.builder
    .appName("Amazon_Reviews_Recommender_Optimized")
    .config("spark.driver.memory", "8g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.memoryOverhead", "2g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "2g")
    .config("spark.sql.shuffle.partitions", "2000")
    .config("spark.default.parallelism", "2000")
    .config("spark.memory.fraction", "0.6")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    .getOrCreate()
)

model_path = "logreg_sentiment_pipeline_v1"

loaded_model = PipelineModel.load(model_path)

# The sample frame carries the raw text for reference and the token array in
# the `lemmatized_tokens` column.
schema = StructType([
    StructField("originalText", StringType(), True),
    StructField("lemmatized_tokens", ArrayType(StringType()), True)
])

data = [
    ("I absolutely hated this book...", ["absolutely", "hate", "book", "plot", "be", "non", "existent", "character", "be", "flat"]),
    ("What an amazing journey!...", ["amazing", "journey", "not", "can", "put", "down", "last", "page"]),
    ("I hate it! It's awful", ["hate", "awful"])
]

# Deliberately NOT named `test_df`: that name holds the held-out split used by
# the evaluation cells, and rebinding it here would break them on re-run.
sample_reviews_df = spark.createDataFrame(data, schema)

results = loaded_model.transform(sample_reviews_df)

# NOTE(review): the recorded run predicted class 2.0 for all three samples,
# including the clearly negative ones -- worth checking class balance and the
# regularization strength used when the model was trained.
results.select("lemmatized_tokens", "prediction").show(truncate=False)

                                                                                

+----------------------------------------------------------------------+----------+
|lemmatized_tokens                                                     |prediction|
+----------------------------------------------------------------------+----------+
|[absolutely, hate, book, plot, be, non, existent, character, be, flat]|2.0       |
|[amazing, journey, not, can, put, down, last, page]                   |2.0       |
|[hate, awful]                                                         |2.0       |
+----------------------------------------------------------------------+----------+



In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, NGram, SQLTransformer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.storagelevel import StorageLevel

# Train, evaluate and save the v2 sentiment pipeline, which hashes unigrams
# and bigrams together before IDF weighting and logistic regression.
spark = (
    SparkSession.builder
    .appName("Amazon_Reviews_Recommender_Optimized")
    .config("spark.driver.memory", "8g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.memoryOverhead", "2g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "2g")
    .config("spark.sql.shuffle.partitions", "2000")
    .config("spark.default.parallelism", "2000")
    .config("spark.memory.fraction", "0.6")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    .getOrCreate()
)


PARQUET_PATH = "reviews_final_parquet"
df = spark.read.parquet(PARQUET_PATH)

# Three-way label from `overall`: [1, 2.5] -> 0.0, (2.5, 3.5] -> 1.0,
# every other row -> 2.0.
df_spark = df.withColumn(
    "label",
    F.when((F.col("overall") >= 1) & (F.col("overall") <= 2.5), 0.0)
     .when((F.col("overall") > 2.5) & (F.col("overall") <= 3.5), 1.0)
     .otherwise(2.0)
)

# Bigrams are built from the existing token column ...
ngram = NGram(n=2, inputCol="lemmatized_tokens", outputCol="bigrams")

# ... and appended to the unigrams so both are hashed into one feature space.
combiner = SQLTransformer(
    statement="SELECT *, concat(lemmatized_tokens, bigrams) AS all_tokens FROM __THIS__"
)

hashingTF = HashingTF(inputCol="all_tokens", outputCol="rawFeatures", numFeatures=2**16)

idf = IDF(inputCol="rawFeatures", outputCol="features")

lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=10,
    regParam=0.01,
    elasticNetParam=1.0,
    family="multinomial"
)

pipeline = Pipeline(stages=[ngram, combiner, hashingTF, idf, lr])

# 90/10 split with a fixed seed.
train_df, test_df = df_spark.randomSplit([0.9, 0.1], seed=42)

model = pipeline.fit(train_df)

predictions = model.transform(test_df)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1-score: {f1:.4f}")

model_path = "logreg_sentiment_pipeline_v2"
print(f"--- Saving PipelineModel to {model_path} ---")

# overwrite() so that re-running this cell does not fail on an existing model
# directory after the training above has already completed.
model.write().overwrite().save(model_path)
print("Success: Model saved.")


26/01/16 07:25:31 WARN DAGScheduler: Broadcasting large task binary with size 1127.2 KiB
26/01/16 08:00:25 WARN DAGScheduler: Broadcasting large task binary with size 1128.4 KiB
26/01/16 08:00:35 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
26/01/16 08:00:36 WARN DAGScheduler: Broadcasting large task binary with size 1127.8 KiB
26/01/16 08:40:27 WARN DAGScheduler: Broadcasting large task binary with size 1128.9 KiB
26/01/16 08:40:47 WARN DAGScheduler: Broadcasting large task binary with size 1127.8 KiB
26/01/16 08:45:56 WARN DAGScheduler: Broadcasting large task binary with size 1128.9 KiB
26/01/16 08:46:08 WARN DAGScheduler: Broadcasting large task binary with size 1127.8 KiB
26/01/16 08:50:45 WARN DAGScheduler: Broadcasting large task binary with size 1128.9 KiB
26/01/16 08:50:50 WARN DAGScheduler: Broadcasting large task binary with size 1127.8 KiB
26/01/16 08:55:35 WARN DAGScheduler: Broadcasting large task binary with size 1128.9 KiB
26/

Test Accuracy: 0.8377
Test F1-score: 0.7740
--- Saving PipelineModel to logreg_sentiment_pipeline_v2 ---


26/01/16 09:56:19 WARN TaskSetManager: Stage 45 contains a task of very large size (1052 KiB). The maximum recommended task size is 1000 KiB.
[Stage 51:>                                                         (0 + 1) / 1]

Success: Model saved.


                                                                                