In [1]:
import findspark

findspark.init()

try:
    from pyspark.ml import Pipeline
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    from pyspark.ml.feature import IDF, HashingTF, StopWordsRemover, Tokenizer
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    pyspark_available = True
except ImportError:
    print("PySpark not available. Install with: pip install pyspark")
    pyspark_available = False

# Create (or reuse) a local-mode SparkSession and keep a handle on its SparkContext.
if not pyspark_available:
    print("Skipping Spark tasks - Pyspark not available")
else:
    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("yelp_review_silver_transform")
        .getOrCreate()
    )
    sc = spark.sparkContext

    # Report the session's version and web UI address.
    print("Spark session initialzed succesfully!")
    print(f"Spark version: {spark.version}")
    print(f"Spark UI available at: {sc.uiWebUrl}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/26 09:37:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark session initialzed succesfully!
Spark version: 3.5.0
Spark UI available at: http://test:4040


## Load silver review data as DataFrame

In [2]:
# Silver-layer review Parquet data under the ingest_date=2025-11-25 directory.
# The path is written with a plain "=": the previous "\=" is not a valid Python
# escape sequence (newer interpreters warn about it) and it left a literal
# backslash inside the path string.
reivew_silver_path = "file:///data/silver/yelp/review/ingest_date=2025-11-25/"

if pyspark_available:
    review_silver_df = spark.read.parquet(reivew_silver_path)
    # Preview the first rows to confirm the load.
    review_silver_df.show(5)

                                                                                

+--------------------+--------------------+--------------------+-----+--------------------+-------------------+------+-----+----+
|           review_id|             user_id|         business_id|stars|          text_clean|               date|useful|funny|cool|
+--------------------+--------------------+--------------------+-----+--------------------+-------------------+------+-----+----+
|gImS1dtA_TixEouDf...|xE7AXFF9wVaN6id6O...|D5V0Fawd6ODVgqCY8...|    4|i have been to a ...|2017-01-14 21:05:04|     1|    0|   2|
|JJgPXqfCfY-prfVUo...|2Pqs3nedztBqwQE7z...|4-CxVJ9n6TPPAmKV7...|    5|my first time her...|2011-07-29 11:24:29|     2|    0|   0|
|BMeYOL1Qj95QTb_XU...|nZ1bQi8LqaDk9a2oy...|Jn_F3hYqknRTepCg6...|    5|i absolutely love...|2013-07-12 22:39:58|     0|    0|   0|
|bg3TAimbMppppLUTf...|R9VOYcIWbNA3Q3qiW...|EIlFs8kybcG-l60GJ...|    1|avoid at all cost...|2017-02-09 06:35:49|     0|    0|   0|
|YsO0W7CJVDwDyD4rL...|T4ZUbz-rN5Lue9oJ5...|kQANkgj6vHc1xCmxf...|    4|i definitely pref...

## Prepare labels

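Two-class supervised sentiment: reviews with `stars <= 2` get label 0 (negative), reviews with `stars >= 4` get label 1 (positive), and all other reviews (e.g. 3 stars) are dropped.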

In [3]:
if pyspark_available:
    # Binary target: stars <= 2 -> 0, stars >= 4 -> 1, anything else -> null.
    label_expr = (
        F.when(F.col("stars") <= 2, 0)
        .when(F.col("stars") >= 4, 1)
        .otherwise(None)
    )

    # Rows whose label is null (e.g. 3-star reviews) are removed.
    review_labelled_df = review_silver_df.withColumn("label", label_expr).dropna(
        subset=["label"]
    )

    # Show how many reviews fall in each class.
    review_labelled_df.groupBy("label").count().show()



+-----+-------+
|label|  count|
+-----+-------+
|    1|4684545|
|    0|1613801|
+-----+-------+



                                                                                

## ML Text Pipeline

In [None]:
if pyspark_available:
    # Stage 1: split the cleaned review text into tokens.
    tokenizer = Tokenizer(
        inputCol="text_clean",
        outputCol="tokens",
    )

    # Stage 2: drop stop words from the token list.
    remover = StopWordsRemover(
        inputCol="tokens",
        outputCol="filtered",
    )

    # Stage 3: hash the remaining tokens into a 10000-dimensional term-frequency vector.
    # NOTE(review): Spark's docs advise a power of two for numFeatures; 10000 is kept as-is.
    hashingTF = HashingTF(
        inputCol="filtered",
        outputCol="rawFeatures",
        numFeatures=10000,
    )

    # Stage 4: rescale the raw term frequencies by inverse document frequency.
    idf = IDF(
        inputCol="rawFeatures",
        outputCol="features",
    )

    # Stage 5: logistic regression on the TF-IDF features, capped at 10 iterations.
    lr = LogisticRegression(
        featuresCol="features",
        labelCol="label",
        maxIter=10,
    )

## Build Pipeline

In [None]:
if pyspark_available:
    # Chain the stages in order: tokenize -> stop-word removal -> TF -> IDF -> classifier.
    pipeline = Pipeline().setStages([tokenizer, remover, hashingTF, idf, lr])

## Train/Test Split

In [6]:
if pyspark_available:
    # 80/20 random split of the labelled reviews, using a fixed seed (42).
    train, test = review_labelled_df.randomSplit(weights=[0.8, 0.2], seed=42)

## Fit the Model

In [7]:
if pyspark_available:
    # Fit all pipeline stages on the training split; `model` is the fitted pipeline.
    model = pipeline.fit(dataset=train)

25/11/26 09:41:49 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/11/26 09:42:02 WARN MemoryStore: Not enough space to cache rdd_42_3 in memory! (computed 65.0 MiB so far)
25/11/26 09:42:02 WARN BlockManager: Persisting block rdd_42_3 to disk instead.
25/11/26 09:42:03 WARN MemoryStore: Not enough space to cache rdd_42_1 in memory! (computed 65.0 MiB so far)
25/11/26 09:42:03 WARN BlockManager: Persisting block rdd_42_1 to disk instead.
25/11/26 09:42:11 WARN MemoryStore: Not enough space to cache rdd_42_3 in memory! (computed 65.0 MiB so far)
25/11/26 09:42:11 WARN MemoryStore: Not enough space to cache rdd_42_1 in memory! (computed 65.0 MiB so far)
25/11/26 09:42:11 WARN MemoryStore: Not enough space to cache rdd_42_4 in memory! (computed 17.0 MiB so far)
25/11/26 09:42:11 WARN BlockManager: Persisting block rdd_42_4 to disk instead.
25/11/26 09:42:20 WARN MemoryStore: Not enough space to cache rdd_42_7 in memory! (computed 33.0 MiB so far)

## Evaluate Model

In [8]:
if pyspark_available:
    # Accuracy evaluator comparing the "prediction" column against "label".
    evaluator = MulticlassClassificationEvaluator(
        metricName="accuracy",
        labelCol="label",
        predictionCol="prediction",
    )

    # Score the held-out split with the fitted pipeline, then measure accuracy on it.
    predictions = model.transform(test)
    accuracy = evaluator.evaluate(predictions)

    print(f"Test set accuracy = {accuracy:.4f}")



Test set accuracy = 0.9443


                                                                                

## Run the model on all the silver reviews

In [9]:
if pyspark_available:
    # Score every silver review (not only the labelled subset) with the fitted pipeline.
    predictions_gold_df = model.transform(review_silver_df)

    # Columns kept for the gold table; the model outputs are renamed on the way out.
    gold_columns = [
        "review_id",
        "business_id",
        "user_id",
        "stars",
        "text_clean",
        F.col("prediction").alias("sentiment_ml"),
        F.col("probability").alias("sentiment_probability"),
        "date",
    ]
    gold_df = predictions_gold_df.select(*gold_columns)

In [10]:
# Output location for the per-review sentiment predictions.
gold_path = "file:///data/gold/yelp/review_sentiment_ml/"

if pyspark_available:
    # Write as Parquet, replacing whatever already exists at the target path.
    gold_df.write.parquet(gold_path, mode="overwrite")
    print(f"Gold data written to: {gold_path}")



Gold data written to: file:///data/gold/yelp/review_sentiment_ml/


                                                                                

In [11]:
if pyspark_available:
    # One row per business: mean predicted sentiment, mean star rating, and review count.
    reviews_by_business = gold_df.groupBy("business_id")
    business_gold_df = reviews_by_business.agg(
        F.avg("sentiment_ml").alias("avg_sentiment_ml"),
        F.avg("stars").alias("avg_stars"),
        F.count("*").alias("review_count"),
    )

    # Preview the aggregate.
    business_gold_df.show(5)



+--------------------+-------------------+------------------+------------+
|         business_id|   avg_sentiment_ml|         avg_stars|review_count|
+--------------------+-------------------+------------------+------------+
|Ryt1Fhgz7sixMQSJi...| 0.5483870967741935| 2.629032258064516|          62|
|q6661I3CGW0UB740E...|0.35507246376811596| 2.239130434782609|         138|
|TuRy46Cyb7MWjV7VM...| 0.5194805194805194| 2.896103896103896|          77|
|wIHee6-l_ODAkkFEy...| 0.8285714285714286|4.0285714285714285|          35|
|AwmeLVLEfdFoCa0La...| 0.8076923076923077|3.6538461538461537|          26|
+--------------------+-------------------+------------------+------------+
only showing top 5 rows



                                                                                

In [12]:
# Silver-layer business Parquet data under the ingest_date=2025-11-25 directory.
# The path is written with a plain "=": the previous "\=" is not a valid Python
# escape sequence (newer interpreters warn about it) and it left a literal
# backslash inside the path string.
business_silver_path = "file:///data/silver/yelp/business/ingest_date=2025-11-25/"

if pyspark_available:
    business_silver_df = spark.read.parquet(business_silver_path)

    # Attach business name and categories to every scored review.
    # A left join keeps reviews whose business_id has no match in the business data.
    enriched_gold_df = gold_df.join(
        business_silver_df.select("business_id", "name", "categories"),
        on="business_id",
        how="left",
    )

    # One row per business (with its name and categories): mean predicted
    # sentiment, mean star rating, and number of reviews.
    business_summary_df = enriched_gold_df.groupBy(
        "business_id", "name", "categories"
    ).agg(
        F.avg("sentiment_ml").alias("avg_sentiment"),
        F.avg("stars").alias("avg_stars"),
        F.count("*").alias("review_count"),
    )

    # Preview the summary.
    business_summary_df.show(5)



+--------------------+--------------------+--------------------+-------------------+------------------+------------+
|         business_id|                name|          categories|      avg_sentiment|         avg_stars|review_count|
+--------------------+--------------------+--------------------+-------------------+------------------+------------+
|w9KFP5loQ8WaGLs_s...|Jungle Wonder Fam...|[Laser Tag, Arts ...|0.47619047619047616|2.4285714285714284|          21|
|fnO03-RX7UDC1TzXE...|Pinocchio's Resta...|[Restaurants, Bee...|            0.71875|        3.50390625|         256|
|dNR-b-CsrFGYhMo9z...|          Soy Bistro|[Japanese, Korean...| 0.9397590361445783| 4.602409638554217|         498|
|YARjtcQLeoPR7hKz8...|Ong Hanoi Fried C...|[Restaurants, Chi...|              0.875|              4.25|           8|
|tH10ZuH28GrccqTZ6...|Carnival Cruise L...|[Hotels & Travel,...| 0.5112359550561798| 2.567415730337079|         178|
+--------------------+--------------------+--------------------+

                                                                                