In [None]:
import os

# Cluster address used when the SPARK_MASTER_URL environment variable is unset.
DEFAULT_SPARK_MASTER_URL = "spark://192.168.5.121:7077"

try:
    # findspark is imported and initialised inside the guard so that a missing
    # findspark package follows the same "skip Spark" path as a missing pyspark
    # instead of aborting the cell before `pyspark_available` is assigned.
    import findspark

    findspark.init()

    from pyspark.ml import Pipeline
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    from pyspark.ml.feature import IDF, HashingTF, StopWordsRemover, Tokenizer
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    pyspark_available = True
except (ImportError, ValueError) as spark_import_error:
    # NOTE(review): ValueError is what findspark.init() is expected to raise
    # when it cannot locate a Spark installation - confirm against the
    # installed findspark version.
    print("PySpark not available. Install with: pip install pyspark")
    print(f"Reason: {spark_import_error}")
    pyspark_available = False

# Initialize SparkSession and SparkContext
if pyspark_available:
    # The master URL can be overridden per environment without editing the
    # notebook; the default preserves the original hard-coded cluster address.
    spark_master_url = os.environ.get("SPARK_MASTER_URL", DEFAULT_SPARK_MASTER_URL)

    spark = (
        SparkSession.builder.appName("gold_sentiment_analysis")
        .master(spark_master_url)
        .config("spark.sql.adaptive.enabled", "true")
        # .config("spark.dynamicAllocation.enabled", "true")
        # .config("spark.dynamicAllocation.minExecutors", "3")
        # .config("spark.dynamicAllocation.maxExecutors", "16")
        # Executor settings
        .config("spark.executor.cores", "4")
        .config("spark.executor.memory", "6g")
        # Driver memory
        .config("spark.driver.memory", "2g")
        # Parallelism settings
        .config("spark.sql.shuffle.partitions", "24")
        .config("spark.default.parallelism", "24")
        .getOrCreate()
    )
    sc = spark.sparkContext

    print("Spark session initialzed succesfully!")
    print(f"Spark version: {spark.version}")
    print(f"Spark UI available at: {sc.uiWebUrl}")
else:
    print("Skipping Spark tasks - Pyspark not available")

## Load silver review data as DataFrame

In [None]:
# Path of the silver-layer Yelp review snapshot to load.
# NOTE: the variable name is misspelled ("reivew"); it is kept unchanged in
# case other cells refer to it.
reivew_silver_path = "/data/silver/yelp/review/ingest_date=2025-11-27/"

if pyspark_available:
    # Read the Parquet files at the path above into a DataFrame.
    review_silver_df = spark.read.format("parquet").load(reivew_silver_path)

    # Preview the first five rows.
    review_silver_df.show(n=5)

## Prepare labels

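Two-class supervised sentiment: reviews with `stars <= 2` are labelled negative (0), reviews with `stars >= 4` are labelled positive (1), and all other reviews are dropped.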

In [None]:
if pyspark_available:
    star_rating = F.col("stars")

    # stars <= 2 -> 0, stars >= 4 -> 1; every other row gets a null label.
    sentiment_label = (
        F.when(star_rating <= 2, 0).when(star_rating >= 4, 1).otherwise(None)
    )

    # Attach the label and discard the rows whose label is null.
    review_labelled_df = review_silver_df.withColumn(
        "label", sentiment_label
    ).dropna(subset=["label"])

    # Display the number of rows per label.
    review_labelled_df.groupBy("label").count().show()

## ML Text Pipeline

In [None]:
if pyspark_available:
    # text_clean -> tokens
    # NOTE(review): presumably `text_clean` contains no nulls - verify against
    # the silver data before fitting.
    tokenizer = Tokenizer(
        outputCol="tokens",
        inputCol="text_clean",
    )

    # tokens -> filtered (stop words removed)
    remover = StopWordsRemover(
        outputCol="filtered",
        inputCol="tokens",
    )

    # filtered -> rawFeatures (hashed term frequencies, 10000 features)
    hashingTF = HashingTF(
        numFeatures=10000,
        outputCol="rawFeatures",
        inputCol="filtered",
    )

    # rawFeatures -> features (IDF re-weighting)
    idf = IDF(
        outputCol="features",
        inputCol="rawFeatures",
    )

    # Classifier trained on `features` against `label`, capped at 10 iterations.
    lr = LogisticRegression(
        maxIter=10,
        labelCol="label",
        featuresCol="features",
    )

## Build Pipeline

In [None]:
if pyspark_available:
    # Chain the stages in execution order: tokenise, drop stop words,
    # hash term frequencies, apply IDF, then classify.
    pipeline = Pipeline().setStages(
        [
            tokenizer,
            remover,
            hashingTF,
            idf,
            lr,
        ]
    )

## Train/Test Split

In [None]:
if pyspark_available:
    # 80/20 random split of the labelled reviews; the fixed seed keeps the
    # split repeatable between runs.
    train, test = review_labelled_df.randomSplit(
        weights=[0.8, 0.2],
        seed=42,
    )

## Fit the Model

In [None]:
if pyspark_available:
    # Fit every pipeline stage on the training split; the result is the
    # fitted model used by the evaluation and scoring cells.
    model = pipeline.fit(dataset=train)

## Evaluate Model

In [None]:
if pyspark_available:
    # Accuracy evaluator comparing the `prediction` column with `label`.
    evaluator = MulticlassClassificationEvaluator(
        metricName="accuracy",
        predictionCol="prediction",
        labelCol="label",
    )

    # Score the held-out split with the fitted pipeline.
    predictions = model.transform(test)

    accuracy = evaluator.evaluate(predictions)
    print(f"Test set accuracy = {accuracy:.4f}")

## Run the model on all the silver reviews

In [None]:
if pyspark_available:
    # Score every silver review (not only the labelled subset).
    predictions_gold_df = model.transform(review_silver_df)

    # Keep the review identifiers, the original rating and text, and expose
    # the model outputs under gold-layer column names.
    gold_df = predictions_gold_df.select(
        "review_id",
        "business_id",
        "user_id",
        "stars",
        "text_clean",
        predictions_gold_df["prediction"].alias("sentiment_ml"),
        predictions_gold_df["probability"].alias("sentiment_probability"),
        "date",
    )

In [None]:
# Output location of the per-review sentiment table.
gold_path = "/data/gold/yelp/review_sentiment_ml/"

if pyspark_available:
    # Replace any existing output at the target path with the new Parquet data.
    gold_df.write.parquet(gold_path, mode="overwrite")
    print(f"Gold data written to: {gold_path}")

In [None]:
if pyspark_available:
    # Per-business aggregates: mean predicted sentiment, mean star rating and
    # number of reviews.
    reviews_by_business = gold_df.groupBy("business_id")

    business_gold_df = reviews_by_business.agg(
        F.avg("sentiment_ml").alias("avg_sentiment_ml"),
        F.avg("stars").alias("avg_stars"),
        F.count("*").alias("review_count"),
    )

    # Preview the first five rows.
    business_gold_df.show(n=5)

In [None]:
# Path of the silver-layer Yelp business snapshot to join against.
business_silver_path = "/data/silver/yelp/business/ingest_date=2025-11-25/"

if pyspark_available:
    business_silver_df = spark.read.format("parquet").load(business_silver_path)

    # Business columns carried into the summary; also used as grouping keys.
    business_columns = ["business_id", "name", "categories"]

    # Left join keeps every scored review even when no business row matches.
    # NOTE(review): assumes `business_id` is unique in the business table;
    # duplicates would multiply review rows - confirm.
    enriched_gold_df = gold_df.join(
        business_silver_df.select(*business_columns),
        on="business_id",
        how="left",
    )

    # Aggregate the scored reviews per business.
    business_summary_df = enriched_gold_df.groupBy(*business_columns).agg(
        F.avg("sentiment_ml").alias("avg_sentiment"),
        F.avg("stars").alias("avg_stars"),
        F.count("*").alias("review_count"),
    )

    # Preview the first five rows.
    business_summary_df.show(n=5)

In [None]:
# Output location of the per-business sentiment summary.
gold_business_path = "/data/gold/yelp/business_sentiment_summary/"

if pyspark_available:
    # Replace any existing output at the target path with the new Parquet data.
    business_summary_df.write.parquet(gold_business_path, mode="overwrite")
    print(f"Gold business sentiment summary written to: {gold_business_path}")