In [0]:
%run "/Workspace/Users/sundarasandeepteja@gmail.com/E-Commerce Analytics Medallion Architecture with GenAI/config/project_config"

In [0]:
# Databricks notebook source
# ======================================
# SILVER LAYER: PRODUCT RATING AGGREGATES
# ======================================

# MAGIC %run ../config/project_config

from pyspark.sql import functions as F

# Stage banner: the title line followed by a 60-character rule.
print("ðŸ¥ˆ SILVER LAYER: Product Rating Aggregates", "=" * 60, sep="\n")

# Source table for the per-product aggregation: validated ratings in the
# Silver database (SILVER_DB is supplied by the project_config notebook).
_ratings_source = f"{SILVER_DB}.ratings_validated"
ratings = spark.table(_ratings_source)

# Aggregate ratings per product
def _count_where(condition, alias):
    """Build an aggregate that counts the rows for which `condition` is true.

    Rows where the condition is false or NULL fall through to `otherwise(0)`,
    so they add nothing to the sum.
    """
    return F.sum(F.when(condition, 1).otherwise(0)).alias(alias)


# Star value -> output column, listed in the order the columns must appear.
_STAR_BUCKETS = [
    (5, "five_star_count"),
    (4, "four_star_count"),
    (3, "three_star_count"),
    (2, "two_star_count"),
    (1, "one_star_count"),
]

# The order of this list determines the column order of the resulting frame.
_rating_aggregates = [
    # Volume
    F.count("*").alias("total_ratings"),
    _count_where(F.col("has_review").cast("boolean"), "total_reviews"),
    # Central tendency and spread (F.stddev is the sample standard deviation)
    F.round(F.avg("rating"), 2).alias("avg_rating"),
    F.round(F.stddev("rating"), 2).alias("rating_stddev"),
    F.min("rating").alias("min_rating"),
    F.max("rating").alias("max_rating"),
    # Star distribution
    *[_count_where(F.col("rating") == stars, name) for stars, name in _STAR_BUCKETS],
    # Sentiment flags carried on each rating row
    _count_where(F.col("is_positive"), "positive_count"),
    _count_where(F.col("is_negative"), "negative_count"),
    # Helpfulness
    F.sum("helpful_votes").alias("total_helpful_votes"),
    F.round(F.avg("helpful_votes"), 2).alias("avg_helpful_votes"),
    # Recency
    F.max("rating_date").alias("last_rating_date"),
    F.min("rating_date").alias("first_rating_date"),
    _count_where(F.col("is_recent"), "recent_ratings_count"),
]

product_rating_agg = ratings.groupBy("product_id").agg(*_rating_aggregates)

# Add calculated metrics
# Every derived column reads only columns produced by the aggregation (none
# depends on another derived column), so they are appended in one projection.
_total_ratings = F.col("total_ratings")
_positive = F.col("positive_count")
_negative = F.col("negative_count")


def _pct_of_total(numerator):
    """Express `numerator` as a percentage of total_ratings, rounded to 2 places."""
    return F.round(numerator / _total_ratings * 100, 2)


# Confidence tier driven purely by rating volume.
_confidence_tier = (
    F.when(_total_ratings >= 100, "High")
    .when(_total_ratings >= 20, "Medium")
    .otherwise("Low")
)

# Flags whether the product has at least five ratings marked as recent.
_recency_label = (
    F.when(F.col("recent_ratings_count") >= 5, "Has Recent Data")
    .otherwise("Limited Recent Data")
)

product_rating_enriched = product_rating_agg.select(
    "*",
    _pct_of_total(_positive).alias("positive_rate"),
    _pct_of_total(_negative).alias("negative_rate"),
    _pct_of_total(F.col("total_reviews")).alias("review_rate"),
    _confidence_tier.alias("rating_confidence"),
    # Net sentiment: positive share minus negative share, in percentage points.
    _pct_of_total(_positive - _negative).alias("sentiment_score"),
    _recency_label.alias("rating_trend"),
    F.current_timestamp().alias("_aggregated_at"),
)

# Write to Silver
# Persist the enriched per-product summary as a Delta table, replacing any
# previous contents of the table.
_summary_table = f"{SILVER_DB}.product_ratings_summary"
product_rating_enriched.write.format("delta").mode("overwrite").saveAsTable(
    _summary_table
)

print(f"âœ… Created: {_summary_table}")
# Count the rows that were actually persisted. Calling .count() on the lazy
# `product_rating_enriched` frame would re-run the full scan + groupBy a second
# time and report on that recomputation rather than on the saved table.
_records_written = spark.table(_summary_table).count()
print(f"   Records: {_records_written:,}")

# Show sample
print("\nðŸ“Š Sample Product Rating Summaries:")

# Headline columns for a quick visual check of the saved table.
_sample_columns = [
    "product_id",
    "total_ratings",
    "avg_rating",
    "positive_rate",
    "sentiment_score",
    "rating_confidence",
]

# Ten most-rated products, read back from the persisted table.
_most_rated_sample = (
    spark.table(f"{SILVER_DB}.product_ratings_summary")
    .select(*_sample_columns)
    .orderBy(F.desc("total_ratings"))
    .limit(10)
)
display(_most_rated_sample)

print("\nðŸ¥ˆ PRODUCT RATING AGGREGATES COMPLETE!")