In [0]:
%run "/Workspace/Users/sundarasandeepteja@gmail.com/E-Commerce Analytics Medallion Architecture with GenAI/config/project_config"

In [0]:
# Databricks notebook source
# ======================================
# SILVER LAYER: RATINGS CLEANING & VALIDATION
# ======================================

# MAGIC %run ../config/project_config

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# Run banner for the notebook log.
banner_rule = "=" * 60
print("ü•à SILVER LAYER: Ratings Cleaning & Validation")
print(banner_rule)

# ======================================
# STEP 1: READ BRONZE DATA
# ======================================
print("\nüì• Step 1: Reading Bronze ratings...")

# BRONZE_RATINGS_TABLE is not defined in this cell.
# NOTE(review): presumably provided by the project_config notebook that is
# %run at the top -- confirm.
bronze_ratings = spark.table(BRONZE_RATINGS_TABLE)

# Row count before any cleaning; Step 2 subtracts from it to report how many
# duplicate rows were dropped.
bronze_count = bronze_ratings.count()
print(f"  Bronze records: {bronze_count:,}")

# ======================================
# STEP 2: REMOVE DUPLICATES
# ======================================
print("\nüîÑ Step 2: Removing duplicates...")

# Extra sort keys appended to the rating_id dedup ordering.
# row_number() over an ordering that contains ties picks an arbitrary row
# among the tied ones, and each later action (.count(), the final write)
# re-evaluates this plan, so the surviving row could differ between the
# printed counts and the written table. _batch_id and _row_hash are Bronze
# metadata columns (both are selected into the final Silver column list), so
# they are available on bronze_ratings.
# NOTE(review): rows that still tie on all three keys are presumably identical
# copies -- confirm against how Bronze computes _row_hash.
dedup_tiebreakers = [F.desc("_batch_id"), F.desc("_row_hash")]

# Keep the most recent version of each rating
window = Window.partitionBy("rating_id").orderBy(F.desc("_ingested_at"), *dedup_tiebreakers)

deduped_ratings = bronze_ratings \
    .withColumn("_row_num", F.row_number().over(window)) \
    .filter(F.col("_row_num") == 1) \
    .drop("_row_num")

dup_removed = bronze_count - deduped_ratings.count()
print(f"  Duplicates removed: {dup_removed:,}")

# Also check for duplicate customer-product pairs (keep latest rating)
print("  Checking for duplicate customer-product ratings...")

# After the pass above there is one row per rating_id, so rating_id is a
# sufficient final tie-breaker when two ratings of the same product by the same
# customer share both rating_date and _ingested_at (e.g. same day, same batch).
window_cust_prod = Window.partitionBy("customer_id", "product_id").orderBy(
    F.desc("rating_date"),
    F.desc("_ingested_at"),
    F.desc("rating_id"),
)

deduped_ratings = deduped_ratings \
    .withColumn("_pair_row_num", F.row_number().over(window_cust_prod)) \
    .filter(F.col("_pair_row_num") == 1) \
    .drop("_pair_row_num")

print(f"  After customer-product dedup: {deduped_ratings.count():,}")

# ======================================
# STEP 3: DATA VALIDATION
# ======================================
print("\n‚úÖ Step 3: Validating data...")

# Filter invalid records.
# Any predicate that evaluates to NULL (e.g. a comparison on a NULL
# helpful_votes) also removes the row, because filter() keeps only rows where
# the condition is true.
valid_ratings = deduped_ratings.filter(
    # Non-null required fields
    F.col("rating_id").isNotNull() &
    F.col("customer_id").isNotNull() &
    F.col("product_id").isNotNull() &
    F.col("rating").isNotNull() &

    # Valid rating range (1-5)
    (F.col("rating") >= 1) &
    (F.col("rating") <= 5) &

    # Valid helpful votes (non-negative)
    (F.col("helpful_votes") >= 0) &

    # Valid date: the value must actually convert to a date, not merely be
    # non-null. Step 5 applies the same F.to_date() conversion; a present but
    # unparseable value would otherwise pass validation here and then become
    # NULL there, producing NULL rating_year / rating_month partition values.
    # A NULL input also yields NULL, so this subsumes the plain null check.
    # NOTE(review): this relies on to_date returning NULL for bad input, which
    # is the non-ANSI behaviour -- confirm the cluster's spark.sql.ansi.enabled.
    F.to_date("rating_date").isNotNull()
)

invalid_count = deduped_ratings.count() - valid_ratings.count()
print(f"  Invalid records filtered: {invalid_count:,}")

# ======================================
# STEP 4: REFERENTIAL INTEGRITY CHECK
# ======================================
print("\nüîó Step 4: Checking referential integrity...")

# Distinct key sets from the Bronze customers and products tables. Using
# distinct keys means the inner joins below cannot multiply rating rows.
bronze_customers = spark.table(BRONZE_CUSTOMERS_TABLE).select("customer_id").distinct()
bronze_products = spark.table(BRONZE_PRODUCTS_TABLE).select("product_id").distinct()

# Inner-join on each key in turn: a rating survives only if both its customer
# and its product are present in Bronze.
ratings_with_known_customer = valid_ratings.join(bronze_customers, on="customer_id", how="inner")
validated_ratings = ratings_with_known_customer.join(bronze_products, on="product_id", how="inner")

# Rows lost to the joins are the ones referencing unknown customers/products.
rows_before_join = valid_ratings.count()
rows_after_join = validated_ratings.count()
orphan_count = rows_before_join - rows_after_join
print(f"  Orphan records removed: {orphan_count:,}")

# ======================================
# STEP 5: DATA TYPE STANDARDIZATION
# ======================================
print("\nüîß Step 5: Standardizing data types...")

# Target-typed versions of the three columns that get overwritten in place.
rating_date_as_date = F.to_date(F.col("rating_date"))
rating_as_int = F.col("rating").cast("integer")
helpful_votes_as_int = F.col("helpful_votes").cast("integer")

# Each withColumn reuses the existing column name, so the column is replaced
# rather than added.
clean_ratings = (
    validated_ratings
    .withColumn("rating_date", rating_date_as_date)
    .withColumn("rating", rating_as_int)
    .withColumn("helpful_votes", helpful_votes_as_int)
)

print("  ‚úÖ Date parsed")
print("  ‚úÖ Integers cast")

# ======================================
# STEP 6: ADD DERIVED/ENRICHED COLUMNS
# ======================================
print("\n‚ûï Step 6: Adding enriched columns...")


def _as_flag(condition):
    """Return a boolean column: True where `condition` holds, False otherwise.

    A NULL condition falls through to the otherwise() branch and yields False.
    """
    return F.when(condition, True).otherwise(False)


# Column handles shared by the expressions below.
rating_col = F.col("rating")
votes_col = F.col("helpful_votes")
rating_date_col = F.col("rating_date")
has_review_col = F.col("has_review_bool")  # added to the frame just before it is used

# Label for each star value; anything not matched by 5/4/3/2 is "Very Poor".
rating_category_expr = (
    F.when(rating_col == 5, "Excellent")
    .when(rating_col == 4, "Good")
    .when(rating_col == 3, "Average")
    .when(rating_col == 2, "Poor")
    .otherwise("Very Poor")
)

# Three-way sentiment bucket derived from the star value.
sentiment_expr = (
    F.when(rating_col >= 4, "Positive")
    .when(rating_col == 3, "Neutral")
    .otherwise("Negative")
)

# Vote-count bands; the first matching branch wins, so the bounds are cumulative.
helpfulness_tier_expr = (
    F.when(votes_col == 0, "No Votes")
    .when(votes_col < 5, "Low")
    .when(votes_col < 20, "Medium")
    .when(votes_col < 50, "High")
    .otherwise("Very High")
)

# 0 = no review, 1 = review without votes, 2 = review with >= 1 vote,
# 3 = review with >= 10 votes.
review_engagement_expr = (
    F.when(has_review_col & (votes_col >= 10), 3)
    .when(has_review_col & (votes_col >= 1), 2)
    .when(has_review_col, 1)
    .otherwise(0)
)

# Columns are added in this order because is_recent reads days_since_rating and
# review_engagement_score reads has_review_bool, both created earlier in the chain.
enriched_ratings = (
    clean_ratings
    .withColumn("rating_year", F.year(rating_date_col))
    .withColumn("rating_month", F.month(rating_date_col))
    .withColumn("rating_quarter", F.quarter(rating_date_col))
    .withColumn("days_since_rating", F.datediff(F.current_date(), rating_date_col))
    .withColumn("rating_category", rating_category_expr)
    .withColumn("sentiment", sentiment_expr)
    .withColumn("is_positive", _as_flag(rating_col >= 4))
    .withColumn("is_negative", _as_flag(rating_col <= 2))
    .withColumn("is_recent", _as_flag(F.col("days_since_rating") <= 90))
    .withColumn("is_helpful", _as_flag(votes_col >= 5))
    .withColumn("helpfulness_tier", helpfulness_tier_expr)
    .withColumn("has_review_bool", F.col("has_review").cast("boolean"))
    .withColumn("review_engagement_score", review_engagement_expr)
)

print("  ‚úÖ Date components extracted")
print("  ‚úÖ Rating categories assigned")
print("  ‚úÖ Sentiment derived")
print("  ‚úÖ Helpfulness metrics calculated")

# ======================================
# STEP 7: ADD SILVER METADATA
# ======================================
print("\nüè∑Ô∏è Step 7: Adding Silver metadata...")

# Stamp the processing time and a constant quality score of 1.0, and drop
# _is_processed (drop() is a no-op if the column is absent).
silver_ratings = (
    enriched_ratings
    .withColumn("_silver_processed_at", F.current_timestamp())
    .withColumn("_data_quality_score", F.lit(1.0))
    .drop("_is_processed")
)

# Output columns, grouped by theme; concatenation order is the table's column order.
business_key_columns = ["rating_id", "customer_id", "product_id"]
rating_detail_columns = [
    "rating", "rating_category", "sentiment",
    "is_positive", "is_negative",
]
review_detail_columns = [
    "has_review", "helpful_votes", "helpfulness_tier",
    "is_helpful", "review_engagement_score",
]
time_dimension_columns = [
    "rating_date", "rating_year", "rating_month", "rating_quarter",
    "days_since_rating", "is_recent",
]
metadata_columns = [
    "_ingested_at", "_silver_processed_at", "_batch_id",
    "_row_hash", "_data_quality_score",
]

final_columns = (
    business_key_columns
    + rating_detail_columns
    + review_detail_columns
    + time_dimension_columns
    + metadata_columns
)

# Projecting by name drops every helper column not listed (e.g. has_review_bool).
silver_ratings_final = silver_ratings.select(*final_columns)
print(f"  Final columns: {len(final_columns)}")

# ======================================
# STEP 8: WRITE TO SILVER
# ======================================
print("\nüíæ Step 8: Writing to Silver layer...")

# Fully-qualified target table inside the Silver database.
SILVER_RATINGS_TABLE = f"{SILVER_DB}.ratings_validated"

# Full overwrite of the Delta table, partitioned by year and month of the
# rating; overwriteSchema lets the overwrite replace an existing table's schema.
silver_writer = (
    silver_ratings_final.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("rating_year", "rating_month")
    .option("overwriteSchema", "true")
)
silver_writer.saveAsTable(SILVER_RATINGS_TABLE)

print(f"  ‚úÖ Written to: {SILVER_RATINGS_TABLE}")

# Compact the freshly written table and Z-order it on the two lookup keys.
optimize_statement = f"OPTIMIZE {SILVER_RATINGS_TABLE} ZORDER BY (product_id, customer_id)"
spark.sql(optimize_statement)
print("  ‚úÖ Table optimized with ZORDER on (product_id, customer_id)")

# ======================================
# STEP 9: VERIFY AND SUMMARIZE
# ======================================
print("\nüìä Step 9: Verification & Summary...")

# Every figure below is read back from the written table, not from the
# in-memory DataFrame, so it reflects what was actually persisted.
final_count = spark.table(SILVER_RATINGS_TABLE).count()
print(f"  Silver ratings: {final_count:,}")

# Rating distribution: rows per star value and its label.
print("\nüìä Rating Distribution:")
rating_distribution = (
    spark.table(SILVER_RATINGS_TABLE)
    .groupBy("rating", "rating_category")
    .agg(F.count("*").alias("count"))
    .orderBy("rating")
)
rating_distribution.show()

# Sentiment breakdown: row count and mean helpful votes per sentiment, largest first.
print("üìä Sentiment Breakdown:")
sentiment_breakdown = (
    spark.table(SILVER_RATINGS_TABLE)
    .groupBy("sentiment")
    .agg(
        F.count("*").alias("count"),
        F.round(F.avg("helpful_votes"), 2).alias("avg_helpful_votes"),
    )
    .orderBy(F.desc("count"))
)
sentiment_breakdown.show()

# Review engagement: a single-row summary over the whole table.
print("üìä Review Engagement:")
# 1 for rows whose has_review casts to true, else 0; summed for the count and the share.
review_indicator = F.when(F.col("has_review").cast("boolean"), 1).otherwise(0)
helpful_indicator = F.when(F.col("is_helpful"), 1).otherwise(0)
review_engagement = spark.table(SILVER_RATINGS_TABLE).agg(
    F.count("*").alias("total_ratings"),
    F.sum(review_indicator).alias("with_reviews"),
    F.round(F.sum(review_indicator) / F.count("*") * 100, 2).alias("review_pct"),
    F.round(F.avg("helpful_votes"), 2).alias("avg_helpful"),
    F.sum(helpful_indicator).alias("helpful_reviews"),
)
display(review_engagement)

# Recency analysis: row count and mean rating for recent vs. older ratings.
print("üìä Recency Analysis:")
recency_summary = (
    spark.table(SILVER_RATINGS_TABLE)
    .groupBy("is_recent")
    .agg(
        F.count("*").alias("count"),
        F.round(F.avg("rating"), 2).alias("avg_rating"),
    )
)
recency_summary.show()

closing_rule = "=" * 60
print("\n" + closing_rule)
print("ü•à SILVER RATINGS CLEANING COMPLETE!")
print(closing_rule)