# Yelp Users Silver Layer Transformation

This notebook ingests Yelp user data from the Bronze layer, parses the JSON records, validates and deduplicates them, transforms them into the Silver layer schema using **PySpark RDDs**, and writes the result in Parquet format.

In [None]:
import json
import math
import time
from datetime import datetime, timezone

import findspark

findspark.init()

try:
    from pyspark.sql import SparkSession
    from pyspark.sql.types import (IntegerType, StringType, StructField,
                                   StructType)

    pyspark_available = True
except ImportError:
    print("PySpark not available. Install with: pip install pyspark")
    pyspark_available = False

# Initialize SparkSession and SparkContext (local mode, all available cores)
if pyspark_available:
    spark = (
        # The app name labels this job in the Spark UI; this notebook processes
        # users, so it is named accordingly.
        SparkSession.builder.appName("yelp_users_silver_transform")
        .master("local[*]")
        .getOrCreate()
    )
    # Low-level context used by the RDD-based cells below.
    sc = spark.sparkContext

    print("Spark session initialized successfully!")
    print(f"Spark version: {spark.version}")
    print(f"Spark UI available at: {sc.uiWebUrl}")
else:
    print("Skipping Spark tasks - PySpark not available")

## Utility functions

In [None]:
def parse_json_safe(json_str: str) -> dict:
    """
    Safely parse one JSON line and add ingestion metadata.

    A line counts as valid only when it decodes to a JSON object. Malformed
    JSON, a non-string input, or a well-formed JSON value that is not an
    object (list, number, string, ``null``) produces an error record instead
    of raising, so a single bad line cannot abort the whole job.

    Args:
        json_str (str): The JSON string to parse.
    Returns:
        dict: The parsed record with ``_ingestion_date``,
              ``_ingestion_timestamp``, ``_source`` and ``_status`` set to
              ``"valid"``; or an error record holding ``_raw_data``,
              ``_ingestion_timestamp``, ``_source``, ``_status`` set to
              ``"parse_error"`` and ``_error_msg``.
    """
    error_msg = None
    try:
        data = json.loads(json_str)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers inputs that are not str/bytes (e.g. None).
        error_msg = str(e)
    else:
        # Metadata is attached by key, which only works on a JSON object.
        if not isinstance(data, dict):
            error_msg = f"Expected a JSON object, got {type(data).__name__}"

    if error_msg is not None:
        return {
            "_raw_data": json_str,
            "_ingestion_timestamp": time.time(),
            "_source": "yelp_dataset",
            "_status": "parse_error",
            "_error_msg": error_msg,
        }

    # Add ingestion metadata
    data["_ingestion_date"] = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    data["_ingestion_timestamp"] = time.time()
    data["_source"] = "yelp_dataset"
    data["_status"] = "valid"

    return data

In [None]:
def is_user_valid(user: dict) -> bool:
    """
    Validate a user record based on required fields and types.

    A record is valid when all required fields are present, ``user_id`` and
    ``name`` are non-blank strings, ``review_count`` converts to a
    non-negative int, ``average_stars`` converts to a finite float, and
    ``yelping_since`` matches the ``%Y-%m-%d %H:%M:%S`` format.

    Args:
        user (dict): The user record to validate.
    Returns:
        bool: True if the user record is valid, False otherwise.
    """
    required_fields = (
        "user_id",
        "name",
        "review_count",
        "average_stars",
        "yelping_since",
    )

    # 1. Check required fields exist
    if any(field not in user for field in required_fields):
        return False

    # 2. Check user_id and name are non-empty strings
    for key in ("user_id", "name"):
        value = user[key]
        if not isinstance(value, str) or not value.strip():
            return False

    # 3. Validate numeric fields (allow int, float, numeric strings) and the
    #    signup timestamp format
    try:
        if int(user["review_count"]) < 0:
            return False

        # json.loads accepts NaN/Infinity literals; such a rating is not usable.
        if not math.isfinite(float(user["average_stars"])):
            return False

        datetime.strptime(user["yelping_since"], "%Y-%m-%d %H:%M:%S")
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "invalid record"; OverflowError covers
        # int(inf) and float() of an oversized integer.
        return False

    return True

In [None]:
def transform_user_silver(user: dict) -> dict:
    """
    Project a validated user record onto the silver columns.

    String identifiers are stripped of surrounding whitespace, the numeric
    fields are cast to ``int`` / ``float``, ``yelping_since`` is passed
    through unchanged, and the current UTC date is stamped as ``ingest_date``.

    Args:
        user (dict): The valid user record to transform.
    Returns:
        dict: The transformed user record.
    """
    clean_id = user["user_id"].strip()
    clean_name = user["name"].strip()
    reviews = int(user["review_count"])
    stars = float(user["average_stars"])
    member_since = user["yelping_since"]
    # Date the record passed through this transformation, in UTC.
    today_utc = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    return dict(
        user_id=clean_id,
        name=clean_name,
        review_count=reviews,
        average_stars=stars,
        yelping_since=member_since,
        ingest_date=today_utc,
    )

## Quick sanity check

In [None]:
!wc -l /data/bronze/yelp/raw/2025-11-13/yelp_academic_dataset_user.json

In [None]:
!head -n 3 /data/bronze/yelp/raw/2025-11-13/yelp_academic_dataset_user.json

## Load bronze data as RDD

In [None]:
# Bronze-layer users dump; each line of the file is handed to parse_json_safe.
raw_path = "file:///data/bronze/yelp/raw/2025-11-13/yelp_academic_dataset_user.json"
if pyspark_available:
    # Read the file line by line, then parse every line into a dict that
    # carries a "_status" flag.
    users_raw_rdd = sc.textFile(raw_path)
    users_parsed_rdd = users_raw_rdd.map(parse_json_safe)

    parsed_count = users_parsed_rdd.count()
    print("Parsed record count:", parsed_count)

    parsed_sample = users_parsed_rdd.take(1)
    print("Parsed sample line:", parsed_sample)

## Filter parsable users

In [None]:
if pyspark_available:
    # Split the parsed records on the "_status" flag set by parse_json_safe.
    users_valid_json_rdd = users_parsed_rdd.filter(lambda d: d["_status"] == "valid")
    users_invalid_json_rdd = users_parsed_rdd.filter(
        lambda d: d["_status"] == "parse_error"
    )

    total_count = users_parsed_rdd.count()
    invalid_count = users_invalid_json_rdd.count()
    # Guard the ratio: an empty input file would otherwise raise
    # ZeroDivisionError here.
    invalid_pct = invalid_count / total_count * 100 if total_count else 0.0
    print(f"Malformed records: {invalid_count}/{total_count} ({invalid_pct:.2f}%)")
    print(f"Valid records: {users_valid_json_rdd.count()}")

## Filter valid users

In [None]:
if pyspark_available:
    # Keep only the parsed records that pass the field checks in is_user_valid.
    users_valid_rdd = users_valid_json_rdd.filter(is_user_valid)
    valid_user_count = users_valid_rdd.count()
    print(f"Valid users records: {valid_user_count}")

## Deduplicate users by `user_id`

In [None]:
if pyspark_available:
    users_deduped_rdd = (
        # Key on the stripped user_id: transform_user_silver strips the id
        # afterwards, so ids that differ only in surrounding whitespace must
        # collapse to one record here or they end up as duplicates in Silver.
        users_valid_rdd.map(lambda r: (r["user_id"].strip(), r))
        # Keep a single record per key; no ordering between duplicates is applied.
        .reduceByKey(lambda a, b: a)
        # Drop the key again and keep only the record.
        .map(lambda kv: kv[1])
    )

    print("After deduplication:", users_deduped_rdd.count())

## Apply silver transformation

In [None]:
if pyspark_available:
    # Map every deduplicated record onto the silver columns.
    users_silver_rdd = users_deduped_rdd.map(transform_user_silver)

    silver_count = users_silver_rdd.count()
    print("Transformed users record count:", silver_count)

    silver_sample = users_silver_rdd.take(1)
    print("Sample transformed users record:", silver_sample)

## Convert RDD to DataFrame

In [None]:
if pyspark_available:
    # Imported here so this cell does not depend on the import list of the
    # setup cell.
    from pyspark.sql.types import DoubleType

    # Explicit schema for the silver records; every column is non-nullable.
    users_silver_schema = StructType(
        [
            StructField("user_id", StringType(), False),
            StructField("name", StringType(), False),
            StructField("review_count", IntegerType(), False),
            # transform_user_silver emits average_stars as a Python float, so
            # the column is declared DoubleType rather than StringType to keep
            # it numeric in the output.
            StructField("average_stars", DoubleType(), False),
            StructField("yelping_since", StringType(), False),
            StructField("ingest_date", StringType(), False),
        ]
    )

    users_silver_df = spark.createDataFrame(
        users_silver_rdd, schema=users_silver_schema
    )
    users_silver_df.printSchema()
    users_silver_df.show(5, truncate=False)

## Write silver data to Parquet

In [None]:
if pyspark_available:
    users_silver_path = "file:///data/silver/yelp/users/"

    # NOTE(review): with Spark's default (static) partition overwrite mode,
    # "overwrite" presumably replaces everything under the target path,
    # including partitions from earlier ingest dates - confirm this is intended.
    silver_writer = users_silver_df.write.mode("overwrite").partitionBy("ingest_date")
    silver_writer.parquet(users_silver_path)

    print(f"Users silver data written to: {users_silver_path}")

## Cleanup

In [None]:
if pyspark_available:
    # Stop the SparkSession created at the top of the notebook now that the
    # silver output has been written.
    spark.stop()
    print("Spark session stopped.")