# Yelp Business Silver Layer Tranformation

This notebook ingest the Yelp business data form th Bronze layer, parses JSON records, validates them, deduplicates and tranforms them int a Silver layer format using **PySpark RDDs** and wirets in Parquet format.

In [16]:
import json
import time
from datetime import datetime, timezone

import findspark

findspark.init()

try:
    from pyspark.sql import SparkSession
    from pyspark.sql.types import (IntegerType, StringType, StructField,
                                   StructType)

    pyspark_available = True
except ImportError:
    print("PySpark not available. Install with: pip install pyspark")
    pyspark_available = False

# Initialize SparkSession and SparkContext
if pyspark_available:
    spark = (
        SparkSession.builder.appName("yelp_review_silver_transform")
        .master("local[*]")
        .getOrCreate()
    )
    sc = spark.sparkContext

    print("Spark session initialzed succesfully!")
    print(f"Spark version: {spark.version}")
    print(f"Spark UI available at: {sc.uiWebUrl}")
else:
    print("Skipping Spark tasks - Pyspark not available")

Spark session initialzed succesfully!
Spark version: 3.5.0
Spark UI available at: http://test:4041


## Utility functions

In [17]:
def parse_json_safe(json_str: str) -> dict:
    """
    Safely parse a JSON string and add ingestion metadata.

    Args:
        json_str (str): The JSON string to parse.
    Returns:
        dict: A dictionary containing the parsed data and ingestion metadata,
              or error information if parsing fails.
    """
    try:
        data = json.loads(json_str)

        # Add ingestion metadata
        data["_ingestion_date"] = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        data["_ingestion_timestamp"] = time.time()
        data["_source"] = "yelp_dataset"
        data["_status"] = "valid"

        return data

    except json.JSONDecodeError as e:

        return {
            "_raw_data": json_str,
            "_ingestion_timestamp": time.time(),
            "_source": "yelp_dataset",
            "_status": "parse_error",
            "_error_msg": str(e),
        }

In [18]:
def is_business_valid(business: dict) -> bool:
    """
    Validate business data based on specific criteria.

    Args:
        business (dict): The business data to validate.
    Returns:
        bool: True if the business data is valid, False otherwise.
    """
    required_fields = ["business_id", "name", "categories"]

    if not all(field in business for field in required_fields):
        return False

    if not isinstance(business["business_id"], str) or  len(business["business_id"].strip()) == 0:
        return False
    
    if not isinstance(business["name"], str) or len(business["name"].strip()) == 0:
        return False
    
    if not isinstance(business["categories"], str) or len(business["categories"].strip()) == 0:
        return False

    return True

In [19]:
def transform_business_silver(business: dict) -> dict:
    """
    Transform business data to silver schema.

    Args:
        business (dict): The business data to transform.
    Returns:
        dict: The transformed business data.
    """
    raw_cat = business.get("categories")
    if raw_cat is None or len(raw_cat.strip()) == 0:
        categories = ["Unknown"]
    else:
        categories = [cat.strip() for cat in raw_cat.split(",") if cat.strip()]
    
    return {
        "business_id": business["business_id"],
        "name": business["name"],
        "categories": categories,
        "ingest_date": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
    }

## Quick santity check

In [20]:
!wc -l /data/bronze/yelp/raw/2025-11-13/yelp_academic_dataset_business.json

150346 /data/bronze/yelp/raw/2025-11-13/yelp_academic_dataset_business.json


In [21]:
!head -n 3 /data/bronze/yelp/raw/2025-11-13/yelp_academic_dataset_business.json

{"business_id":"Pns2l4eNsfO8kk83dixA6A","name":"Abby Rappoport, LAC, CMQ","address":"1616 Chapala St, Ste 2","city":"Santa Barbara","state":"CA","postal_code":"93101","latitude":34.4266787,"longitude":-119.7111968,"stars":5.0,"review_count":7,"is_open":0,"attributes":{"ByAppointmentOnly":"True"},"categories":"Doctors, Traditional Chinese Medicine, Naturopathic\/Holistic, Acupuncture, Health & Medical, Nutritionists","hours":null}
{"business_id":"mpf3x-BjTdTEA3yCZrAYPw","name":"The UPS Store","address":"87 Grasso Plaza Shopping Center","city":"Affton","state":"MO","postal_code":"63123","latitude":38.551126,"longitude":-90.335695,"stars":3.0,"review_count":15,"is_open":1,"attributes":{"BusinessAcceptsCreditCards":"True"},"categories":"Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services","hours":{"Monday":"0:0-0:0","Tuesday":"8:0-18:30","Wednesday":"8:0-18:30","Thursday":"8:0-18:30","Friday":"8:0-18:30","Saturday":"8:0-14:0"}}
{"business_id":"tUFrWirKiKi_TAnsVWI

## Load bronze data as RDD

In [22]:
raw_path = "file:///data/bronze/yelp/raw/2025-11-13/yelp_academic_dataset_business.json"
if pyspark_available:
    business_raw_rdd = sc.textFile(raw_path)
    business_parsed_rdd = business_raw_rdd.map(parse_json_safe)
    print("Parsed record count:", business_parsed_rdd.count())
    print("Parsed sample line:", business_parsed_rdd.take(1))

                                                                                

Parsed record count: 150346
Parsed sample line: [{'business_id': 'Pns2l4eNsfO8kk83dixA6A', 'name': 'Abby Rappoport, LAC, CMQ', 'address': '1616 Chapala St, Ste 2', 'city': 'Santa Barbara', 'state': 'CA', 'postal_code': '93101', 'latitude': 34.4266787, 'longitude': -119.7111968, 'stars': 5.0, 'review_count': 7, 'is_open': 0, 'attributes': {'ByAppointmentOnly': 'True'}, 'categories': 'Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists', 'hours': None, '_ingestion_date': '2025-11-19', '_ingestion_timestamp': 1763546101.1103616, '_source': 'yelp_dataset', '_status': 'valid'}]


## Filter parable business

In [23]:
if pyspark_available:
    business_valid_json_rdd = business_parsed_rdd.filter(lambda d: d["_status"] == "valid")
    business_invalid_json_rdd = business_parsed_rdd.filter(
        lambda d: d["_status"] == "parse_error"
    )

    total_count = business_parsed_rdd.count()
    invalid_count = business_invalid_json_rdd.count()
    print(
        f"Malformed records: {invalid_count}/{total_count} ({invalid_count/total_count*100:.2f}%)"
    )
    print(f"Valid records: {business_valid_json_rdd.count()}")

                                                                                

Malformed records: 0/150346 (0.00%)
Valid records: 150346


                                                                                

## Filter valid business

In [24]:
if pyspark_available:
    business_valid_rdd = business_valid_json_rdd.filter(is_business_valid)

    print(f"Valid business records: {business_valid_rdd.count()}")

Valid business records: 150243


                                                                                

## Deduplicate business by `business_id`

In [25]:
if pyspark_available:
    business_deduped_rdd = (
        business_valid_rdd.map(lambda r: (r["business_id"], r))
        .reduceByKey(lambda a, b: a)
        .map(lambda kv: kv[1])
    )

    print("After deduplication:", business_deduped_rdd.count())



After deduplication: 150243


                                                                                

## Apply silver transformattion

In [26]:
if pyspark_available:
    business_silver_rdd = business_deduped_rdd.map(transform_business_silver)
    print("Transformed business record count:", business_silver_rdd.count())
    print("Sample transformed business record:", business_silver_rdd.take(1))

                                                                                

Transformed business record count: 150243
Sample transformed business record: [{'business_id': 'Pns2l4eNsfO8kk83dixA6A', 'name': 'Abby Rappoport, LAC, CMQ', 'categories': ['Doctors', 'Traditional Chinese Medicine', 'Naturopathic/Holistic', 'Acupuncture', 'Health & Medical', 'Nutritionists'], 'ingest_date': '2025-11-19'}]


## Convert RDD to DataFrame

In [29]:
if pyspark_available:
    business_silver_schema = StructType(
        [
            StructField("business_id", StringType(), False),
            StructField("name", StringType(), False),
            StructField("categories", StringType(), False),
            StructField("ingest_date", StringType(), False),
        ]
    )

    business_silver_df = spark.createDataFrame(business_silver_rdd, schema=business_silver_schema)
    business_silver_df.printSchema()
    business_silver_df.show(5, truncate=False)

root
 |-- business_id: string (nullable = false)
 |-- name: string (nullable = false)
 |-- categories: string (nullable = false)
 |-- ingest_date: string (nullable = false)

+----------------------+------------------------+------------------------------------------------------------------------------------------------------------+-----------+
|business_id           |name                    |categories                                                                                                  |ingest_date|
+----------------------+------------------------+------------------------------------------------------------------------------------------------------------+-----------+
|Pns2l4eNsfO8kk83dixA6A|Abby Rappoport, LAC, CMQ|[Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists]|2025-11-19 |
|mpf3x-BjTdTEA3yCZrAYPw|The UPS Store           |[Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services]             

## Write silver data to Parquet

In [30]:
if pyspark_available:
    business_silver_path = "file:///data/silver/yelp/business/"
    business_silver_df.write.mode("overwrite").partitionBy("ingest_date").parquet(
        business_silver_path
    )
    print(f"Users silver data written to: {business_silver_path}")

[Stage 43:>                                                         (0 + 4) / 4]

Users silver data written to: file:///data/silver/yelp/business/


                                                                                

## Cleanup

In [31]:
if pyspark_available:
    spark.stop()
    print("Spark session stopped.")

Spark session stopped.
