In [None]:
import time
from datetime import datetime
import json
import re
import findspark
findspark.init()

# Probe for PySpark once; every later cell keys off `pyspark_available`.
try:
    from pyspark.sql import SparkSession
except ImportError:
    pyspark_available = False
    print("PySpark not available. Install with: pip install pyspark")
else:
    pyspark_available = True

# Create (or reuse) the session and expose its SparkContext as `sc`.
if not pyspark_available:
    print("Skipping Spark tasks - Pyspark not available")
else:
    spark = (
        SparkSession.builder
        .appName("Yelp_bronze_cleaning_rdd")
        .master("local[*]")
        .getOrCreate()
    )
    sc = spark.sparkContext

    print("Spark session initialzed succesfully!")
    print(f"Spark version: {spark.version}")
    print(f"Spark UI available at: {sc.uiWebUrl}")

In [None]:
# Location of the raw Yelp review dump that this notebook cleans.
raw_path = "file:///data/bronze/yelp/raw/2025-11-13/yelp_academic_dataset_review.json"
if pyspark_available:
    bronze_rdd = sc.textFile(raw_path)
    print("Raw record count:", bronze_rdd.count())

    # take(1) yields an empty list when the input has no lines, so indexing
    # [0] unconditionally would raise IndexError on an empty file.
    first_lines = bronze_rdd.take(1)
    if first_lines:
        print("Sample line:", first_lines[0][:300])
    else:
        print("Sample line: <input is empty>")


In [None]:
def parse_json_safe(json_str):
    """Decode one raw line into a record dict tagged with ingestion metadata.

    Args:
        json_str: One line of input, expected to contain a single JSON object.

    Returns:
        dict: For a JSON object, the decoded dict with ``_ingestion_timestamp``
        (``time.time()``), ``_source='yelp_dataset'`` and ``_status='valid'``
        added. For anything else (undecodable input, or JSON that is not an
        object such as a list, number or ``null``), a placeholder dict holding
        the original input under ``_raw_data`` with ``_status='parse_error'``.

    Data problems never raise; only interpreter-level signals such as
    KeyboardInterrupt and SystemExit propagate.
    """
    try:
        data = json.loads(json_str)
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C / SystemExit
        # are not silently converted into "parse_error" records.
        data = None

    # Only JSON objects can carry the metadata keys; reject other JSON values
    # explicitly instead of relying on item assignment failing.
    if not isinstance(data, dict):
        return {
            '_raw_data': json_str,
            '_ingestion_timestamp': time.time(),
            '_source': 'yelp_dataset',
            '_status': 'parse_error'
        }

    # Add ingestion metadata
    data['_ingestion_timestamp'] = time.time()
    data['_source'] = 'yelp_dataset'
    data['_status'] = 'valid'
    return data

# Parse every raw line, then split the results on the `_status` tag.
if pyspark_available:
    parsed_rdd = bronze_rdd.map(parse_json_safe)
    valid_rdd = parsed_rdd.filter(lambda rec: rec["_status"] == "valid")
    invalid_rdd = parsed_rdd.filter(lambda rec: rec["_status"] == "parse_error")

In [None]:
# Report how many raw lines failed JSON parsing.
if pyspark_available:
    total = bronze_rdd.count()
    bad_count = invalid_rdd.count()
    # With an empty input the percentage would divide by zero; report 0.00% instead.
    bad_pct = bad_count / total * 100 if total else 0.0
    print(f"Malformed records: {bad_count}/{total} ({bad_pct:.2f}%)")


In [None]:
# Peek at one successfully parsed record.
# Guarded like the earlier cells so the notebook still runs top to bottom
# when PySpark is unavailable (valid_rdd is never defined in that case).
if pyspark_available:
    valid_head = valid_rdd.take(1)
    if valid_head:
        sample = valid_head[0]
        print(type(sample))
        print(sample)
    else:
        # take(1) returns [] when nothing parsed; avoid IndexError on [0].
        print("No valid records to sample")


In [None]:
# Show the Python type of a parsed record.
# Guarded because parsed_rdd only exists when PySpark is available; the loop
# body is skipped if take(1) returns an empty list (no IndexError).
if pyspark_available:
    for parsed_record in parsed_rdd.take(1):
        print(type(parsed_record))


In [None]:
def valid_review(r):
    """Return True when a parsed review record passes all quality rules.

    Rules, in order:
      * every required key is present;
      * ``text`` is a string containing at least one non-whitespace character;
      * ``stars`` converts to a number within the closed range 1..5;
      * ``useful``, ``funny`` and ``cool`` convert to non-negative integers;
      * ``date`` matches ``%Y-%m-%d`` or ``%Y-%m-%d %H:%M:%S``.

    Args:
        r: A record dict (mapping of field name to value).

    Returns:
        bool: False for any rule violation or conversion error; never raises
        for a dict input.
    """
    required = ("review_id", "user_id", "business_id", "stars", "date",
                "text", "useful", "funny", "cool")
    if not all(k in r for k in required):
        return False

    # A non-string text (e.g. a number) used to raise AttributeError on
    # .strip() outside the try block and fail the whole Spark task.
    text = r["text"]
    if not isinstance(text, str) or not text.strip():
        return False

    try:
        # Compare as float: int() truncation would let 5.9 pass as 5.
        # The chained comparison also rejects NaN.
        stars = float(r["stars"])
        if not 1 <= stars <= 5:
            return False

        if any(int(r[field]) < 0 for field in ("useful", "funny", "cool")):
            return False

        # Try both possible formats
        try:
            datetime.strptime(r["date"], "%Y-%m-%d")
        except ValueError:
            datetime.strptime(r["date"], "%Y-%m-%d %H:%M:%S")

    except Exception:
        # Any conversion failure (wrong type, unparseable value) means invalid.
        return False

    return True

    
# Apply the record-level rules to the successfully parsed records and report
# how many remain.
if pyspark_available:
    # valid_rdd_2 is consumed by the later deduplication cell.
    valid_rdd_2 = valid_rdd.filter(valid_review)
    print("Valid records:", valid_rdd_2.count())


In [None]:
# Deduplicate on review_id.
# Guarded like the earlier cells: valid_rdd_2 only exists when PySpark is available.
if pyspark_available:
    deduped = (
        valid_rdd_2
        .map(lambda r: (r["review_id"], r))   # key each record by its id
        # Keep one record per id. NOTE(review): which duplicate survives is
        # whatever reduceByKey merges first - presumably not stable across
        # runs; confirm that is acceptable.
        .reduceByKey(lambda a, b: a)
        .map(lambda kv: kv[1])                # drop the key again
    )

    print("After deduplication:", deduped.count())


In [None]:
# Matches every character that is neither a word character nor whitespace.
punct_re = re.compile(r'[^\w\s]')

# Input date layouts, tried in order; these are the two layouts valid_review accepts.
_REVIEW_DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S")


def normalize_review(r):
    """Build the normalized form of a validated review record.

    The text is lower-cased, stripped of punctuation and trimmed. The date is
    reduced to ``YYYY-MM-DD`` (any time-of-day component is dropped), or None
    when it is missing or matches neither accepted layout. ``stars`` is cast
    to float. The input dict is not modified.

    Args:
        r: Record dict with at least ``review_id``, ``user_id``,
            ``business_id``, ``stars`` and ``text``; ``date`` is optional.

    Returns:
        dict: A new dict with keys ``review_id``, ``user_id``,
        ``business_id``, ``stars``, ``text`` and ``date``.
    """
    # Lowercase and remove punctuation
    text = punct_re.sub('', r["text"].lower()).strip()

    # Normalize date to ISO format.
    # The previous code called time.datetime.strptime, which does not exist;
    # the resulting AttributeError was swallowed and every date became None.
    iso_date = None
    raw_date = r.get("date")
    for fmt in _REVIEW_DATE_FORMATS:
        try:
            iso_date = datetime.strptime(raw_date, fmt).date().isoformat()
            break
        except (TypeError, ValueError):
            # TypeError: non-string value; ValueError: layout mismatch.
            continue

    # Return normalized record
    return {
        "review_id": r["review_id"],
        "user_id": r["user_id"],
        "business_id": r["business_id"],
        "stars": float(r["stars"]),
        "text": text,
        "date": iso_date
    }

# Normalize every deduplicated review.
# Guarded like the earlier cells: deduped only exists when PySpark is available.
if pyspark_available:
    normalized = deduped.map(normalize_review)


In [None]:
from datetime import datetime, timezone

# Placeholder for records whose date could not be normalized: today's UTC date.
# datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
# formats to the same YYYY-MM-DD string.
# NOTE(review): stamping missing review dates with the run date makes them
# indistinguishable from real dates downstream - confirm this is intended.
fill_date = datetime.now(timezone.utc).strftime("%Y-%m-%d")

# Guarded like the earlier cells: normalized only exists when PySpark is available.
if pyspark_available:
    cleaned = normalized.map(lambda r: {**r, "date": r["date"] or fill_date})


In [None]:
# Destination directory for the cleaned reviews, written as JSON lines.
output_path = "/data/silver/yelp/cleaned_reviews_jsonlines"

# Guarded like the earlier cells: cleaned only exists when PySpark is available.
if pyspark_available:
    # One JSON document per output line.
    cleaned_json = cleaned.map(json.dumps)

    # NOTE(review): saveAsTextFile is expected to fail if output_path already
    # exists, so a re-run needs the directory removed first - confirm.
    cleaned_json.coalesce(8).saveAsTextFile(output_path)
    # The check mark was mojibake ("âœ…": UTF-8 bytes decoded as cp1252).
    print(f"✅ Cleaned data saved to {output_path}")


In [None]:
# Summary statistics over the cleaned reviews.
# Guarded like the earlier cells: cleaned only exists when PySpark is available.
if pyspark_available:
    total_cleaned = cleaned.count()
    # Word count per review = number of whitespace-separated tokens.
    avg_length = cleaned.map(lambda r: len(r["text"].split())).mean()
    # collect() returns (stars, count) pairs in no particular order; sort by
    # star value so the printed distribution is stable and readable.
    star_dist = sorted(
        cleaned.map(lambda r: (r["stars"], 1))
               .reduceByKey(lambda a, b: a + b)
               .collect()
    )

    print("Total cleaned reviews:", total_cleaned)
    print("Average review length (words):", avg_length)
    print("Stars distribution:", star_dist)
