In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from delta import *

In [2]:
# Build a Delta-enabled Spark session for the "Reviews to Silver" job.
#
# "spark.jars.packages" is deliberately not set here. configure_spark_with_delta_pip
# sets that key itself, using the Maven coordinate that matches the installed
# delta-spark pip package, so any value given on the builder is overwritten.
# The coordinate previously listed (io.delta:delta-core_2.12:3.1.0) was also
# wrong: from Delta 3.0 onward the artifact is published as delta-spark.
builder = (
    SparkSession.builder
    .appName("Reviews to Silver")
    # Enable Delta SQL commands and route the default catalog through Delta.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
def load_reviews_data(spark, path):
    """Read the review JSON at ``path`` into a DataFrame using a fixed schema.

    Args:
        spark: SparkSession whose reader is used.
        path: Location of the JSON input handed to the reader.

    Returns:
        DataFrame with the nine review columns listed below; every column is
        declared nullable.
    """
    # (column name, Spark type) pairs. ``date`` is loaded as a plain string.
    review_fields = [
        ("review_id", StringType()),
        ("user_id", StringType()),
        ("business_id", StringType()),
        ("stars", DoubleType()),
        ("useful", IntegerType()),
        ("funny", IntegerType()),
        ("cool", IntegerType()),
        ("text", StringType()),
        ("date", StringType()),
    ]
    review_schema = StructType(
        [StructField(field_name, field_type, True) for field_name, field_type in review_fields]
    )
    return spark.read.schema(review_schema).json(path)

# Load the bronze review file and preview the first five rows.
df = load_reviews_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_review.json")
df.show(5)

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|     1|    0|   1|Wow!  Yummy, diff...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|     1|    0|   1|Cu

In [4]:
def handle_missing_values(df):
    """Fill missing values in the review columns.

    The numeric columns ``stars``, ``useful``, ``funny`` and ``cool`` are
    imputed with their column median. The imputed values are kept in the
    ``<column>_imputed`` columns and are also written back into the original
    columns, because the later cells (feature engineering, validation, quality
    checks) read the original column names. Without the write-back the
    imputation would never reach them.

    Missing ``text`` is replaced with ``"No review text provided"`` and missing
    ``date`` with ``"1970-01-01"``.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        DataFrame with the four ``*_imputed`` columns added and the missing
        values in the original columns filled.
    """
    numeric_cols = ["stars", "useful", "funny", "cool"]
    imputed_cols = [f"{name}_imputed" for name in numeric_cols]

    imputer = Imputer(
        inputCols=numeric_cols,
        outputCols=imputed_cols
    ).setStrategy("median")

    df = imputer.fit(df).transform(df)

    # Copy the imputed values over the raw columns. An imputed column equals
    # the raw value wherever one was present, so only missing entries change.
    # The cast keeps each raw column's declared type.
    for raw_name, imputed_name in zip(numeric_cols, imputed_cols):
        raw_type = df.schema[raw_name].dataType
        df = df.withColumn(raw_name, F.col(imputed_name).cast(raw_type))

    df = df.na.fill({
        "text": "No review text provided",
        "date": "1970-01-01"
    })
    return df

df = handle_missing_values(df)
df.show(5)

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+-------------+--------------+-------------+------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|stars_imputed|useful_imputed|funny_imputed|cool_imputed|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+-------------+--------------+-------------+------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|          3.0|             0|            0|           0|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|          5.0|             1|            0|           1|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|

In [5]:
def standardize_data(df):
    """Normalise the review text and turn ``date`` into a timestamp.

    ``text`` is lower-cased and trimmed, ``date`` is converted with
    ``to_timestamp`` (no explicit format), and ``year``, ``month`` and ``day``
    columns are derived from the converted ``date``.

    Args:
        df: DataFrame with ``text`` and ``date`` columns.

    Returns:
        DataFrame with ``text`` and ``date`` replaced and the three date-part
        columns added.
    """
    normalised_text = F.trim(F.lower(F.col("text")))
    result = df.withColumn("text", normalised_text)
    result = result.withColumn("date", F.to_timestamp(F.col("date")))

    # The date parts must be derived after the conversion above.
    date_parts = (("year", F.year), ("month", F.month), ("day", F.dayofmonth))
    for part_name, extract in date_parts:
        result = result.withColumn(part_name, extract(F.col("date")))
    return result

df = standardize_data(df)
df.show(5)

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+-------------+--------------+-------------+------------+----+-----+---+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|stars_imputed|useful_imputed|funny_imputed|cool_imputed|year|month|day|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+-------------+--------------+-------------+------------+----+-----+---+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|if you decide to ...|2018-07-07 22:09:11|          3.0|             0|            0|           0|2018|    7|  7|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|i've taken a lot ...|2012-01-03 15:28:18|          5.0|             1|            0|           1|2012|    1|

In [6]:
def feature_engineering(df):
    """Add derived columns to the review DataFrame.

    Columns added:
        text_length: length of ``text``.
        total_votes: ``useful + funny + cool``.
        rating_category: label chosen from ``stars`` by the first matching
            lower bound in the band table below, else "Below Average".
        review_age_days: ``datediff`` between the current date and ``date``.

    Args:
        df: DataFrame with ``text``, ``useful``, ``funny``, ``cool``,
            ``stars`` and ``date`` columns.

    Returns:
        DataFrame with the four columns above appended.
    """
    stars = F.col("stars")

    # (inclusive lower bound, label), checked from highest to lowest.
    rating_bands = [
        (4.5, "Excellent"),
        (4.0, "Very Good"),
        (3.5, "Good"),
        (3.0, "Average"),
    ]
    top_bound, top_label = rating_bands[0]
    category = F.when(stars >= top_bound, top_label)
    for lower_bound, label in rating_bands[1:]:
        category = category.when(stars >= lower_bound, label)
    category = category.otherwise("Below Average")

    votes = F.col("useful") + F.col("funny") + F.col("cool")
    age_in_days = F.datediff(F.current_date(), F.col("date"))

    enriched = df.withColumn("text_length", F.length(F.col("text")))
    enriched = enriched.withColumn("total_votes", votes)
    enriched = enriched.withColumn("rating_category", category)
    enriched = enriched.withColumn("review_age_days", age_in_days)
    return enriched

df = feature_engineering(df)
df.show(5)

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+-------------+--------------+-------------+------------+----+-----+---+-----------+-----------+---------------+---------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|stars_imputed|useful_imputed|funny_imputed|cool_imputed|year|month|day|text_length|total_votes|rating_category|review_age_days|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+-------------+--------------+-------------+------------+----+-----+---+-----------+-----------+---------------+---------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|if you decide to ...|2018-07-07 22:09:11|          3.0|             0|            0|           0|2018|    7|  7|        513|          0|     

In [7]:
def validate_data(df):
    """Keep only rows that satisfy every value and identifier check.

    A row is kept when ``stars`` is between 1 and 5, the three vote counts are
    non-negative, and ``text``, ``review_id``, ``business_id`` and ``user_id``
    all have a length greater than zero.

    Args:
        df: DataFrame with the columns named above.

    Returns:
        The filtered DataFrame.
    """
    checks = [
        # Value ranges.
        F.col("stars").between(1, 5),
        F.col("useful") >= 0,
        F.col("funny") >= 0,
        F.col("cool") >= 0,
        F.length(F.col("text")) > 0,
        # Identifiers must be present.
        F.length(F.col("review_id")) > 0,
        F.length(F.col("business_id")) > 0,
        F.length(F.col("user_id")) > 0,
    ]

    # AND all checks into a single predicate.
    passes_all = checks[0]
    for check in checks[1:]:
        passes_all = passes_all & check

    return df.filter(passes_all)

df = validate_data(df)
df.show(5)

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+-------------+--------------+-------------+------------+----+-----+---+-----------+-----------+---------------+---------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|stars_imputed|useful_imputed|funny_imputed|cool_imputed|year|month|day|text_length|total_votes|rating_category|review_age_days|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+-------------+--------------+-------------+------------+----+-----+---+-----------+-----------+---------------+---------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|if you decide to ...|2018-07-07 22:09:11|          3.0|             0|            0|           0|2018|    7|  7|        513|          0|     

In [8]:
def quality_checks(df):
    """Print basic data-quality diagnostics and return ``df`` unchanged.

    Three things are printed: the null count of every column, the number of
    rows removed by ``dropDuplicates()`` (comparison across all columns), and
    the mean of ``stars``, ``useful``, ``funny`` and ``cool`` together with the
    standard deviation of ``stars``.

    Args:
        df: DataFrame to inspect.

    Returns:
        The same DataFrame that was passed in.
    """
    # One aggregate per column: isNull cast to 0/1, then summed.
    null_count_exprs = [
        F.sum(F.col(name).isNull().cast("int")).alias(name)
        for name in df.columns
    ]
    null_counts = df.select(null_count_exprs)

    total_rows = df.count()
    distinct_rows = df.dropDuplicates().count()
    duplicate_count = total_rows - distinct_rows

    value_dist = df.select(
        F.mean("stars").alias("mean_stars"),
        F.stddev("stars").alias("stddev_stars"),
        F.mean("useful").alias("mean_useful"),
        F.mean("funny").alias("mean_funny"),
        F.mean("cool").alias("mean_cool"),
    )

    print("Null Counts:")
    null_counts.show()
    print(f"\nDuplicate Count: {duplicate_count}")
    print("\nValue Distributions:")
    value_dist.show()

    return df

df = quality_checks(df)

Null Counts:
+---------+-------+-----------+-----+------+-----+----+----+----+-------------+--------------+-------------+------------+----+-----+---+-----------+-----------+---------------+---------------+
|review_id|user_id|business_id|stars|useful|funny|cool|text|date|stars_imputed|useful_imputed|funny_imputed|cool_imputed|year|month|day|text_length|total_votes|rating_category|review_age_days|
+---------+-------+-----------+-----+------+-----+----+----+----+-------------+--------------+-------------+------------+----+-----+---+-----------+-----------+---------------+---------------+
|        0|      0|          0|    0|     0|    0|   0|   0|   0|            0|             0|            0|           0|   0|    0|  0|          0|          0|              0|              0|
+---------+-------+-----------+-----+------+-----+----+----+----+-------------+--------------+-------------+------------+----+-----+---+-----------+-----------+---------------+---------------+


Duplicate Count: 0



In [9]:
# Write the processed reviews to the silver location as a Delta table,
# overwriting what is there and partitioning the files by year and month.
(
    df.write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .partitionBy("year", "month")
    .save("D:/Project/delta_lake/silver/reviews")
)