In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from delta import *


In [2]:
# Build the Delta-enabled Spark session used by the rest of the notebook.
#
# "spark.jars.packages" is intentionally NOT set by hand here:
# configure_spark_with_delta_pip() assigns that key itself, using the Delta
# artifact that matches the pip-installed `delta` package, so any value set on
# the builder beforehand is overwritten. A hand-written coordinate such as
# "io.delta:delta-core_2.12:3.1.0" is therefore dead configuration (and the
# `delta-core` artifact name is not published for Delta 3.x, where it was
# renamed to `delta-spark`).
builder = (
    SparkSession.builder
    .appName("Tips to Silver")
    # Register Delta's SQL extension and catalog implementation.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [3]:
def load_tips_data(spark, path):
    """Read the tips JSON at ``path`` into a DataFrame with a fixed schema.

    Args:
        spark: SparkSession whose reader is used.
        path: Location of the JSON input handed to the reader.

    Returns:
        DataFrame with the nullable string columns ``user_id``,
        ``business_id``, ``text`` and ``date`` plus the nullable integer
        column ``compliment_count``.
    """
    string_columns = ("user_id", "business_id", "text", "date")
    fields = [StructField(name, StringType(), True) for name in string_columns]
    fields.append(StructField("compliment_count", IntegerType(), True))
    tips_schema = StructType(fields)
    # `date` is deliberately read as a plain string at this stage.
    return spark.read.schema(tips_schema).json(path)

# Load the bronze tips file and preview the first rows.
df = load_tips_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_tip.json")
df.show(n=5)


+--------------------+--------------------+--------------------+-------------------+----------------+
|             user_id|         business_id|                text|               date|compliment_count|
+--------------------+--------------------+--------------------+-------------------+----------------+
|AGNUgVwnZUey3gcPC...|3uLgwr0qeCNMjKenH...|Avengers time wit...|2012-05-18 02:17:21|               0|
|NBN4MgHP9D3cw--Sn...|QoezRbYQncpRqyrLH...|They have lots of...|2013-02-05 18:35:10|               0|
|-copOvldyKh1qr-vz...|MYoRNLb5chwjQe3c_...|It's open even wh...|2013-08-18 00:56:08|               0|
|FjMQVZjSqY8syIO-5...|hV-bABTK-glh5wj31...|Very decent fried...|2017-06-27 23:05:38|               0|
|ld0AperBXk1h6Ubqm...|_uN0OudeJ3Zl_tf6n...|Appetizers.. plat...|2012-10-06 19:43:09|               0|
+--------------------+--------------------+--------------------+-------------------+----------------+
only showing top 5 rows



+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|     1|    0|   1|Wow!  Yummy, diff...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|     1|    0|   1|Cu

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|     1|    0|   1|Wow!  Yummy, diff...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|     1|    0|   1|Cu

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|     1|    0|   1|Wow!  Yummy, diff...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|     1|    0|   1|Cu

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|     1|    0|   1|Wow!  Yummy, diff...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|     1|    0|   1|Cu

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|     1|    0|   1|Wow!  Yummy, diff...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|     1|    0|   1|Cu

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|     1|    0|   1|Wow!  Yummy, diff...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|     1|    0|   1|Cu

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|     1|    0|   1|Wow!  Yummy, diff...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|     1|    0|   1|Cu

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|               date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+-------------------+
|KU_O5udG6zpxOg-Vc...|mh_-eMZ6K5RLWhZyI...|XQfwVwDr-v0ZS3_Cb...|  3.0|     0|    0|   0|If you decide to ...|2018-07-07 22:09:11|
|BiTunyQ73aT9WBnpR...|OyoGAe7OKpv6SyGZT...|7ATYjTIgM3jUlt4UM...|  5.0|     1|    0|   1|I've taken a lot ...|2012-01-03 15:28:18|
|saUsX_uimxRlCVr67...|8g_iMtfSiwikVnbP2...|YjUWPpI6HXG530lwP...|  3.0|     0|    0|   0|Family diner. Had...|2014-02-05 20:30:30|
|AqPFMleE6RsU23_au...|_7bHUi9Uuf5__HHc_...|kxX2SOes4o-D3ZQBk...|  5.0|     1|    0|   1|Wow!  Yummy, diff...|2015-01-04 00:01:03|
|Sx8TMOWLNuJBWer-0...|bcjbaE6dDog4jkNY9...|e4Vwtrqf-wpJfwesg...|  4.0|     1|    0|   1|Cu

In [4]:
def handle_missing_values(df):
    """Fill nulls in ``text``, ``date`` and ``compliment_count`` with defaults.

    Args:
        df: DataFrame containing the columns ``text``, ``date`` and
            ``compliment_count``.

    Returns:
        A new DataFrame in which nulls of those three columns are replaced
        by the placeholder values below; no other column is listed in the
        fill mapping.
    """
    replacements = dict(
        text="No tip text provided",
        date="1970-01-01",  # placeholder date string for missing dates
        compliment_count=0,
    )
    return df.fillna(replacements)

# Apply the null replacement and preview the result.
df = handle_missing_values(df)
df.show(n=5)


+--------------------+--------------------+--------------------+-------------------+----------------+
|             user_id|         business_id|                text|               date|compliment_count|
+--------------------+--------------------+--------------------+-------------------+----------------+
|AGNUgVwnZUey3gcPC...|3uLgwr0qeCNMjKenH...|Avengers time wit...|2012-05-18 02:17:21|               0|
|NBN4MgHP9D3cw--Sn...|QoezRbYQncpRqyrLH...|They have lots of...|2013-02-05 18:35:10|               0|
|-copOvldyKh1qr-vz...|MYoRNLb5chwjQe3c_...|It's open even wh...|2013-08-18 00:56:08|               0|
|FjMQVZjSqY8syIO-5...|hV-bABTK-glh5wj31...|Very decent fried...|2017-06-27 23:05:38|               0|
|ld0AperBXk1h6Ubqm...|_uN0OudeJ3Zl_tf6n...|Appetizers.. plat...|2012-10-06 19:43:09|               0|
+--------------------+--------------------+--------------------+-------------------+----------------+
only showing top 5 rows



In [5]:
def standardize_data(df):
    """Normalise the text and date columns and add calendar-part columns.

    Args:
        df: DataFrame with a ``text`` column and a ``date`` column.

    Returns:
        DataFrame where ``text`` is lower-cased and trimmed, ``date`` is
        converted with ``to_timestamp``, and ``year``, ``month`` and ``day``
        columns are derived from the converted ``date``.
    """
    cleaned_text = F.trim(F.lower(F.col("text")))
    parsed_date = F.to_timestamp(F.col("date"))
    result = df.withColumn("text", cleaned_text).withColumn("date", parsed_date)

    # Each calendar part is read from the already-converted `date` column.
    calendar_parts = (("year", F.year), ("month", F.month), ("day", F.dayofmonth))
    for column_name, extract in calendar_parts:
        result = result.withColumn(column_name, extract(F.col("date")))
    return result

# Standardise the frame and preview the result.
df = standardize_data(df)
df.show(n=5)


+--------------------+--------------------+--------------------+-------------------+----------------+----+-----+---+
|             user_id|         business_id|                text|               date|compliment_count|year|month|day|
+--------------------+--------------------+--------------------+-------------------+----------------+----+-----+---+
|AGNUgVwnZUey3gcPC...|3uLgwr0qeCNMjKenH...|avengers time wit...|2012-05-18 02:17:21|               0|2012|    5| 18|
|NBN4MgHP9D3cw--Sn...|QoezRbYQncpRqyrLH...|they have lots of...|2013-02-05 18:35:10|               0|2013|    2|  5|
|-copOvldyKh1qr-vz...|MYoRNLb5chwjQe3c_...|it's open even wh...|2013-08-18 00:56:08|               0|2013|    8| 18|
|FjMQVZjSqY8syIO-5...|hV-bABTK-glh5wj31...|very decent fried...|2017-06-27 23:05:38|               0|2017|    6| 27|
|ld0AperBXk1h6Ubqm...|_uN0OudeJ3Zl_tf6n...|appetizers.. plat...|2012-10-06 19:43:09|               0|2012|   10|  6|
+--------------------+--------------------+--------------------+

In [6]:
def feature_engineering(df, reference_date=None):
    """Add derived tip features to the DataFrame.

    Columns added:
        tip_length:     character length of ``text``.
        tip_age_days:   days between the reference date and ``date``.
        tip_popularity: "High" (>= 10), "Medium" (>= 5), "Low" (>= 1) or
                        "None", based on ``compliment_count``.
        day_of_week:    ``date`` formatted with the pattern "EEEE".

    Args:
        df: DataFrame with ``text``, ``date`` and ``compliment_count``
            columns.
        reference_date: Optional date that ``tip_age_days`` is measured
            against (for example ``"2024-01-01"``, a ``datetime.date`` or a
            Column). When omitted, ``current_date()`` is used, which is the
            original behaviour. Passing a fixed value keeps the stored
            ``tip_age_days`` reproducible across re-runs.

    Returns:
        DataFrame with the four feature columns appended.
    """
    # Resolve the "as of" date once so the age calculation can be pinned.
    if reference_date is None:
        as_of = current_date()
    else:
        as_of = to_date(lit(reference_date))

    df = df.withColumn("tip_length", length(col("text")))

    df = df.withColumn("tip_age_days", datediff(as_of, col("date")))

    # Thresholds are checked from highest to lowest; the first match wins.
    df = df.withColumn("tip_popularity",
        when(col("compliment_count") >= 10, "High")
        .when(col("compliment_count") >= 5, "Medium")
        .when(col("compliment_count") >= 1, "Low")
        .otherwise("None"))

    df = df.withColumn("day_of_week", date_format(col("date"), "EEEE"))

    return df

df = feature_engineering(df)
df.show(5)


+--------------------+--------------------+--------------------+-------------------+----------------+----+-----+---+----------+------------+--------------+-----------+
|             user_id|         business_id|                text|               date|compliment_count|year|month|day|tip_length|tip_age_days|tip_popularity|day_of_week|
+--------------------+--------------------+--------------------+-------------------+----------------+----+-----+---+----------+------------+--------------+-----------+
|AGNUgVwnZUey3gcPC...|3uLgwr0qeCNMjKenH...|avengers time wit...|2012-05-18 02:17:21|               0|2012|    5| 18|        30|        4668|          None|     Friday|
|NBN4MgHP9D3cw--Sn...|QoezRbYQncpRqyrLH...|they have lots of...|2013-02-05 18:35:10|               0|2013|    2|  5|        57|        4405|          None|    Tuesday|
|-copOvldyKh1qr-vz...|MYoRNLb5chwjQe3c_...|it's open even wh...|2013-08-18 00:56:08|               0|2013|    8| 18|        38|        4211|          None|     

In [7]:
def validate_data(df):
    """Keep only rows that pass the basic validity checks.

    A row is kept when ``compliment_count`` is non-negative and ``text``,
    ``business_id`` and ``user_id`` each have a length greater than zero.

    Args:
        df: DataFrame with ``compliment_count``, ``text``, ``business_id``
            and ``user_id`` columns.

    Returns:
        The filtered DataFrame.
    """
    keep_row = col("compliment_count") >= 0
    # Every listed column must be non-empty for the row to survive.
    for required in ("text", "business_id", "user_id"):
        keep_row = keep_row & (length(col(required)) > 0)
    return df.filter(keep_row)

# Drop invalid rows and preview the result.
df = validate_data(df)
df.show(n=5)


+--------------------+--------------------+--------------------+-------------------+----------------+----+-----+---+----------+------------+--------------+-----------+
|             user_id|         business_id|                text|               date|compliment_count|year|month|day|tip_length|tip_age_days|tip_popularity|day_of_week|
+--------------------+--------------------+--------------------+-------------------+----------------+----+-----+---+----------+------------+--------------+-----------+
|AGNUgVwnZUey3gcPC...|3uLgwr0qeCNMjKenH...|avengers time wit...|2012-05-18 02:17:21|               0|2012|    5| 18|        30|        4668|          None|     Friday|
|NBN4MgHP9D3cw--Sn...|QoezRbYQncpRqyrLH...|they have lots of...|2013-02-05 18:35:10|               0|2013|    2|  5|        57|        4405|          None|    Tuesday|
|-copOvldyKh1qr-vz...|MYoRNLb5chwjQe3c_...|it's open even wh...|2013-08-18 00:56:08|               0|2013|    8| 18|        38|        4211|          None|     

In [8]:
def quality_checks(df):
    """Print data-quality diagnostics and return the DataFrame unchanged.

    Printed, in order: per-column null counts, the number of fully
    duplicated rows, and the mean/stddev of ``compliment_count`` together
    with the means of ``tip_length`` and ``tip_age_days``.

    Args:
        df: DataFrame containing ``compliment_count``, ``tip_length`` and
            ``tip_age_days`` columns.

    Returns:
        The same ``df`` that was passed in.
    """
    # One aggregate per column: count of rows where that column is null.
    null_exprs = [
        F.sum(F.col(name).isNull().cast("int")).alias(name)
        for name in df.columns
    ]
    null_counts = df.select(null_exprs)

    # Rows lost by dropDuplicates() are the exact duplicates.
    total_rows = df.count()
    distinct_rows = df.dropDuplicates().count()
    duplicate_count = total_rows - distinct_rows

    value_dist = df.select(
        F.mean("compliment_count").alias("mean_compliments"),
        F.stddev("compliment_count").alias("stddev_compliments"),
        F.mean("tip_length").alias("mean_tip_length"),
        F.mean("tip_age_days").alias("mean_tip_age"),
    )

    print("Null Counts:")
    null_counts.show()
    print(f"\nDuplicate Count: {duplicate_count}")
    print("\nValue Distributions:")
    value_dist.show()

    return df

# Run the diagnostics; the frame itself is passed through untouched.
df = quality_checks(df)


Null Counts:
+-------+-----------+----+----+----------------+----+-----+---+----------+------------+--------------+-----------+
|user_id|business_id|text|date|compliment_count|year|month|day|tip_length|tip_age_days|tip_popularity|day_of_week|
+-------+-----------+----+----+----------------+----+-----+---+----------+------------+--------------+-----------+
|      0|          0|   0|   0|               0|   0|    0|  0|         0|           0|             0|          0|
+-------+-----------+----+----+----------------+----+-----+---+----------+------------+--------------+-----------+


Duplicate Count: 67

Value Distributions:
+--------------------+-------------------+-----------------+------------------+
|    mean_compliments| stddev_compliments|  mean_tip_length|      mean_tip_age|
+--------------------+-------------------+-----------------+------------------+
|0.012524823553357574|0.12076339327984317|62.58009494837251|3546.1379193874013|
+--------------------+-------------------+------

In [9]:
# Write the processed tips as a Delta table partitioned by year and month,
# overwriting any data already at the target path.
# NOTE(review): the duplicate rows reported by quality_checks are written
# as-is -- confirm that is intended for the silver layer.
silver_writer = (
    df.write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .partitionBy("year", "month")
)
silver_writer.save("D:/Project/delta_lake/silver/tips")
