In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from delta import *


In [2]:
# Register the Delta SQL extension and catalog on the session builder.
# spark.jars.packages is deliberately not set here: configure_spark_with_delta_pip()
# sets that option itself, using the Maven coordinate that matches the installed
# delta-spark pip package. The coordinate previously hard-coded here
# ("io.delta:delta-core_2.12:3.1.0") was overridden by that helper and named an
# artifact that was renamed to delta-spark as of Delta Lake 3.0.
builder = (
    SparkSession.builder
    .appName("Business to Silver")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
def load_business_data(spark, path):
    """Read business records from JSON using an explicit, fixed schema.

    Args:
        spark: SparkSession used to perform the read.
        path: Location of the JSON input.

    Returns:
        DataFrame with the columns declared below, all nullable.
    """
    # (column name, Spark type) pairs, in output column order.
    column_types = [
        ("business_id", StringType()),
        ("name", StringType()),
        ("address", StringType()),
        ("city", StringType()),
        ("state", StringType()),
        ("postal_code", StringType()),
        ("latitude", DoubleType()),
        ("longitude", DoubleType()),
        ("stars", DoubleType()),
        ("review_count", IntegerType()),
        ("is_open", IntegerType()),
        ("attributes", MapType(StringType(), StringType())),
        ("categories", StringType()),
        ("hours", MapType(StringType(), StringType())),
    ]
    schema = StructType(
        [StructField(field_name, field_type, True) for field_name, field_type in column_types]
    )
    return spark.read.json(path, schema=schema)

# Load the bronze-layer business file and preview the first five rows.
df = load_business_data(
    spark,
    "D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json",
)
df.show(n=5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|       Affton|   MO|      63123| 38.551126|  -90.335695|  3.0|          15|      1|{Business

In [4]:
def handle_missing_values(df):
    """Add median-imputed numeric columns and fill defaults for selected fields.

    The median imputation is written to the new columns ``stars_imputed`` and
    ``review_count_imputed``; the original ``stars`` and ``review_count``
    columns are left as they are. Nulls in ``is_open``, ``name``, ``city``,
    ``state`` and ``categories`` are replaced with fixed defaults.

    Args:
        df: Input DataFrame containing the columns named above.

    Returns:
        DataFrame with the two extra imputed columns and defaults applied.
    """
    numeric_inputs = ["stars", "review_count"]
    median_imputer = Imputer(
        strategy="median",
        inputCols=numeric_inputs,
        outputCols=[f"{name}_imputed" for name in numeric_inputs],
    )
    with_imputed = median_imputer.fit(df).transform(df)

    # Per-column replacement values for nulls.
    defaults = {
        "is_open": 0,
        "name": "Unknown",
        "city": "Unknown",
        "state": "Unknown",
        "categories": "Uncategorized",
    }
    return with_imputed.na.fill(defaults)

# Apply imputation and default filling, then preview five rows.
df = handle_missing_values(df)
df.show(n=5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+-------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|stars_imputed|review_count_imputed|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+-------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|          5.0|                   7|
|mpf3x-BjTdTEA3y

In [5]:
def remove_duplicates(df):
    """Keep a single row per ``business_id``.

    A preceding full-row ``dropDuplicates()`` is intentionally not performed:
    it would use every column as a comparison key, which Spark rejects for
    map-typed columns, and any exact duplicate rows share a ``business_id``
    and are therefore already collapsed by the keyed de-duplication below.

    Args:
        df: DataFrame with a ``business_id`` column.

    Returns:
        DataFrame with at most one row for each ``business_id`` value. Which
        of several rows with the same id is kept is not specified.
    """
    return df.dropDuplicates(["business_id"])

# De-duplicate using every column whose type is not map<string,string> as the key,
# then preview five rows.
df = df.dropDuplicates(
    subset=[
        field.name
        for field in df.schema.fields
        if field.dataType != MapType(StringType(), StringType())
    ]
)
df.show(n=5)

+--------------------+--------------------+--------------------+--------+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+--------------------+--------------------+-------------+--------------------+
|         business_id|                name|             address|    city|state|postal_code|     latitude|     longitude|stars|review_count|is_open|          attributes|          categories|               hours|stars_imputed|review_count_imputed|
+--------------------+--------------------+--------------------+--------+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+--------------------+--------------------+-------------+--------------------+
|2vUErjw9pnsSwR5fG...|Abington Neurolog...|1151 Old York Roa...|Abington|   PA|      19001|40.1162277794|-75.1205835432|  1.5|           7|      1|{BusinessAcceptsC...|Neurologist, Heal...|{Monday -> 0:0-0:...|          1.5|                   7|
|-gg96o-MxRQ7s_N

In [6]:
def standardize_data(df):
    """Normalise casing, surrounding whitespace and postal-code characters.

    ``name``, ``city`` and ``categories`` are lower-cased and trimmed,
    ``state`` is upper-cased and trimmed, and every character other than
    0-9 is stripped from ``postal_code``.

    Args:
        df: DataFrame containing the five columns named above.

    Returns:
        DataFrame with those columns replaced in place (same column order).
    """
    replacements = {
        "name": trim(lower(col("name"))),
        "city": trim(lower(col("city"))),
        "state": trim(upper(col("state"))),
        "categories": trim(lower(col("categories"))),
        "postal_code": regexp_replace("postal_code", "[^0-9]", ""),
    }
    for column_name, expression in replacements.items():
        df = df.withColumn(column_name, expression)
    return df

# Normalise text columns and postal codes, then preview five rows.
df = standardize_data(df)
df.show(n=5)

+--------------------+--------------------+--------------------+--------+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+--------------------+--------------------+-------------+--------------------+
|         business_id|                name|             address|    city|state|postal_code|     latitude|     longitude|stars|review_count|is_open|          attributes|          categories|               hours|stars_imputed|review_count_imputed|
+--------------------+--------------------+--------------------+--------+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+--------------------+--------------------+-------------+--------------------+
|2vUErjw9pnsSwR5fG...|abington neurolog...|1151 Old York Roa...|abington|   PA|      19001|40.1162277794|-75.1205835432|  1.5|           7|      1|{BusinessAcceptsC...|neurologist, heal...|{Monday -> 0:0-0:...|          1.5|                   7|
|-gg96o-MxRQ7s_N

In [7]:
def handle_outliers(df):
    """Drop rows whose ``stars`` or ``review_count`` lie outside 1.5 * IQR fences.

    Approximate first and third quartiles are computed for both columns; a row
    is kept only when both values fall within
    ``[q1 - 1.5 * iqr, q3 + 1.5 * iqr]`` for their column. Rows where either
    column is null do not satisfy the range predicate and are dropped too.

    If a quartile cannot be computed (the aggregate returns null, e.g. for an
    empty DataFrame or a column containing only nulls) the input is returned
    unchanged instead of failing on arithmetic with ``None``.

    Args:
        df: DataFrame with numeric ``stars`` and ``review_count`` columns.

    Returns:
        The filtered DataFrame, or ``df`` itself when no fences can be derived.
    """
    stats = df.select([
        percentile_approx("stars", 0.25).alias("q1_stars"),
        percentile_approx("stars", 0.75).alias("q3_stars"),
        percentile_approx("review_count", 0.25).alias("q1_reviews"),
        percentile_approx("review_count", 0.75).alias("q3_reviews")
    ]).collect()[0]

    # Null quartiles mean there is nothing to base the fences on.
    if any(value is None for value in stats):
        return df

    fences = (
        ("stars", stats["q1_stars"], stats["q3_stars"]),
        ("review_count", stats["q1_reviews"], stats["q3_reviews"]),
    )

    keep = None
    for column_name, q1, q3 in fences:
        iqr = q3 - q1
        in_range = col(column_name).between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
        keep = in_range if keep is None else (keep & in_range)

    return df.filter(keep)

# Filter out IQR outliers, then preview five rows.
df = handle_outliers(df)
df.show(n=5)

+--------------------+--------------------+--------------------+--------+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+--------------------+--------------------+-------------+--------------------+
|         business_id|                name|             address|    city|state|postal_code|     latitude|     longitude|stars|review_count|is_open|          attributes|          categories|               hours|stars_imputed|review_count_imputed|
+--------------------+--------------------+--------------------+--------+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+--------------------+--------------------+-------------+--------------------+
|2vUErjw9pnsSwR5fG...|abington neurolog...|1151 Old York Roa...|abington|   PA|      19001|40.1162277794|-75.1205835432|  1.5|           7|      1|{BusinessAcceptsC...|neurologist, heal...|{Monday -> 0:0-0:...|          1.5|                   7|
|-gg96o-MxRQ7s_N

In [8]:
def feature_engineering(df):
    """Derive categorical helper columns from the business attributes.

    Adds four columns:
      * ``categories_array`` - ``categories`` split on commas. The separator
        pattern also consumes whitespace around each comma, so a value such as
        ``"a, b"`` yields ``["a", "b"]`` rather than ``["a", " b"]``.
      * ``price_range`` - label for ``attributes["RestaurantsPriceRange2"]``
        ("1".."4"); anything else, including a missing key, is "Unknown".
      * ``business_status`` - "Active" when ``is_open`` equals 1, else "Closed".
      * ``rating_category`` - bucket of ``stars``; values below 3.0 and null
        values both end up as "Poor".

    Args:
        df: DataFrame with ``categories``, ``attributes``, ``is_open`` and
            ``stars`` columns.

    Returns:
        DataFrame with the four derived columns appended.
    """
    # Split on a regex so the blank after each comma is not kept in the elements.
    df = df.withColumn(
        "categories_array",
        split(trim(col("categories")), r"\s*,\s*"),
    )

    price_level = col("attributes.RestaurantsPriceRange2")
    df = df.withColumn("price_range",
        when(price_level == "1", "Low")
        .when(price_level == "2", "Medium")
        .when(price_level == "3", "High")
        .when(price_level == "4", "Very High")
        .otherwise("Unknown")
    )

    df = df.withColumn("business_status",
        when(col("is_open") == 1, "Active")
        .otherwise("Closed")
    )

    # Thresholds are checked from highest to lowest; the first match wins.
    df = df.withColumn("rating_category",
        when(col("stars") >= 4.5, "Excellent")
        .when(col("stars") >= 4.0, "Very Good")
        .when(col("stars") >= 3.5, "Good")
        .when(col("stars") >= 3.0, "Average")
        .otherwise("Poor")
    )
    return df

In [9]:
def validate_data(df):
    """Keep only rows that pass the range and format checks.

    A row is retained when ``stars`` is within 1..5, ``latitude`` within
    -90..90, ``longitude`` within -180..180, ``review_count`` is
    non-negative, ``business_id`` is non-empty and ``postal_code`` has
    exactly five characters.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        DataFrame restricted to rows satisfying every check.
    """
    range_checks = [
        col("stars").between(1, 5),
        col("latitude").between(-90, 90),
        col("longitude").between(-180, 180),
        col("review_count") >= 0,
    ]
    format_checks = [
        length(col("business_id")) > 0,
        length(col("postal_code")) == 5,
    ]

    # Applying the predicates one after another keeps exactly the rows for
    # which all of them are true.
    for predicate in range_checks + format_checks:
        df = df.filter(predicate)
    return df

# Drop rows failing the validation rules, then preview five rows.
df = validate_data(df)
df.show(n=5)

+--------------------+--------------------+--------------------+--------+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+--------------------+--------------------+-------------+--------------------+
|         business_id|                name|             address|    city|state|postal_code|     latitude|     longitude|stars|review_count|is_open|          attributes|          categories|               hours|stars_imputed|review_count_imputed|
+--------------------+--------------------+--------------------+--------+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+--------------------+--------------------+-------------+--------------------+
|2vUErjw9pnsSwR5fG...|abington neurolog...|1151 Old York Roa...|abington|   PA|      19001|40.1162277794|-75.1205835432|  1.5|           7|      1|{BusinessAcceptsC...|neurologist, heal...|{Monday -> 0:0-0:...|          1.5|                   7|
|-gg96o-MxRQ7s_N

In [10]:
def quality_checks(df):
    """Print null counts, a duplicate-row count and basic value statistics.

    Three reports are printed:
      * the number of nulls in every column,
      * the number of duplicate rows, comparing all columns that are not
        map-typed (map columns cannot serve as de-duplication keys, so they
        are left out of the comparison instead of making the check fail),
      * mean and standard deviation of ``stars`` and ``review_count``.

    Args:
        df: DataFrame with at least ``stars`` and ``review_count`` columns.

    Returns:
        The input DataFrame, unchanged.
    """
    # `sum` here is the Spark aggregate brought in by the functions star-import,
    # not the Python builtin.
    null_counts = df.select([sum(col(c).isNull().cast("int")).alias(c)
                             for c in df.columns])

    # Only top-level map columns are excluded from the comparison keys.
    comparable_columns = [
        field.name
        for field in df.schema.fields
        if field.dataType.typeName() != "map"
    ]
    if comparable_columns:
        duplicate_count = df.count() - df.dropDuplicates(comparable_columns).count()
    else:
        # Nothing to compare rows on, so no duplicates can be identified.
        duplicate_count = 0

    value_dist = df.select(
        mean("stars").alias("mean_stars"),
        stddev("stars").alias("stddev_stars"),
        mean("review_count").alias("mean_reviews"),
        stddev("review_count").alias("stddev_reviews")
    )

    print("Null Counts:")
    null_counts.show()
    print(f"\nDuplicate Count: {duplicate_count}")
    print("\nValue Distributions:")
    value_dist.show()
    return df

# De-duplicate rows, comparing every column except the map<string,string>
# ones ('attributes' and 'hours').
df_deduplicated = df.dropDuplicates(
    [
        field.name
        for field in df.schema.fields
        if field.dataType != MapType(StringType(), StringType())
    ]
)

# The quality-check step is currently disabled:
# df_checked = quality_checks(df_deduplicated.drop("attributes", "hours"))

# Continue the pipeline with the de-duplicated frame.
df = df_deduplicated

In [12]:
# Overwrite the silver-layer Delta table, partitioned by state.
(
    df.write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .partitionBy("state")
    .save("D:/Project/delta_lake/silver/business")
)

In [13]:
# Load the silver-layer business table back from Delta Lake.
business_df = spark.read.load(
    "D:/Project/delta_lake/silver/business",
    format="delta",
)


In [14]:
# Preview the stored rows (20 is the default row count for show()).
business_df.show(n=20)

# Print the column names and types of the stored table.
business_df.printSchema()


+--------------------+--------------------+--------------------+-----+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+-------------+--------------------+--------------------+--------------------+
|         business_id|                name|             address| city|state|postal_code|     latitude|     longitude|stars|review_count|is_open|          categories|stars_imputed|review_count_imputed|          attributes|               hours|
+--------------------+--------------------+--------------------+-----+-----+-----------+-------------+--------------+-----+------------+-------+--------------------+-------------+--------------------+--------------------+--------------------+
|WQYiPwm4iHCaJhyUl...| argosy casino alton|          1 Piasa St|alton|   IL|      62002|   38.8888296|-90.1868203282|  2.5|          20|      1|casinos, restaura...|          2.5|                  20|{GoodForDancing -...|{Monday -> 9:0-2:...|
|7AjGUHCmGHQfE9cxu...|      