In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from delta import *


In [None]:
# Build the Spark session with Delta Lake support.
#
# The Delta JAR coordinate is deliberately not set by hand:
# configure_spark_with_delta_pip() sets "spark.jars.packages" itself, using the
# artifact that matches the installed delta-spark pip package. A manual value
# would be overwritten, and the previous one ("delta-core" at 3.1.0) named the
# pre-3.0 artifact. Pass extra_packages=[...] to configure_spark_with_delta_pip
# if additional JARs are ever needed.
builder = (
    SparkSession.builder
    .appName("Tips to Silver")
    # Enable Delta's SQL extensions and register Delta as the session catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [None]:
def handle_missing_values(df):
    """Replace nulls in the tip columns with fixed placeholder values.

    Args:
        df: DataFrame carrying the `text`, `date` and `compliment_count` columns.

    Returns:
        A new DataFrame in which nulls are filled as follows:
        `text` -> "No tip text provided", `date` -> "1970-01-01",
        `compliment_count` -> 0.
    """
    # One placeholder per column; the date placeholder is a sentinel string,
    # so downstream date-derived columns will reflect it for filled rows.
    placeholders = {
        "text": "No tip text provided",
        "date": "1970-01-01",
        "compliment_count": 0,
    }
    return df.fillna(placeholders)

# NOTE(review): `df` is not assigned in any cell visible above this point, so
# this line depends on a load step that is not shown here — confirm the
# notebook survives Restart Kernel -> Run All.
df = handle_missing_values(df)
df.show(5)  # print the first 5 rows as a quick visual check


In [None]:
def standardize_data(df):
    """Normalise the text and date columns and derive calendar parts.

    Args:
        df: DataFrame with `text` and `date` columns.

    Returns:
        A new DataFrame where `text` is lower-cased and trimmed, `date` is
        converted with `to_timestamp` (no explicit format is supplied), and
        `year`, `month` and `day` columns are added from the converted date.
    """
    # Lower-case first, then strip surrounding whitespace.
    normalised = df.withColumn("text", trim(lower(col("text"))))

    # Convert `date` in place so the calendar parts below read the timestamp.
    result = normalised.withColumn("date", to_timestamp(col("date")))

    # Column name -> extractor, applied in this order to keep column order stable.
    calendar_parts = (("year", year), ("month", month), ("day", dayofmonth))
    for part_name, extract in calendar_parts:
        result = result.withColumn(part_name, extract(col("date")))

    return result

# Rebind `df` to the standardised frame; later cells read the derived
# `year` / `month` columns from it.
df = standardize_data(df)
df.show(5)  # print the first 5 rows as a quick visual check


In [None]:
def feature_engineering(df):
    """Add derived feature columns to the tips frame.

    Args:
        df: DataFrame with `text`, `date` and `compliment_count` columns.

    Returns:
        A new DataFrame with four extra columns:
        - `tip_length`: length of `text`.
        - `tip_age_days`: `datediff(current_date(), date)`, i.e. measured
          against the date on which the job runs.
        - `tip_popularity`: "High" (>= 10 compliments), "Medium" (>= 5),
          "Low" (>= 1), otherwise the string "None".
        - `day_of_week`: `date` formatted with the pattern "EEEE".
    """
    compliments = col("compliment_count")
    tip_date = col("date")

    # Branches are evaluated top-down, so the highest threshold must come first.
    # The fallback is the literal string "None", not a null.
    popularity = (
        when(compliments >= 10, "High")
        .when(compliments >= 5, "Medium")
        .when(compliments >= 1, "Low")
        .otherwise("None")
    )

    return (
        df.withColumn("tip_length", length(col("text")))
        .withColumn("tip_age_days", datediff(current_date(), tip_date))
        .withColumn("tip_popularity", popularity)
        .withColumn("day_of_week", date_format(tip_date, "EEEE"))
    )

# Rebind `df` to the frame with the derived feature columns added.
df = feature_engineering(df)
df.show(5)  # print the first 5 rows as a quick visual check


In [None]:
def validate_data(df):
    """Keep only rows that pass the basic validity rules.

    A row is kept when all of the following hold:
    `compliment_count` >= 0, and `text`, `business_id` and `user_id` each
    have a length greater than zero.

    Args:
        df: DataFrame with `compliment_count`, `text`, `business_id` and
            `user_id` columns.

    Returns:
        A new DataFrame containing only the rows that satisfy every rule.
    """
    rules = [
        col("compliment_count") >= 0,
        length(col("text")) > 0,
        length(col("business_id")) > 0,
        length(col("user_id")) > 0,
    ]

    # AND the rules together; a single filter on the conjunction keeps the
    # same rows as applying the rules in consecutive filters.
    all_rules = rules[0]
    for rule in rules[1:]:
        all_rules = all_rules & rule

    return df.filter(all_rules)

# Rebind `df` to the validated subset of rows.
df = validate_data(df)
df.show(5)  # print the first 5 rows as a quick visual check


In [None]:
def quality_checks(df):
    """Print a data-quality report for `df` and return `df` unchanged.

    The report contains:
    - the number of nulls in every column,
    - the number of fully duplicated rows (total rows minus distinct rows),
    - mean / standard deviation of `compliment_count` and the means of
      `tip_length` and `tip_age_days`.

    Args:
        df: DataFrame with `compliment_count`, `tip_length` and
            `tip_age_days` columns.

    Returns:
        The same DataFrame that was passed in, so the call can be chained.
    """
    # Spark functions are referenced through the `F` alias on purpose: the bare
    # name `sum` only resolves to Spark's aggregate when the wildcard import
    # from pyspark.sql.functions shadows Python's builtin. With the builtin in
    # scope, `sum(<Column>)` raises because a Column is not iterable.
    null_counts = df.select(
        [F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns]
    )

    # Both counts run a Spark job over the frame.
    total_rows = df.count()
    duplicate_count = total_rows - df.dropDuplicates().count()

    value_dist = df.select(
        F.mean("compliment_count").alias("mean_compliments"),
        F.stddev("compliment_count").alias("stddev_compliments"),
        F.mean("tip_length").alias("mean_tip_length"),
        F.mean("tip_age_days").alias("mean_tip_age"),
    )

    print("Null Counts:")
    null_counts.show()
    print(f"\nDuplicate Count: {duplicate_count}")
    print("\nValue Distributions:")
    value_dist.show()

    return df

# quality_checks prints its report and returns the frame it was given,
# so this rebinding leaves `df` pointing at the same data.
df = quality_checks(df)


In [None]:
# Persist the processed tips as a Delta table, replacing any existing data at
# the target and laying files out by the derived `year` / `month` columns.
# NOTE(review): the target is an absolute, machine-specific path — consider
# moving it to a configurable constant before sharing the notebook.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("year", "month")
    .save("D:/Project/delta_lake/silver/tips")
)
