In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from delta import *

In [None]:
# Build the Spark session with the Delta Lake SQL extension and catalog enabled.
#
# "spark.jars.packages" is intentionally not set here:
# configure_spark_with_delta_pip() sets that option itself, using the Maven
# coordinates that match the installed Delta package, so a value given on the
# builder would simply be overwritten. A hard-coded coordinate is also easy to
# get wrong (for Delta 3.x the artifact is delta-spark, not delta-core).
builder = (
    SparkSession.builder
    .appName("Checkins to Silver")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
def load_checkin_data(spark, path):
    """Read check-in JSON data from ``path`` using a fixed two-column schema.

    Args:
        spark: SparkSession used to perform the read.
        path: Location of the JSON data to load.

    Returns:
        DataFrame with two nullable string columns, ``business_id`` and ``date``.
    """
    # Supplying the schema up front means Spark does not have to infer it.
    checkin_schema = (
        StructType()
        .add("business_id", StringType(), True)
        .add("date", StringType(), True)
    )
    return spark.read.schema(checkin_schema).json(path)

# Load the check-in JSON from the bronze folder and preview the first 5 rows.
df = load_checkin_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_checkin.json")
df.show(5)

In [None]:
def transform_dates(df):
    """Expand the ``date`` string into one row per check-in with calendar parts.

    The ``date`` column is split on ``", "`` and exploded so that every entry
    becomes its own row, parsed into ``checkin_timestamp``. The columns
    ``year``, ``month``, ``day``, ``hour`` and ``day_of_week`` (``date_format``
    pattern ``"EEEE"``) are derived from that timestamp.

    Args:
        df: DataFrame containing a string column ``date``.

    Returns:
        DataFrame without ``date`` / ``date_array`` and with the timestamp and
        derived calendar columns appended.
    """
    # String -> array of timestamp strings -> one row per array element.
    expanded = df.withColumn("date_array", F.split(F.col("date"), ", "))
    expanded = expanded.withColumn("checkin_timestamp", F.explode(F.col("date_array")))

    # Replace the exploded string with a real timestamp before deriving parts.
    expanded = expanded.withColumn(
        "checkin_timestamp", F.to_timestamp(F.col("checkin_timestamp"))
    )

    ts = F.col("checkin_timestamp")
    calendar_parts = {
        "year": F.year,
        "month": F.month,
        "day": F.dayofmonth,
        "hour": F.hour,
    }
    for part_name, extract in calendar_parts.items():
        expanded = expanded.withColumn(part_name, extract(ts))

    expanded = expanded.withColumn("day_of_week", F.date_format(ts, "EEEE"))

    return expanded.drop("date_array", "date")

# Replace df with the one-row-per-check-in version and preview the first 5 rows.
df = transform_dates(df)
df.show(5)


In [None]:
def feature_engineering(df):
    """Add time-of-day, weekend and per-day check-in count features.

    Args:
        df: DataFrame with ``business_id``, ``year``, ``month``, ``day``,
            ``hour`` and ``day_of_week`` columns.

    Returns:
        The input DataFrame with three extra columns:

        * ``time_period`` - "Morning" (6-11), "Afternoon" (12-16),
          "Evening" (17-21), otherwise "Night". Rows whose ``hour`` is null
          match none of the conditions and therefore also get "Night".
        * ``is_weekend`` - True when ``day_of_week`` is "Saturday" or
          "Sunday", False otherwise (including null).
        * ``daily_checkins`` - number of rows sharing the same
          ``business_id``, ``year``, ``month`` and ``day``.
    """
    # `Window` is not provided by any of the notebook's top-level imports
    # (the star imports do not export it), so import it here; without this
    # the function fails with a NameError.
    from pyspark.sql import Window

    hour_of_day = F.col("hour")

    # Time period categorization
    df = df.withColumn(
        "time_period",
        F.when((hour_of_day >= 6) & (hour_of_day < 12), "Morning")
        .when((hour_of_day >= 12) & (hour_of_day < 17), "Afternoon")
        .when((hour_of_day >= 17) & (hour_of_day < 22), "Evening")
        .otherwise("Night"),
    )

    # Weekend flag; the when/otherwise wrapper keeps the result non-null.
    df = df.withColumn(
        "is_weekend",
        F.when(F.col("day_of_week").isin(["Saturday", "Sunday"]), True).otherwise(False),
    )

    # Business checkin frequency: row count per business per calendar day.
    window_daily = Window.partitionBy("business_id", "year", "month", "day")
    df = df.withColumn("daily_checkins", F.count("*").over(window_daily))

    return df

# Add the engineered feature columns to df and preview the first 5 rows.
df = feature_engineering(df)
df.show(5)


In [None]:
def calculate_metrics(df):
    """Aggregate check-ins to one row per business per calendar day.

    Args:
        df: DataFrame with ``business_id``, ``year``, ``month``, ``day``,
            ``hour``, ``is_weekend`` and ``time_period`` columns.

    Returns:
        DataFrame grouped by ``business_id``, ``year``, ``month``, ``day``
        with the total row count, the number of distinct hours, and row
        counts for weekend check-ins and for each time period.
    """

    def count_where(condition, label):
        # Count rows matching `condition` by summing a 1/0 indicator.
        return F.sum(F.when(condition, 1).otherwise(0)).alias(label)

    period_columns = (
        ("Morning", "morning_checkins"),
        ("Afternoon", "afternoon_checkins"),
        ("Evening", "evening_checkins"),
        ("Night", "night_checkins"),
    )
    period_counts = [
        count_where(F.col("time_period") == period, label)
        for period, label in period_columns
    ]

    per_day = df.groupBy("business_id", "year", "month", "day")
    return per_day.agg(
        F.count("*").alias("total_checkins"),
        F.countDistinct("hour").alias("unique_hours"),
        count_where(F.col("is_weekend"), "weekend_checkins"),
        *period_counts,
    )

# Build the per-business daily metrics and preview the first 5 rows.
# NOTE(review): validate_data() is only applied to df in a later cell, so these
# metrics are computed from rows that have not been validated yet.
df_metrics = calculate_metrics(df)
df_metrics.show(5)


In [None]:
def validate_data(df):
    """Keep only rows that pass all basic sanity checks.

    A row is kept when ``business_id`` is non-empty, ``checkin_timestamp`` is
    not null, ``year`` lies between 2004 and the current year (inclusive) and
    ``hour`` lies between 0 and 23 (inclusive).

    Args:
        df: DataFrame with ``business_id``, ``checkin_timestamp``, ``year``
            and ``hour`` columns.

    Returns:
        The filtered DataFrame.
    """
    has_business_id = F.length(F.col("business_id")) > 0
    has_timestamp = F.col("checkin_timestamp").isNotNull()
    not_before_yelp = F.col("year") >= 2004  # Yelp founding year
    not_in_future = F.col("year") <= F.year(F.current_date())
    hour_in_range = F.col("hour").between(0, 23)

    return df.filter(
        has_business_id & has_timestamp & not_before_yelp & not_in_future & hour_in_range
    )

# Drop rows that fail validation and preview the first 5 remaining rows.
df = validate_data(df)
df.show(5)


In [None]:
def quality_checks(df):
    """Print simple data-quality reports and return ``df`` unchanged.

    Three tables are shown: the number of nulls in every column, the row count
    per ``time_period`` and the row count per ``day_of_week`` (each ordered by
    its grouping column).

    Args:
        df: DataFrame with at least ``time_period`` and ``day_of_week`` columns.

    Returns:
        The same DataFrame that was passed in.
    """

    def distribution(column_name):
        # Row count per distinct value of `column_name`, sorted by that value.
        return (
            df.groupBy(column_name)
            .agg(F.count("*").alias("checkin_count"))
            .orderBy(column_name)
        )

    # isNull() cast to int gives 1/0, so the sum is the null count per column.
    null_exprs = [
        F.sum(F.col(name).isNull().cast("int")).alias(name) for name in df.columns
    ]

    # Build every report first, then print them in order.
    reports = [
        ("Null Counts:", df.select(null_exprs)),
        ("\nTime Period Distribution:", distribution("time_period")),
        ("\nWeekday Distribution:", distribution("day_of_week")),
    ]
    for heading, report in reports:
        print(heading)
        report.show()

    return df

# Print the quality reports; quality_checks returns its input, so df is unchanged.
df = quality_checks(df)


In [None]:
# Save detailed checkins (df has already been through validate_data here).
(
    df.write.format("delta")
    .mode("overwrite")
    .partitionBy("year", "month")
    .save("D:/Project/delta_lake/silver/checkins_detailed")
)

# Recompute the daily metrics from the validated rows before saving.
# The earlier df_metrics was aggregated before validate_data() ran, so it could
# contain groups (e.g. null or out-of-range dates) that were removed from the
# detailed table; rebuilding it here keeps the two silver tables consistent.
df_metrics = calculate_metrics(df)

# Save daily metrics
(
    df_metrics.write.format("delta")
    .mode("overwrite")
    .partitionBy("year", "month")
    .save("D:/Project/delta_lake/silver/checkins_metrics")
)
