In [0]:
%run ../config/pipeline_config

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Step 1: get (or reuse) the Spark session and load the bronze Delta table.
# BRONZE_TABLE_FQN is not defined in this notebook; presumably it comes from
# the pipeline config notebook pulled in by %run above.
spark = SparkSession.builder.getOrCreate()

print("\nStep 1 : Reading Bronze Delta Table")
df_bronze = spark.table(BRONZE_TABLE_FQN)

# Keep the bronze row count: the final validation cell subtracts the silver
# row count from it to report how many records were dropped.
bronze_count = df_bronze.count()
print(" Bronze table has: {:,}".format(bronze_count))

In [0]:
# Step 2: cast the bronze columns to their proper data types.
print("casting columns data in a proper data type...")

# Target Spark SQL type for every column that is cast in place.
_silver_cast_types = {
    "pickup_lat": "double",
    "pickup_lng": "double",
    "session_duration_sec": "int",
    "is_repeat_search": "boolean",
}

# Parse the raw `timestamp` column into a new `event_timestamp` column.
# try_to_timestamp leaves NULL for values it cannot parse; a later cell
# filters those rows out.
df_silver = df_bronze.withColumn(
    "event_timestamp", expr("try_to_timestamp(timestamp)")
)

# Apply the casts one column at a time, overwriting each column with its
# typed version (same column names, same positions).
for _column_name, _target_type in _silver_cast_types.items():
    df_silver = df_silver.withColumn(
        _column_name, col(_column_name).cast(_target_type)
    )



In [0]:
# Step 3: data quality filters.
#
# A row survives only when:
#   * session_duration_sec lies within
#     [MIN_SESSION_DURATION, MAX_SESSION_DURATION] (inclusive on both ends), and
#   * event_timestamp was parsed successfully (is not NULL).
#
# The timestamp rule is applied here so that `silver_count` below is the
# number of rows that will actually be written. Previously the count was
# taken before NULL timestamps were removed, so the "Silver table has" figure
# could be higher than the real table size. The later standalone
# NULL-timestamp filter stays valid; it simply has nothing left to remove.
print("\nStep 3: Applying validation rules .... ")

df_silver = df_silver.filter(
    col("session_duration_sec").between(MIN_SESSION_DURATION, MAX_SESSION_DURATION)
    & col("event_timestamp").isNotNull()
)

# count() triggers a Spark job; the value is only used for the log line below.
silver_count = df_silver.count()
print(f"Silver table has : {silver_count:,}")

In [0]:
# display(df_silver.limit(10))

In [0]:
# Keep only rows whose timestamp parsed successfully (non-NULL
# `event_timestamp`), then drop the original raw `timestamp` column, which
# `event_timestamp` replaces.
df_silver = (
    df_silver
    .where(col("event_timestamp").isNotNull())
    .drop("timestamp")
)

In [0]:
# Step 4: persist the cleaned DataFrame as the silver Delta table.
print("\nStep 4: writing to silver table...")

# mode("overwrite") replaces the existing table contents on every run;
# overwriteSchema lets the table schema be replaced together with the data.
(
    df_silver.write
    .mode("overwrite")
    .format("delta")
    .option("overwriteSchema", "true")
    .saveAsTable(SILVER_TABLE_FQN)
)
print("successfully created  silver layer table")

In [0]:
# Step 6: read the silver table back and compare its size with bronze.
print("\nStep 6: validating records count in silver table...")

# Count what was actually persisted, not the in-memory DataFrame.
df_verify = spark.table(SILVER_TABLE_FQN)
verify_count = df_verify.count()

# bronze_count was captured when the bronze table was first read (Step 1).
print("Records in Silver table : {:,}".format(verify_count))
print("Dropped records          : {:,}".format(bronze_count - verify_count))
print("Data quality applied     : YES")

In [0]:
# Validate that no invalid (NULL) timestamps remain in the persisted silver table.
null_timestamp_count = (
    spark.table(SILVER_TABLE_FQN)
    .filter("event_timestamp IS NULL")
    .count()
)

# Enforce the check instead of only displaying a number: an unattended
# "Run All" / job run must stop here if bad rows reached the silver table.
assert null_timestamp_count == 0, (
    f"{null_timestamp_count:,} silver rows have a NULL event_timestamp"
)

# Last expression of the cell: still shows the count (0) as the cell output.
null_timestamp_count

In [0]:
# Validate that every session duration in the persisted silver table lies
# within [MIN_SESSION_DURATION, MAX_SESSION_DURATION].
#
# NULL durations are counted as violations explicitly: a comparison with NULL
# evaluates to NULL (not true), so the two range predicates alone would let
# such rows pass the check unnoticed.
invalid_duration_count = (
    spark.table(SILVER_TABLE_FQN)
    .filter(
        col("session_duration_sec").isNull()
        | (col("session_duration_sec") < MIN_SESSION_DURATION)
        | (col("session_duration_sec") > MAX_SESSION_DURATION)
    )
    .count()
)

# Enforce the check so an unattended run fails instead of silently reporting
# a non-zero number.
assert invalid_duration_count == 0, (
    f"{invalid_duration_count:,} silver rows have a missing or out-of-range "
    "session_duration_sec"
)

# Last expression of the cell: still shows the count (0) as the cell output.
invalid_duration_count

In [0]:
# Print the column names and data types of the persisted silver table so the
# result of the casts and the dropped `timestamp` column can be inspected.
spark.table(SILVER_TABLE_FQN).printSchema()