In [0]:
%run ./00_setup_and_config

In [0]:
from pyspark.sql.functions import col, explode_outer, when, lit, struct, array, zip_with, StringType, array_size, regexp_extract, lower

In [0]:
# Open the bronze Delta table as a streaming source. `spark` and
# `bronze_layer_path` are not defined in this notebook's visible cells;
# presumably they come from the %run setup notebook — verify there.
bronze_reader = spark.readStream.format("delta")
bronze_stream_df = bronze_reader.load(bronze_layer_path)

In [0]:
# Silver enrichment of the bronze stream:
#   1. `domain`               - host part of `page_url`
#   2. `browser_standardized` - coarse browser label derived from `browser`
#   3. reject page views on a "/product" URL that carry no `product_id`

# Lower-cased once so every browser check below is case-insensitive.
browser_lc = lower(col("browser"))

# Rejection rule. Every term is written so it evaluates to TRUE or FALSE and
# never NULL: with plain `==` / `like`, a NULL event_type or page_url makes the
# whole predicate NULL, `~NULL` is still NULL, and filter() would silently drop
# rows that are not product page views at all.
is_product_view_without_id = (
    col("event_type").eqNullSafe("page_view")
    & col("page_url").isNotNull()
    & col("page_url").like("%/product%")
    & col("product_id").isNull()
)

silver_df = (
    bronze_stream_df
    # Capture group 1 = everything between "http(s)://" and the next "/".
    .withColumn("domain", regexp_extract(col("page_url"), "https?://([^/]+)", 1))
    .withColumn(
        "browser_standardized",
        # Order matters: "edge" is tested first because an Edge user-agent
        # string also contains "Chrome" and "Safari", so testing those first
        # would mislabel Edge traffic. (Chrome likewise stays ahead of Safari.)
        # NOTE(review): assumes `browser` may hold a full user-agent string;
        # for plain browser names the order makes no difference — confirm.
        when(browser_lc.contains("edge"), "Edge")
        .when(browser_lc.contains("chrome"), "Chrome")
        .when(browser_lc.contains("firefox"), "Firefox")
        .when(browser_lc.contains("safari"), "Safari")
        # Also reached when `browser` is NULL (no branch matches).
        .otherwise("Other"),
    )
    .filter(~is_product_view_without_id)
)


In [0]:
# Flatten the parallel `product_id` / `product_name` arrays into one output row
# per product, exposed as product_id_flat / product_name_flat /
# product_brand_flat. Events without products still yield exactly one row with
# all three columns NULL.
# NOTE(review): assumes both arrays hold strings — confirm against the bronze
# schema. `product_brand_flat` is always NULL here: brand is never read from the
# input, and any existing `product_brand` column is dropped at the end.

# True only for a non-null, non-empty product_id array.
has_products = col("product_id").isNotNull() & (array_size(col("product_id")) > 0)

# Pair up the i-th id with the i-th name.
product_structs = zip_with(
    col("product_id"),
    col("product_name"),
    lambda pid, pname: struct(
        pid.alias("id"),
        pname.alias("name"),
        # Explicitly string-typed NULL so this branch has the same struct type
        # as the placeholder below instead of relying on implicit coercion of
        # an untyped NULL between the when/otherwise branches.
        lit(None).cast("string").alias("brand"),
    ),
)

# Single all-NULL struct used when the event has no products.
no_product_placeholder = array(
    struct(
        lit(None).cast("string").alias("id"),
        lit(None).cast("string").alias("name"),
        lit(None).cast("string").alias("brand"),
    )
)

silver_df = (
    silver_df
    .withColumn(
        "product_data",
        explode_outer(when(has_products, product_structs).otherwise(no_product_placeholder)),
    )
    .withColumn("product_id_flat", col("product_data.id"))
    .withColumn("product_name_flat", col("product_data.name"))
    .withColumn("product_brand_flat", col("product_data.brand"))
    # The source arrays and the intermediate struct are replaced by the *_flat columns.
    .drop("product_id", "product_name", "product_brand", "product_data")
)

In [0]:

# Columns written to the silver table, in output order.
silver_output_columns = [
    # event identity
    "user_id",
    "session_id",
    "timestamp",
    "event_type",
    "page_url",
    # derived in the enrichment cell
    "domain",
    "browser_standardized",
    # flattened product fields
    "product_id_flat",
    "product_name_flat",
    "product_brand_flat",
    "product_price",
    "category",
    # client / session context
    "browser",
    "os",
    "ip_address",
    "referral_source",
    "device_type",
    "geo_country",
    "geo_city",
    "is_new_user",
    "cart_size",
    # ingestion metadata
    "eventhub_enqueued_time",
    "spark_ingestion_time",
]

final_silver_df = silver_df.select(*silver_output_columns)


In [0]:
# Print the projected silver schema so column names and types can be checked
# before the streaming write in the next cell is started.
final_silver_df.printSchema()

In [0]:
# Configure the append-only Delta sink for the silver layer.
# NOTE(review): the query name is spelled "Tansformation"; it is kept verbatim
# here because monitoring or dashboards may key on the existing name.
silver_writer = (
    final_silver_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", silver_checkpoint_location)
    .option("mergeSchema", "true")
    .queryName("ClickstreamSilverTansformation")
)

# Start the streaming query; the returned handle is the cell's output.
silver_writer.start(silver_layer_path)

In [0]:
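# Debugging aid, intentionally left disabled; uncomment to inspect the
# projected silver DataFrame interactively.
# display(final_silver_df)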