In [0]:
%run ./00_setup_and_config

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import from_json, col, current_timestamp, to_timestamp

In [0]:
# Expected layout of one clickstream event as carried in the JSON message body.
# Each entry is (field name, Spark type); every field is declared nullable.
# NOTE(review): product_id / product_name / product_brand are arrays of strings
# while product_price is a single double — confirm this matches the producer payload.
_clickstream_fields = [
    ("user_id", StringType()),
    ("session_id", StringType()),
    ("timestamp", StringType()),  # kept as text here; converted to a timestamp downstream
    ("event_type", StringType()),
    ("page_url", StringType()),
    ("product_id", ArrayType(StringType())),
    ("product_name", ArrayType(StringType())),
    ("product_brand", ArrayType(StringType())),
    ("product_price", DoubleType()),
    ("category", StringType()),
    ("browser", StringType()),
    ("os", StringType()),
    ("ip_address", StringType()),
    ("referral_source", StringType()),
    ("device_type", StringType()),
    ("geo_country", StringType()),
    ("geo_city", StringType()),
    ("is_new_user", BooleanType()),
    ("cart_size", IntegerType()),
]

clickstreamSchema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _clickstream_fields]
)

In [0]:
# Open a streaming source using the "eventhubs" format. `spark` and `ehConf` are
# not defined in this notebook's cells — presumably provided by the setup notebook.
df_raw_stream = (
    spark.readStream
    .format("eventhubs")
    .options(**ehConf)
    .load()
)

# Keep the message body (cast to a string) and the enqueued time only.
event_df = df_raw_stream.select(
    col("body").cast("string").alias("json_body"),
    col("enqueuedTime"),
)

# Parse the JSON body against clickstreamSchema, flatten the parsed struct into
# top-level columns, and stamp each row with the time Spark processed it.
# NOTE(review): json_body is not carried forward, so a body that fails to parse
# cannot be recovered from this DataFrame — confirm that is acceptable.
parsed_df = (
    event_df
    .withColumn("data", from_json(col("json_body"), clickstreamSchema))
    .select("data.*", col("enqueuedTime").alias("eventhub_enqueued_time"))
    .withColumn("spark_ingestion_time", current_timestamp())
)

In [0]:
# Bronze projection: convert the event's textual "timestamp" into a Spark
# timestamp and fix the column order written to the bronze table.
#
# "product_brand" is declared in clickstreamSchema and parsed upstream, so it is
# included here; leaving it out of this explicit list would silently discard it
# before the write. The bronze writer sets mergeSchema=true, so an existing
# table can accept the additional column.
final_bronze_df = (
    parsed_df
    .withColumn("timestamp", to_timestamp(col("timestamp")))
    .select(
        # event identity
        "user_id", "session_id", "timestamp", "event_type", "page_url",
        # product details
        "product_id", "product_name", "product_brand", "product_price", "category",
        # client / origin
        "browser", "os", "ip_address", "referral_source", "device_type",
        "geo_country", "geo_city",
        # session state
        "is_new_user", "cart_size",
        # ingestion metadata
        "eventhub_enqueued_time", "spark_ingestion_time",
    )
)


In [0]:
# Start the streaming write of the bronze projection as a Delta sink in append
# mode. The target path and checkpoint location are not defined in this
# notebook's cells — presumably provided by the setup notebook.
query = (
    final_bronze_df.writeStream
    .queryName("ClickstreamBronzeIngestion")
    .format("delta")
    .outputMode("append")
    .option("mergeSchema", "true")
    .option("checkpointLocation", bronze_checkpoint_location)
    .start(bronze_layer_path)
)

print(f"Streaming query started to: {bronze_layer_path}")

In [0]:
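# Debug aid, intentionally left disabled: uncomment to inspect the bronze projection interactively.
# display(final_bronze_df)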