In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Bronze-layer source, given as the table identifier that is handed to
# readStream.table() below (a table name, even though the variable says "path").
bronze_table_path = "amazon.streamdb.streaming_raw"

# Open the bronze table as a streaming DataFrame.
streaming_bronze_df = spark.readStream.table(bronze_table_path)

In [0]:
from pyspark.sql.functions import from_json, col, get_json_object

# Pull the "cart_contents" member out of the JSON held in _rescued_data.
# get_json_object yields it as a string, so it is stored here as raw JSON text.
cart_contents_json = get_json_object(col("_rescued_data"), "$.cart_contents")
parsed_df = streaming_bronze_df.withColumn("cart_contents_str", cart_contents_json)

In [0]:
# DDL-style schema for cart_contents: an array of items, each with a
# product_id, a quantity and a price.
cart_contents_schema = "array<struct<product_id: string, quantity: int, price: double>>"

# Turn the raw JSON text into a typed array-of-structs column.
parsed_df = parsed_df.withColumn(
    "cart_contents",
    from_json(col("cart_contents_str"), cart_contents_schema),
)

In [0]:
# One output row per cart item. explode_outer (rather than explode) still emits a
# row, with a null item, for events whose cart_contents is null or empty.
# The item's fields are then flattened into their own *_cart_contents columns.
exploded_df = (
    parsed_df
    .withColumn("exploded_cart_contents", explode_outer(col("cart_contents")))
    .withColumn("product_id_cart_contents", col("exploded_cart_contents.product_id"))
    .withColumn("price_cart_contents", col("exploded_cart_contents.price"))
    .withColumn("quantity_cart_contents", col("exploded_cart_contents.quantity"))
)


In [0]:
# Back-fill the top-level product_id / price from the exploded cart item when the
# event itself did not carry them, and back-fill category from filter_details.
# coalesce() returns its first non-null argument, which is exactly the
# "keep the original value unless it is null" rule applied to all three columns.
cleaned_df = (
    exploded_df
    .withColumn("product_id", coalesce(col("product_id"), col("product_id_cart_contents")))
    .withColumn("price", coalesce(col("price"), col("price_cart_contents")))
    .withColumn(
        "category",
        # Fallback is the second element of filter_details split on ": ".
        # It is null when filter_details is null, so category then stays null.
        coalesce(col("category"), split(col("filter_details"), ": ").getItem(1)),
    )
    # The exploded struct has already been flattened into the *_cart_contents
    # columns, so the struct column itself is no longer needed.
    .drop("exploded_cart_contents")
)


In [0]:
# Columns carried into the silver table, in the order they are projected.
silver_columns = [
    "timestamp",
    "user_id",
    "event_type",
    "device_type",
    "session_id",
    "EventProcessedUtcTime",
    "PartitionId",
    "EventEnqueuedUtcTime",
    "product_id",                # combined with product_id_cart_contents
    "category",                  # combined with the value parsed from filter_details
    "sub_category",
    "price",                     # combined with price_cart_contents
    "search_term",
    "product_id_cart_contents",
    "quantity_cart_contents",
    "price_cart_contents",
]

# Project only those columns; helper columns added earlier (for example
# cart_contents_str and cart_contents) are left behind. Null values pass through as-is.
final_df = cleaned_df.select(*silver_columns)

# Display the cleaned DataFrame
final_df.display()


timestamp,user_id,event_type,device_type,session_id,EventProcessedUtcTime,PartitionId,EventEnqueuedUtcTime,product_id,category,sub_category,price,search_term,product_id_cart_contents,quantity_cart_contents,price_cart_contents
1723429520.1133268,CUST-14631,Add to Cart,Mobile,S959-4745249960,2024-08-12T02:25:20.763695Z,0,2024-08-12T02:25:20.639Z,WW-561417,Women'S Clothing,western wear,,,,,
1723429520.9070451,CUST-17226,Browse Product,Desktop,S647-5737266877,2024-08-12T02:25:21.561912Z,0,2024-08-12T02:25:21.514Z,F-221971,home & kitchen,Furniture,,,,,
1723429520.9729657,CUST-13727,Search,Desktop,S806-7767844990,2024-08-12T02:25:21.562041Z,0,2024-08-12T02:25:21.529Z,,Accessories,,,handbags & clutches,,,
1723429522.1478453,CUST-11875,Apply Filter,Mobile,S128-5985264117,2024-08-12T02:25:22.873224Z,0,2024-08-12T02:25:22.67Z,,accessories,,,,,,
1723429524.0960815,CUST-08853,Browse Product,Desktop,S594-7571885355,2024-08-12T02:25:25.07663Z,0,2024-08-12T02:25:24.904Z,CS-117376,men's shoes,Casual Shoes,,,,,
1723429524.5287745,CUST-17226,Add to Cart,Desktop,S647-5737266877,2024-08-12T02:25:25.5142Z,0,2024-08-12T02:25:25.31Z,F-221971,Home & Kitchen,furniture,,,,,
1723429524.8170297,CUST-16063,AppLogin,Mobile,S485-7090286144,2024-08-12T02:25:25.51439Z,0,2024-08-12T02:25:25.389Z,,,,,,,,
1723429524.5831337,CUST-13727,Browse Product,Desktop,S806-7767844990,2024-08-12T02:25:25.514655Z,0,2024-08-12T02:25:25.404Z,PCA-426149,beauty & health,Personal Care Appliances,,,,,
1723429525.687539,CUST-11875,Apply Filter,Mobile,S128-5985264117,2024-08-12T02:25:26.404585Z,0,2024-08-12T02:25:26.232Z,,accessories,,,,,,
1723429529.3363063,CUST-17226,Browse Product,Desktop,S647-5737266877,2024-08-12T02:25:30.014005Z,0,2024-08-12T02:25:29.842Z,J-335697,men's clothing,Jeans,,,,,


In [0]:
%sql
-- Remove any previous copy of the silver table so the CREATE TABLE that follows starts clean.
-- IF EXISTS keeps this cell from failing on a first run, when the table is not there yet.
DROP TABLE IF EXISTS amazon.streamdb.streaming_cleaned;

In [0]:
%sql
-- Silver-layer target table for the cleaned stream. The columns mirror, in order,
-- the projection selected into final_df.
-- NOTE(review): the primary key is EventProcessedUtcTime alone, but the upstream
-- explode_outer step can emit several rows for one source event (one per cart item),
-- so that value may repeat — confirm whether this key is meant to be unique/enforced.
CREATE TABLE amazon.streamdb.streaming_cleaned (
    timestamp DOUBLE,                 -- numeric event time (sample output shows values like 1723429520.11)
    user_id STRING NOT NULL,
    event_type STRING,
    device_type STRING,
    session_id STRING NOT NULL,
    EventProcessedUtcTime TIMESTAMP,
    PartitionId LONG,
    EventEnqueuedUtcTime TIMESTAMP,
    product_id STRING,                -- event value, else the cart item's product_id
    category STRING,                  -- event value, else parsed from filter_details
    sub_category STRING,
    price DOUBLE,                     -- event value, else the cart item's price
    search_term STRING,
    -- Per-item fields flattened from the exploded cart_contents array.
    product_id_cart_contents STRING,
    quantity_cart_contents INT,
    price_cart_contents DOUBLE,
    PRIMARY KEY (EventProcessedUtcTime)
)
USING DELTA;


In [0]:
# Disabled assignment for a separate "window" checkpoint; nothing in the visible cells uses it.
# checkpoint_path_window = "/mnt/amazonopcheckpoint/chekpointlocwindow"
# Checkpoint directory passed to the silver writeStream via the "checkpointLocation" option.
# The "chekpoint" spelling is part of the existing path value and is kept as-is.
checkpoint_path_silver = "/mnt/amazonopcheckpoint/chekpointlocsilver"

In [0]:
# Write the entire valid streaming data to the silver layer without specific columns
final_df.writeStream \
    .outputMode("append")\
    .option("checkpointLocation", checkpoint_path_silver ) \
    .toTable("amazon.streamdb.streaming_cleaned")

<pyspark.sql.streaming.query.StreamingQuery at 0x7fd1d4b10280>

In [0]:
%sql
-- Inspect the rows that have been written to the silver table so far.
SELECT * FROM amazon.streamdb.streaming_cleaned;

timestamp,user_id,event_type,device_type,session_id,EventProcessedUtcTime,PartitionId,EventEnqueuedUtcTime,product_id,category,sub_category,price,search_term,product_id_cart_contents,quantity_cart_contents,price_cart_contents
1723429520.1133268,CUST-14631,Add to Cart,Mobile,S959-4745249960,2024-08-12T02:25:20.763695Z,0,2024-08-12T02:25:20.639Z,WW-561417,Women'S Clothing,western wear,,,,,
1723429520.9070451,CUST-17226,Browse Product,Desktop,S647-5737266877,2024-08-12T02:25:21.561912Z,0,2024-08-12T02:25:21.514Z,F-221971,home & kitchen,Furniture,,,,,
1723429520.9729657,CUST-13727,Search,Desktop,S806-7767844990,2024-08-12T02:25:21.562041Z,0,2024-08-12T02:25:21.529Z,,Accessories,,,handbags & clutches,,,
1723429522.1478453,CUST-11875,Apply Filter,Mobile,S128-5985264117,2024-08-12T02:25:22.873224Z,0,2024-08-12T02:25:22.67Z,,accessories,,,,,,
1723429524.0960815,CUST-08853,Browse Product,Desktop,S594-7571885355,2024-08-12T02:25:25.07663Z,0,2024-08-12T02:25:24.904Z,CS-117376,men's shoes,Casual Shoes,,,,,
1723429524.5287745,CUST-17226,Add to Cart,Desktop,S647-5737266877,2024-08-12T02:25:25.5142Z,0,2024-08-12T02:25:25.31Z,F-221971,Home & Kitchen,furniture,,,,,
1723429524.8170297,CUST-16063,AppLogin,Mobile,S485-7090286144,2024-08-12T02:25:25.51439Z,0,2024-08-12T02:25:25.389Z,,,,,,,,
1723429524.5831337,CUST-13727,Browse Product,Desktop,S806-7767844990,2024-08-12T02:25:25.514655Z,0,2024-08-12T02:25:25.404Z,PCA-426149,beauty & health,Personal Care Appliances,,,,,
1723429525.687539,CUST-11875,Apply Filter,Mobile,S128-5985264117,2024-08-12T02:25:26.404585Z,0,2024-08-12T02:25:26.232Z,,accessories,,,,,,
1723429529.3363063,CUST-17226,Browse Product,Desktop,S647-5737266877,2024-08-12T02:25:30.014005Z,0,2024-08-12T02:25:29.842Z,J-335697,men's clothing,Jeans,,,,,
