In [0]:
### Build Bronze : Raw Ingestion 

In [0]:
from pyspark.sql import functions as F 
# Bronze layer: land the raw CSV unchanged, adding only a load timestamp.
# The first CSV line supplies the column names and Spark infers the column types.
raw_data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

# Stamp every row with the time of this load so separate ingestions can be told apart.
bronze_with_audit = raw_data.withColumn("ingestion_timestamp", F.current_timestamp())

# NOTE: "append" adds these rows to whatever already exists at the target path,
# so running this cell again ingests the same file a second time.
(
    bronze_with_audit.write
    .format("delta")
    .mode("append")
    .save("/Volumes/workspace/ecommerce/ecommerce_data/bronze/events")
)

In [0]:
# Sanity check: read the bronze Delta table back and print its first five rows.
bronze_preview = spark.read.format("delta").load("/Volumes/workspace/ecommerce/ecommerce_data/bronze/events")
bronze_preview.show(5)

+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session| ingestion_timestamp|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|2026-01-14 16:58:...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|2026-01-14 16:58:...|
|2019-11-01 00:00:01|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|2026-01-14 16:58:...|
|2019-11-01 00:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712

In [0]:
### Build Silver : Cleaning & Validation

In [0]:
from pyspark.sql import functions as F 
# Silver layer: validate, de-duplicate and enrich the bronze events.
bronze = spark.read.format("delta").load("/Volumes/workspace/ecommerce/ecommerce_data/bronze/events")

# Columns that together identify one event. event_time is recorded to the second,
# so a single session can legitimately produce several events with the same
# timestamp (different products or event types). De-duplicating on
# (user_session, event_time) alone would drop those events and keep an arbitrary
# survivor. ingestion_timestamp is deliberately excluded so that the same event
# ingested by more than one bronze load still collapses to a single row.
EVENT_KEY_COLUMNS = ["user_id", "user_session", "event_time", "event_type", "product_id"]

silver = (
    bronze
    # Keep prices strictly between 0 and 10000; rows with a NULL price fail the
    # comparison and are removed as well.
    .filter((F.col("price") > 0) & (F.col("price") < 10000))
    .dropDuplicates(EVENT_KEY_COLUMNS)
    .withColumn("event_date", F.to_date("event_time"))
    # Price tiers: below 100 -> budget, below 500 -> affordable, everything else -> premium.
    .withColumn(
        "price_category",
        F.when(F.col("price") < 100, "budget")
        .when(F.col("price") < 500, "affordable")
        .otherwise("premium"),
    )
)

# Overwrite so that re-running the cell rebuilds silver from the current bronze contents.
silver.write.format("delta").mode("overwrite").save("/Volumes/workspace/ecommerce/ecommerce_data/silver/events")
    

In [0]:
### Build Gold : Business Aggregates

In [0]:
#product performance 
from pyspark.sql import functions as F
# Gold layer: per-product performance metrics computed from the silver events.
silver = spark.read.format("delta").load("/Volumes/workspace/ecommerce/ecommerce_data/silver/events")

is_view = F.col("event_type") == "view"
is_purchase = F.col("event_type") == "purchase"

product_performance = (
    silver
    .groupBy("product_id", "brand", "category_code")
    .agg(
        # when() without otherwise() yields NULL for non-matching rows and count()
        # skips NULLs, so each count covers only the rows of that event type.
        F.count(F.when(is_view, True)).alias("total_views"),
        F.count(F.when(is_purchase, True)).alias("total_purchases"),
        # sum() over a group with no purchase rows is NULL; report 0.0 instead so
        # the column agrees with total_purchases = 0 and stays usable in arithmetic.
        F.coalesce(
            F.sum(F.when(is_purchase, F.col("price"))),
            F.lit(0.0),
        ).alias("total_revenue"),
    )
    .withColumn(
        "conversion_rate",
        # Purchases per 100 views; the guard avoids dividing by zero for products
        # that were never viewed, which get a rate of 0.
        F.when(
            F.col("total_views") > 0,
            F.col("total_purchases") / F.col("total_views") * 100,
        ).otherwise(0),
    )
)

# Overwrite so that each run replaces the previous aggregate snapshot.
product_performance.write.format("delta").mode("overwrite").save("/Volumes/workspace/ecommerce/ecommerce_data/gold/product_performance")