### Common Transformation class

In [0]:
# If utilities.py is a notebook in your workspace, use:
# (%run executes it in this notebook's namespace, which is how `transformations` becomes available below.)
# NOTE(review): later cells use `F` and `Window` before this notebook imports them —
# presumably they are also brought into scope by utilities.py; confirm.
%run "./utilities.py"

# Shared helper instance; later cells call its `dedup` and `create_or_upsert` methods.
transformer = transformations()



### Orders


In [0]:
class OrderSilverTransformations:
    """Silver-layer transformations for the bronze orders DataFrame.

    Every method takes a Spark DataFrame and returns a new DataFrame with
    additional or renamed columns; `transform` chains them in dependency order.

    The methods rely on `F` (pyspark.sql.functions) and `Window`
    (pyspark.sql.window.Window) being available in the notebook namespace.
    """

    # Default order total above which an order is flagged as high value.
    HIGH_VALUE_THRESHOLD = 10000

    def __init__(self, spark):
        """Store the SparkSession. None of the methods below currently read it."""
        self.spark = spark

    # --- Flatten items[] array ---
    def flatten_items_array(self, df):
        """Explode `items` so that each order item becomes its own row.

        Adds `product_id`, `quantity`, `price` (from `item.unit_price`) and
        `item_value_in_order` (from `item.amount`), then drops `items` and the
        temporary `item` column.

        Note: `F.explode` produces no row for a null or empty array, so orders
        without items do not appear in the result.
        """
        return (
            df.withColumn("item", F.explode("items"))
              .withColumn("product_id", F.col("item.product_id"))
              .withColumn("quantity", F.col("item.quantity"))
              .withColumn("price", F.col("item.unit_price"))
              .withColumn("item_value_in_order", F.col("item.amount"))
              .drop("item", "items")
        )

    # --- Compute total order value ---
    def compute_order_value(self, df):
        """Add `order_total_value`: the sum of `item_value_in_order` per `order_id`.

        The total is a window aggregate, so the item-level rows are kept and
        every row of an order carries the same total. It is built from the
        source-provided item amount, not from quantity * price.
        """
        per_order = Window.partitionBy("order_id")
        return df.withColumn(
            "order_total_value",
            F.sum("item_value_in_order").over(per_order),
        )

    # --- Derive is_high_value flag ---
    def derive_is_high_value(self, df, threshold=HIGH_VALUE_THRESHOLD):
        """Add boolean `is_high_value`: True when `order_total_value` > `threshold`.

        The comparison is strict, so an order exactly at the threshold is not
        flagged.
        """
        return df.withColumn("is_high_value", F.col("order_total_value") > threshold)

    # --- Calculate Fraud Score ---
    def calculate_fraud_score(self, df):
        """Add integer `fraud_score` derived from payment method and `is_high_value`.

        Scores: 30 for a high-value COD order, 10 for a high-value CARD order,
        5 for everything else (including high-value orders paid by any other
        method).
        """
        is_high_value = F.col("is_high_value")
        payment_method = F.col("payment_method")
        return df.withColumn(
            "fraud_score",
            F.when((payment_method == "COD") & is_high_value, 30)
             .when((payment_method == "CARD") & is_high_value, 10)
             .otherwise(5),
        )

    # --- Clean and Standardize Columns ---
    def clean_columns(self, df):
        """Rename the incoming `event_date` to `order_date`, then re-create
        `event_date` as the date part of `event_time`."""
        return (
            df.withColumnRenamed("event_date", "order_date")
              .withColumn("event_date", F.to_date("event_time"))
        )

    # --- Orchestrate all transformations ---
    def transform(self, df, threshold=HIGH_VALUE_THRESHOLD):
        """Run the full Silver pipeline on a bronze orders DataFrame.

        Steps, in order: flatten items -> order total -> high-value flag ->
        fraud score -> column clean-up.

        Args:
            df: bronze orders DataFrame containing an `items` array column.
            threshold: order total above which an order is flagged as high
                value; forwarded to `derive_is_high_value`. Defaults to
                `HIGH_VALUE_THRESHOLD`, so existing `transform(df)` callers
                behave as before.

        Returns:
            The item-level DataFrame with all derived columns added.
        """
        df = self.flatten_items_array(df)
        df = self.compute_order_value(df)
        df = self.derive_is_high_value(df, threshold=threshold)
        df = self.calculate_fraud_score(df)
        df = self.clean_columns(df)
        return df




In [0]:
%sql
-- Preview the raw orders stored in the Bronze layer.
SELECT *
FROM `e-commerce-project`.bronze.orders

order_id,customer_id,payment_method,order_status,event_time,shipping_city,shipping_pincode,ip_address,items,event_date
ORD020,C009,UPI,PLACED,2025-10-13T09:00:00Z,Kolkata,700003,203.0.113.19,"List(List(P010, 1, 20000.0, 20000.0))",2025-10-13
ORD019,C008,CARD,PLACED,2025-10-13T08:12:00Z,Mumbai,400002,203.0.113.18,"List(List(P009, 1, 5000.0, 5000.0))",2025-10-13
ORD018,C008,CARD,PLACED,2025-10-13T08:10:00Z,Mumbai,400002,203.0.113.18,"List(List(P009, 2, 5000.0, 10000.0))",2025-10-13
ORD017,C007,CARD,PLACED,2025-10-12T10:05:00Z,Delhi,110001,203.0.113.16,"List(List(P005, 1, 15000.0, 15000.0))",2025-10-12
ORD016,C006,COD,PLACED,2025-10-12T10:00:00Z,Delhi,110001,203.0.113.16,"List(List(P005, 1, 15000.0, 15000.0))",2025-10-12
ORD013,C004,CARD,PLACED,2025-10-21T09:05:00Z,Hyderabad,500002,203.0.113.14,"List(List(P004, 1, 2500.0, 2500.0), List(P012, 1, 2300.0, 2300.0), List(P015, 1, 1450.0, 1450.0))",2025-10-21
ORD006,C005,WALLET,PLACED,2025-10-13T09:15:00Z,Hyderabad,500001,203.0.113.15,"List(List(P006, 1, 5000.0, 5000.0), List(P012, 1, 2300.0, 2300.0))",2025-10-13
ORD011,C010,UPI,PLACED,2025-10-18T11:00:00Z,Bengaluru,560002,203.0.113.20,"List(List(P016, 1, 0.0, 0.0), List(P003, 1, 1500.0, 1500.0))",2025-10-18
ORD001,C001,COD,PLACED,2025-10-08T09:10:00Z,Mumbai,400001,203.0.113.11,"List(List(P001, 1, 25000.0, 25000.0), List(P008, 1, 950.0, 950.0))",2025-10-08
ORD002,C002,CARD,PLACED,2025-10-09T10:20:00Z,Delhi,110001,203.0.113.12,"List(List(P004, 1, 2500.0, 2500.0), List(P011, 1, 1750.0, 1750.0))",2025-10-09


In [0]:
# Load the raw orders from the Bronze Delta table and report how many rows came in.
bronze_orders_df = spark.read.table("`e-commerce-project`.bronze.orders")
print(f"Bronze DataFrame loaded before dedup {bronze_orders_df.count()} records.")

# Deduplicate on (order_id, customer_id) using the shared helper, with event_time as the
# ordering column (presumably the latest event wins; see utilities.py), then report again.
bronze_orders_df_new = transformer.dedup(bronze_orders_df, ["order_id", "customer_id"], "event_time")
print(f"Bronze DataFrame loaded with {bronze_orders_df_new.count()} records.")

# Apply the Silver order transformations and preview the item-level result.
transformer_ORDER = OrderSilverTransformations(spark)
orders_transformed = transformer_ORDER.transform(bronze_orders_df_new)
display(orders_transformed)



Bronze DataFrame loaded before dedup 20 records.
Bronze DataFrame loaded with 20 records.


order_id,customer_id,payment_method,order_status,event_time,shipping_city,shipping_pincode,ip_address,order_date,product_id,quantity,price,item_value_in_order,order_total_value,is_high_value,fraud_score,event_date
ORD001,C001,COD,PLACED,2025-10-08T09:10:00Z,Mumbai,400001,203.0.113.11,2025-10-08,P001,1,25000.0,25000.0,25950.0,True,30,2025-10-08
ORD001,C001,COD,PLACED,2025-10-08T09:10:00Z,Mumbai,400001,203.0.113.11,2025-10-08,P008,1,950.0,950.0,25950.0,True,30,2025-10-08
ORD002,C002,CARD,PLACED,2025-10-09T10:20:00Z,Delhi,110001,203.0.113.12,2025-10-09,P004,1,2500.0,2500.0,4250.0,False,5,2025-10-09
ORD002,C002,CARD,PLACED,2025-10-09T10:20:00Z,Delhi,110001,203.0.113.12,2025-10-09,P011,1,1750.0,1750.0,4250.0,False,5,2025-10-09
ORD003,C003,UPI,PLACED,2025-10-10T11:30:00Z,Bengaluru,560001,203.0.113.13,2025-10-10,P002,1,55000.0,55000.0,55000.0,True,5,2025-10-10
ORD004,C001,CARD,PLACED,2025-10-11T12:00:00Z,Pune,411001,203.0.113.11,2025-10-11,P003,2,1500.0,3000.0,6300.0,False,5,2025-10-11
ORD004,C001,CARD,PLACED,2025-10-11T12:00:00Z,Pune,411001,203.0.113.11,2025-10-11,P009,1,3300.0,3300.0,6300.0,False,5,2025-10-11
ORD005,C004,COD,PLACED,2025-10-12T08:45:00Z,Chennai,600001,203.0.113.14,2025-10-12,P010,1,4000.0,4000.0,5600.0,False,5,2025-10-12
ORD005,C004,COD,PLACED,2025-10-12T08:45:00Z,Chennai,600001,203.0.113.14,2025-10-12,P007,2,800.0,1600.0,5600.0,False,5,2025-10-12
ORD006,C005,WALLET,PLACED,2025-10-13T09:15:00Z,Hyderabad,500001,203.0.113.15,2025-10-13,P006,1,5000.0,5000.0,7300.0,False,5,2025-10-13


## Upserting Orders

In [0]:
# Create or upsert the Silver orders table, keyed at item level
# (order_id, customer_id, product_id) with event_time as the change-ordering column.
transformer.create_or_upsert(
    spark,
    orders_transformed,
    key_cols=["order_id", "customer_id", "product_id"],
    table="silver.orders",
    cdc="event_time",
)

✅ Created new Delta table: `e-commerce-project`.silver.orders


In [0]:
%sql
-- Row count per (order_id, event_time) in Silver orders; each count is the number of item rows for that order.
SELECT order_id, event_time, count(*)
FROM `e-commerce-project`.silver.orders
GROUP BY order_id, event_time

order_id,event_time,count(*)
ORD001,2025-10-08T09:10:00Z,2
ORD002,2025-10-09T10:20:00Z,2
ORD003,2025-10-10T11:30:00Z,1
ORD004,2025-10-11T12:00:00Z,2
ORD005,2025-10-12T08:45:00Z,2
ORD006,2025-10-13T09:15:00Z,2
ORD007,2025-10-14T14:40:00Z,2
ORD008,2025-10-15T16:00:00Z,2
ORD009,2025-10-16T09:30:00Z,2
ORD010,2025-10-17T10:30:00Z,2


# Payments


In [0]:
# Load the raw payments from the Bronze layer.
bronze_payment_df = spark.read.table("`e-commerce-project`.bronze.payments")

# Deduplicate on (order_id, customer_id), with payment_timestamp as the ordering column.
payment_new = transformer.dedup(bronze_payment_df, ["order_id", "customer_id"], "payment_timestamp")

# Flag successful payments: True only when payment_status is exactly "SUCCESS",
# False for every other value (including a null status).
payment_transformed = payment_new.withColumn(
    "is_successful",
    F.when(F.col("payment_status") == "SUCCESS", F.lit(True)).otherwise(F.lit(False)),
)

display(payment_transformed)

payment_id,order_id,customer_id,payment_timestamp,amount,payment_status,method,event_date,is_successful
PAY_ORD001_2,ORD001,C001,2025-10-10T12:30:00Z,25950.0,SUCCESS,COD,2025-10-10,True
PAY_ORD002_1,ORD002,C002,2025-10-09T10:22:00Z,4250.0,SUCCESS,CARD,2025-10-09,True
PAY_ORD003_1,ORD003,C003,2025-10-10T11:32:00Z,55000.0,SUCCESS,UPI,2025-10-10,True
PAY_ORD004_1,ORD004,C001,2025-10-11T12:03:00Z,6300.0,SUCCESS,CARD,2025-10-11,True
PAY_ORD005_2,ORD005,C004,2025-10-14T16:10:00Z,5600.0,SUCCESS,COD,2025-10-14,True
PAY_ORD006_1,ORD006,C005,2025-10-13T09:16:00Z,7300.0,SUCCESS,WALLET,2025-10-13,True
PAY_ORD007_1,ORD007,C006,2025-10-14T14:42:00Z,4200.0,SUCCESS,CARD,2025-10-14,True
PAY_ORD008_2,ORD008,C007,2025-10-17T10:00:00Z,4150.0,SUCCESS,COD,2025-10-17,True
PAY_ORD009_1,ORD009,C008,2025-10-16T09:35:00Z,29000.0,SUCCESS,UPI,2025-10-16,True
PAY_ORD010_1,ORD010,C009,2025-10-17T10:35:00Z,57400.0,SUCCESS,CARD,2025-10-17,True


### Upserting Payments

In [0]:

# Create or upsert the Silver payment table: one row per (order_id, customer_id),
# with payment_timestamp as the change-ordering column.
transformer.create_or_upsert(
    spark,
    payment_transformed,
    key_cols=["order_id", "customer_id"],
    table="silver.payment",
    cdc="payment_timestamp",
)

✅ Created new Delta table: `e-commerce-project`.silver.payment


# Shipments


In [0]:
# Databricks notebook: silver_shipments
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable

# ----------------------------------------
# 1️⃣ Read Bronze Shipments
# ----------------------------------------
bronze_shipments = spark.read.table("`e-commerce-project`.bronze.shipments")

# ----------------------------------------
# 2️⃣ Assign priority based on status
# ----------------------------------------
# Terminal states (DELIVERED / CANCELED) rank 3, DISPATCHED ranks 2, anything else
# (including a null status) ranks 1. The expression trims and upper-cases `status`
# itself, so it gives the same rank whether or not the column was normalized first.
priority_expr = (
    F.when(F.upper(F.trim(F.col("status"))).isin("DELIVERED", "CANCELED"), 3)
     .when(F.upper(F.trim(F.col("status"))) == "DISPATCHED", 2)
     .otherwise(1)
)

shipments_with_priority = (
    bronze_shipments
    .withColumn("status", F.upper(F.trim(F.col("status"))))
    .withColumn("status_rank", priority_expr)
)

# ----------------------------------------
# 3️⃣ Deduplicate: Keep the highest-priority row per order_id
# ----------------------------------------
# status_rank alone can tie (DELIVERED and CANCELED share rank 3, and the same event
# may be ingested twice), which would make row_number() pick an arbitrary row.
# event_date and shipment_id are added as tie-breakers so the kept row is stable
# across runs.
# NOTE(review): "latest event_date, then highest shipment_id" is assumed to be the
# desired winner among equal ranks — confirm with the data owner.
window_spec = Window.partitionBy("order_id").orderBy(
    F.col("status_rank").desc(),
    F.col("event_date").desc_nulls_last(),
    F.col("shipment_id").desc_nulls_last(),
)
shipments_dedup = (
    shipments_with_priority
    .withColumn("rownum", F.row_number().over(window_spec))
    .filter(F.col("rownum") == 1)
    .drop("rownum")
)

# ----------------------------------------
# 4️⃣ Compute delivery delay (if delivered)
# ----------------------------------------
# delivery_delay_hours is the elapsed time from dispatched_at to delivered_at in hours,
# rounded to 2 decimals. It is null when delivered_at is null (no otherwise() branch)
# and also when dispatched_at is null (null arithmetic).
shipments_transformed = (
    shipments_dedup
    .withColumn("dispatched_at", F.to_timestamp("dispatched_at"))
    .withColumn("delivered_at", F.to_timestamp("delivered_at"))
    .withColumn(
        "delivery_delay_hours",
        F.when(
            F.col("delivered_at").isNotNull(),
            F.round((F.col("delivered_at").cast("long") - F.col("dispatched_at").cast("long")) / 3600, 2)
        ))
    .withColumn("is_delivered", F.when(F.col("status") == "DELIVERED", True).otherwise(False))
    .withColumn("is_canceled", F.when(F.col("status") == "CANCELED", True).otherwise(False))
)

display(shipments_transformed)



shipment_id,order_id,courier,dispatched_at,delivered_at,status,tracking_url,event_date,status_rank,delivery_delay_hours,is_delivered,is_canceled
SHIP_ORD001_3,ORD001,BlueDart,2025-10-09T08:00:00.000Z,2025-10-10T12:30:00.000Z,DELIVERED,https://track.bluedart/ORD001,2025-10-09,3,28.5,True,False
SHIP_ORD002_3,ORD002,Delhivery,2025-10-10T09:00:00.000Z,2025-10-11T15:20:00.000Z,DELIVERED,https://track.delhivery/ORD002,2025-10-10,3,30.33,True,False
SHIP_ORD003_3,ORD003,DTDC,2025-10-11T07:30:00.000Z,2025-10-12T18:00:00.000Z,DELIVERED,https://track.dtdc/ORD003,2025-10-11,3,34.5,True,False
SHIP_ORD004_3,ORD004,EcomExpress,2025-10-13T10:15:00.000Z,2025-10-14T13:20:00.000Z,DELIVERED,https://track.ecom/ORD004,2025-10-13,3,27.08,True,False
SHIP_ORD005_3,ORD005,BlueDart,2025-10-13T09:00:00.000Z,2025-10-14T17:45:00.000Z,DELIVERED,https://track.bluedart/ORD005,2025-10-13,3,32.75,True,False
SHIP_ORD006_3,ORD006,Delhivery,2025-10-16T18:00:00.000Z,2025-10-18T11:30:00.000Z,DELIVERED,https://track.delhivery/ORD006,2025-10-16,3,41.5,True,False
SHIP_ORD007_3,ORD007,DTDC,2025-10-17T16:00:00.000Z,2025-10-18T09:00:00.000Z,DELIVERED,https://track.dtdc/ORD007,2025-10-17,3,17.0,True,False
SHIP_ORD008_3,ORD008,EcomExpress,2025-10-16T07:30:00.000Z,2025-10-20T10:20:00.000Z,DELIVERED,https://track.ecom/ORD008,2025-10-16,3,98.83,True,False
SHIP_ORD009_3,ORD009,BlueDart,2025-10-17T09:30:00.000Z,2025-10-19T12:00:00.000Z,DELIVERED,https://track.bluedart/ORD009,2025-10-17,3,50.5,True,False
SHIP_ORD010_3,ORD010,Delhivery,2025-10-18T08:45:00.000Z,2025-10-20T18:10:00.000Z,DELIVERED,https://track.delhivery/ORD010,2025-10-18,3,57.42,True,False


## Upserting Shipments
 

In [0]:
# Create or upsert the Silver shipments table: one row per order_id, with status_rank
# passed as the change-ordering column (presumably so a higher-ranked status replaces a
# lower one — confirm in utilities.py).
transformer.create_or_upsert(
    spark, shipments_transformed,
    key_cols=["order_id"], table="silver.shipments", cdc="status_rank",
)

✅ Created new Delta table: `e-commerce-project`.silver.shipments


In [0]:
%sql
-- Inspect the Silver shipments table after the upsert.
SELECT *
FROM `e-commerce-project`.silver.shipments
-- To rebuild the table from scratch, run manually: drop table `e-commerce-project`.silver.shipments

shipment_id,order_id,courier,dispatched_at,delivered_at,status,tracking_url,event_date,status_rank,delivery_delay_hours,is_delivered,is_canceled
SHIP_ORD001_3,ORD001,BlueDart,2025-10-09T08:00:00.000Z,2025-10-10T12:30:00.000Z,DELIVERED,https://track.bluedart/ORD001,2025-10-09,3,28.5,True,False
SHIP_ORD002_3,ORD002,Delhivery,2025-10-10T09:00:00.000Z,2025-10-11T15:20:00.000Z,DELIVERED,https://track.delhivery/ORD002,2025-10-10,3,30.33,True,False
SHIP_ORD003_3,ORD003,DTDC,2025-10-11T07:30:00.000Z,2025-10-12T18:00:00.000Z,DELIVERED,https://track.dtdc/ORD003,2025-10-11,3,34.5,True,False
SHIP_ORD004_3,ORD004,EcomExpress,2025-10-13T10:15:00.000Z,2025-10-14T13:20:00.000Z,DELIVERED,https://track.ecom/ORD004,2025-10-13,3,27.08,True,False
SHIP_ORD005_3,ORD005,BlueDart,2025-10-13T09:00:00.000Z,2025-10-14T17:45:00.000Z,DELIVERED,https://track.bluedart/ORD005,2025-10-13,3,32.75,True,False
SHIP_ORD006_3,ORD006,Delhivery,2025-10-16T18:00:00.000Z,2025-10-18T11:30:00.000Z,DELIVERED,https://track.delhivery/ORD006,2025-10-16,3,41.5,True,False
SHIP_ORD007_3,ORD007,DTDC,2025-10-17T16:00:00.000Z,2025-10-18T09:00:00.000Z,DELIVERED,https://track.dtdc/ORD007,2025-10-17,3,17.0,True,False
SHIP_ORD008_3,ORD008,EcomExpress,2025-10-16T07:30:00.000Z,2025-10-20T10:20:00.000Z,DELIVERED,https://track.ecom/ORD008,2025-10-16,3,98.83,True,False
SHIP_ORD009_3,ORD009,BlueDart,2025-10-17T09:30:00.000Z,2025-10-19T12:00:00.000Z,DELIVERED,https://track.bluedart/ORD009,2025-10-17,3,50.5,True,False
SHIP_ORD010_3,ORD010,Delhivery,2025-10-18T08:45:00.000Z,2025-10-20T18:10:00.000Z,DELIVERED,https://track.delhivery/ORD010,2025-10-18,3,57.42,True,False
