## Silver to Gold: Building BI-Ready Tables

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, IntegerType, DateType, TimestampType, FloatType

In [0]:
# Unity Catalog name that prefixes every table referenced in this notebook.
catalog_name = "ecommerce"

In [0]:
# Load the silver-layer order items table from the configured catalog.
df = spark.table(
    f"{catalog_name}.silver.slv_order_items"
)

# Preview a small sample instead of rendering the whole table.
df.limit(10).display()

dt,order_ts,customer_id,order_id,item_seq,product_id,quantity,unit_price_currency,unit_price,discount_pct,tax_amount,channel,coupon_code,file_name,ingest_timestamp,processed_time
2025-08-02,2025-08-02T01:28:36.000Z,CUST000000198152,645377,2,2000000216447,1,SGD,138.0,19.0,21.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-02.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z
2025-08-03,2025-08-03T12:50:24.000Z,CUST000000260005,645815,2,2000000444215,1,INR,2930.0,4.0,142.0,Mobile,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-03.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z
2025-08-03,2025-08-03T22:44:13.000Z,CUST000000183607,645817,1,2000000108100,1,INR,1767.0,16.0,179.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-03.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z
2025-08-03,2025-08-03T05:26:47.000Z,CUST000000050539,646863,1,2000000471051,1,INR,29146.0,5.0,3340.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-03.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z
2025-08-04,2025-08-04T16:32:02.000Z,CUST000000035207,647254,1,2000000116365,1,INR,1531.0,7.0,71.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-04.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z
2025-08-04,2025-08-04T21:47:56.000Z,CUST000000091982,647695,1,2000000000718,1,INR,3964.0,13.0,413.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-04.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z
2025-08-04,2025-08-04T03:53:17.000Z,CUST000000145355,647774,1,2000000437750,2,INR,111488.0,16.0,33532.0,Mobile,fest20,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-04.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z
2025-08-05,2025-08-05T03:35:06.000Z,CUST000000288236,648478,2,2000000464435,2,AUD,290.0,1.0,70.0,Mobile,new10,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-05.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z
2025-08-05,2025-08-05T20:03:00.000Z,CUST000000186369,649080,1,2000000317557,1,GBP,511.0,8.0,57.0,Mobile,save50,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-05.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z
2025-08-06,2025-08-06T00:11:24.000Z,CUST000000085405,649563,3,2000000306872,1,USD,62.0,15.0,7.0,Mobile,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-06.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z


In [0]:
# Derive the monetary and lookup columns in a single chained pass.
# Each step may reference columns added by the steps above it.
df = (
    df
    # 1) Gross amount: quantity times unit price, before discount and tax.
    .withColumn("gross_amount", F.col("quantity") * F.col("unit_price"))
    # 2) Discount amount: discount_pct is a percentage (19.0 means 19%), so it
    #    is divided by 100; the result is rounded up with ceil.
    .withColumn(
        "discount_amount",
        F.ceil(F.col("gross_amount") * (F.col("discount_pct") / 100.0)),
    )
    # 3) Sale amount: gross minus discount, plus tax.
    .withColumn(
        "sale_amount",
        F.col("gross_amount") - F.col("discount_amount") + F.col("tax_amount"),
    )
    # 4) Integer date key in yyyyMMdd form (e.g. 20250802), derived from dt.
    .withColumn(
        "date_id",
        F.date_format(F.col("dt"), "yyyyMMdd").cast(IntegerType()),
    )
    # 5) Coupon flag: 1 when coupon_code is non-null, otherwise 0.
    .withColumn(
        "coupon_flag",
        F.when(F.col("coupon_code").isNotNull(), F.lit(1)).otherwise(F.lit(0)),
    )
)

df.limit(5).display()

dt,order_ts,customer_id,order_id,item_seq,product_id,quantity,unit_price_currency,unit_price,discount_pct,tax_amount,channel,coupon_code,file_name,ingest_timestamp,processed_time,gross_amount,discount_amount,sale_amount,date_id,coupon_flag
2025-08-02,2025-08-02T01:28:36.000Z,CUST000000198152,645377,2,2000000216447,1,SGD,138.0,19.0,21.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-02.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z,138.0,27,132.0,20250802,0
2025-08-03,2025-08-03T12:50:24.000Z,CUST000000260005,645815,2,2000000444215,1,INR,2930.0,4.0,142.0,Mobile,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-03.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z,2930.0,118,2954.0,20250803,0
2025-08-03,2025-08-03T22:44:13.000Z,CUST000000183607,645817,1,2000000108100,1,INR,1767.0,16.0,179.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-03.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z,1767.0,283,1663.0,20250803,0
2025-08-03,2025-08-03T05:26:47.000Z,CUST000000050539,646863,1,2000000471051,1,INR,29146.0,5.0,3340.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-03.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z,29146.0,1458,31028.0,20250803,0
2025-08-04,2025-08-04T16:32:02.000Z,CUST000000035207,647254,1,2000000116365,1,INR,1531.0,7.0,71.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-04.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z,1531.0,108,1494.0,20250804,0


In [0]:
# Fixed conversion rates into INR, keyed by currency code (as of 2025-10-15).
# INR maps to 1.0 so INR amounts pass through unchanged when multiplied.
fx_rates = {
    "INR": 1.00,
    "AED": 24.18,
    "AUD": 57.55,
    "CAD": 62.93,
    "GBP": 117.98,
    "SGD": 68.18,
    "USD": 88.29,
}

# Turn the mapping into (currency, inr_rate) rows, coercing each rate to float,
# and expose it as a small Spark lookup DataFrame.
rates = [(currency_code, float(rate)) for currency_code, rate in fx_rates.items()]
rates_df = spark.createDataFrame(rates, schema=["currency", "inr_rate"])
rates_df.show()

+--------+--------+
|currency|inr_rate|
+--------+--------+
|     INR|     1.0|
|     AED|   24.18|
|     AUD|   57.55|
|     CAD|   62.93|
|     GBP|  117.98|
|     SGD|   68.18|
|     USD|   88.29|
+--------+--------+



In [0]:
# Convert sale_amount to INR using the fixed rate table.
#
# Helper columns left behind by a previous run of this cell are dropped first,
# so the cell can be re-executed without the join adding a second pair of
# currency/inr_rate columns (which would make "inr_rate" ambiguous).
# drop() ignores names that are not present, so the first run is unaffected.
df = (
    df
    .drop("currency", "inr_rate")
    .join(
        # The rate table is only a handful of rows; hint Spark to broadcast it.
        # The alias lets the join refer to its columns by qualified name.
        F.broadcast(rates_df).alias("fx"),
        # Normalise the order currency (trim + upper-case) before matching.
        F.col("fx.currency") == F.upper(F.trim(F.col("unit_price_currency"))),
        "left",
    )
    # Left join: a currency with no matching rate keeps its row, with a null
    # inr_rate and therefore a null sale_amount_inr.
    # The converted amount is rounded up with ceil.
    .withColumn(
        "sale_amount_inr",
        F.ceil(F.col("sale_amount") * F.col("fx.inr_rate")),
    )
)

In [0]:
# Preview five rows to check the joined rate columns and the INR amount.
df.limit(5).display()

dt,order_ts,customer_id,order_id,item_seq,product_id,quantity,unit_price_currency,unit_price,discount_pct,tax_amount,channel,coupon_code,file_name,ingest_timestamp,processed_time,gross_amount,discount_amount,sale_amount,date_id,coupon_flag,currency,inr_rate,sale_amount_inr
2025-08-02,2025-08-02T01:28:36.000Z,CUST000000198152,645377,2,2000000216447,1,SGD,138.0,19.0,21.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-02.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z,138.0,27,132.0,20250802,0,SGD,68.18,9000
2025-08-03,2025-08-03T12:50:24.000Z,CUST000000260005,645815,2,2000000444215,1,INR,2930.0,4.0,142.0,Mobile,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-03.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z,2930.0,118,2954.0,20250803,0,INR,1.0,2954
2025-08-03,2025-08-03T22:44:13.000Z,CUST000000183607,645817,1,2000000108100,1,INR,1767.0,16.0,179.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-03.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z,1767.0,283,1663.0,20250803,0,INR,1.0,1663
2025-08-03,2025-08-03T05:26:47.000Z,CUST000000050539,646863,1,2000000471051,1,INR,29146.0,5.0,3340.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-03.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z,29146.0,1458,31028.0,20250803,0,INR,1.0,31028
2025-08-04,2025-08-04T16:32:02.000Z,CUST000000035207,647254,1,2000000116365,1,INR,1531.0,7.0,71.0,Website,,dbfs:/Volumes/ecommerce/source_data/raw/order_items/landing/order_items_2025-08-04.csv,2025-12-26T14:02:42.666Z,2025-12-26T14:14:48.306Z,1531.0,108,1494.0,20250804,0,INR,1.0,1494


In [0]:
# Project the enriched frame onto the gold fact-table layout.
# Columns given as plain names pass through unchanged; the aliased ones are
# renamed to the BI-facing names used in the gold table.
orders_gold_df = df.select(
    # Keys and timestamps
    "date_id",
    F.col("dt").alias("transaction_date"),
    F.col("order_ts").alias("transaction_ts"),
    F.col("order_id").alias("transaction_id"),
    "customer_id",
    F.col("item_seq").alias("seq_no"),
    "product_id",
    # Descriptive attributes
    "channel",
    "coupon_code",
    "coupon_flag",
    "unit_price_currency",
    # Measures
    "quantity",
    "unit_price",
    "gross_amount",
    F.col("discount_pct").alias("discount_percent"),
    "discount_amount",
    "tax_amount",
    F.col("sale_amount").alias("net_amount"),
    F.col("sale_amount_inr").alias("net_amount_inr"),
)

In [0]:
# Preview five rows of the gold projection to verify the renamed columns.
orders_gold_df.limit(5).display()

date_id,transaction_date,transaction_ts,transaction_id,customer_id,seq_no,product_id,channel,coupon_code,coupon_flag,unit_price_currency,quantity,unit_price,gross_amount,discount_percent,discount_amount,tax_amount,net_amount,net_amount_inr
20250802,2025-08-02,2025-08-02T01:28:36.000Z,645377,CUST000000198152,2,2000000216447,Website,,0,SGD,1,138.0,138.0,19.0,27,21.0,132.0,9000
20250803,2025-08-03,2025-08-03T12:50:24.000Z,645815,CUST000000260005,2,2000000444215,Mobile,,0,INR,1,2930.0,2930.0,4.0,118,142.0,2954.0,2954
20250803,2025-08-03,2025-08-03T22:44:13.000Z,645817,CUST000000183607,1,2000000108100,Website,,0,INR,1,1767.0,1767.0,16.0,283,179.0,1663.0,1663
20250803,2025-08-03,2025-08-03T05:26:47.000Z,646863,CUST000000050539,1,2000000471051,Website,,0,INR,1,29146.0,29146.0,5.0,1458,3340.0,31028.0,31028
20250804,2025-08-04,2025-08-04T16:32:02.000Z,647254,CUST000000035207,1,2000000116365,Website,,0,INR,1,1531.0,1531.0,7.0,108,71.0,1494.0,1494


In [0]:
# Persist the curated fact table to the gold layer as a Delta table
# (catalog: value of catalog_name, schema: gold, table: gld_fact_order_items).
# mode("overwrite") replaces the existing table contents on every run.
# NOTE(review): mergeSchema presumably only allows new columns to be added; if
# a full refresh should also replace changed column types or names,
# overwriteSchema may be the intended option — confirm.
(
    orders_gold_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.gold.gld_fact_order_items")
)

In [0]:
# Sanity check: count the rows now stored in the gold fact table.
spark.sql(f"SELECT count(*) FROM {catalog_name}.gold.gld_fact_order_items").show()

+--------+
|count(*)|
+--------+
|  183378|
+--------+

