In [0]:
# No package installation is needed in this notebook.
# The `dlt` module imported below is the Delta Live Tables API supplied by the
# Databricks pipeline runtime. The PyPI distribution that is also called `dlt`
# is an unrelated library (dltHub's "data load tool"); installing it can shadow
# the runtime module, in which case decorators such as `@dlt.table` are not found.

In [0]:
import dlt
from pyspark.sql.functions import col, sum as pysum, desc

# --- Source tables ---
# Fully qualified names of the upstream demo tables read by the bronze layer.
# They are hard-coded here. To make the catalog/schema configurable per
# pipeline, they could instead be looked up with
# spark.conf.get("source_catalog", ...) / spark.conf.get("source_schema", ...)
# and combined into these names.
sales_table, customers_table = (
    "data_university.dlt.demo_sales_source",
    "data_university.dlt.demo_customers_source",
)

In [0]:
# --------- BRONZE: Raw Sales Data (batch read of the source table) ---------
@dlt.table(
    comment=f"Raw sales data from {sales_table}",
    name="bronze_sales",
)
def bronze_sales():
    """Return the source sales table, unmodified, as a batch DataFrame.

    The dataset is registered with dlt under the name ``bronze_sales``.
    A batch reader is used to keep the demo simple; a real streaming source
    would be read with ``spark.readStream`` instead.
    """
    delta_reader = spark.read.format("delta")
    return delta_reader.table(sales_table)

# --------- BRONZE: Raw Customers Data (batch read of the source table) ---------
@dlt.table(
    comment=f"Raw customers data from {customers_table}",
    name="bronze_customers",
)
def bronze_customers():
    """Return the source customers table, unmodified, as a batch DataFrame.

    The dataset is registered with dlt under the name ``bronze_customers``.
    """
    delta_reader = spark.read.format("delta")
    return delta_reader.table(customers_table)

In [0]:
# --------- SILVER: Sales Enriched with Customer Attributes ---------
@dlt.table(
    comment="Sales data with customer names",
    name="silver_sales_with_customer",
)
def silver_sales_with_customer():
    """Left-join the bronze sales rows to the bronze customers on ``customer_id``.

    Because the join is a left join driven by the sales side, every sales row
    is kept even when no customer with the same ``customer_id`` exists.
    """
    sales_df = spark.read.table("LIVE.bronze_sales")
    customers_df = spark.read.table("LIVE.bronze_customers")
    enriched_df = sales_df.join(customers_df, on="customer_id", how="left")
    return enriched_df

In [0]:
# --------- GOLD: Total Sales per Customer ---------
@dlt.table(
    comment="Total sales amount by customer",
    name="gold_sales_by_customer",
)
def gold_sales_by_customer():
    """Sum ``amount`` per (``customer_id``, ``customer_name``) pair.

    Reads the silver sales/customer dataset and returns one row per pair with
    the summed amount in ``total_amount``, sorted from largest to smallest
    total.
    """
    enriched_sales = spark.read.table("LIVE.silver_sales_with_customer")
    per_customer = enriched_sales.groupBy("customer_id", "customer_name")
    totals = per_customer.agg(pysum("amount").alias("total_amount"))
    # NOTE(review): the sort only affects the order in which rows are produced;
    # confirm that downstream readers actually rely on it.
    return totals.orderBy(desc("total_amount"))

# --------- GOLD: Total Sales per Day ---------
@dlt.table(
    comment="Total sales per day",
    name="gold_daily_sales",
)
def gold_daily_sales():
    """Sum ``amount`` per ``order_date``.

    Reads directly from the bronze sales dataset (no customer attributes are
    needed) and returns one row per ``order_date`` with the summed amount in
    ``total_amount``, sorted by ``order_date`` ascending.
    """
    raw_sales = spark.read.table("LIVE.bronze_sales")
    per_day = raw_sales.groupBy("order_date")
    daily_totals = per_day.agg(pysum("amount").alias("total_amount"))
    return daily_totals.orderBy("order_date")