In [0]:
# # ===================================================================
# # CREATE DEMO SOURCE DELTA TABLE
# # Run this in a standard notebook before creating the DLT pipeline
# # ===================================================================

# from pyspark.sql import Row
# from pyspark.sql.functions import *

# print("Creating demo source table...")

# # Create realistic but simple sales data
# initial_sales_data = [
#     Row(order_id=1, customer_id=101, product="Laptop", amount=1200.0, order_date="2024-01-01"),
#     Row(order_id=2, customer_id=102, product="Mouse", amount=25.0, order_date="2024-01-02"),
#     Row(order_id=3, customer_id=103, product="Keyboard", amount=75.0, order_date="2024-01-03"),
#     Row(order_id=4, customer_id=101, product="Monitor", amount=300.0, order_date="2024-01-04"),
# ]

# # Create DataFrame and write as Delta table
# sales_df = spark.createDataFrame(initial_sales_data)
# sales_df.write.format("delta").mode("overwrite").saveAsTable("data_university.dlt.demo_sales_01")

# print("Demo source table created with 4 records:")
# spark.sql("SELECT * FROM data_university.dlt.demo_sales_01").show()

In [0]:
# ===================================================================
# DLT PIPELINE: STREAMING TABLE vs MATERIALIZED VIEW DEMO
# Use this notebook as source code for your DLT pipeline
# ===================================================================

import dlt
from pyspark.sql.functions import col, sum as pysum, count, desc, avg

# Pipeline parameters for source configuration.
# Each value can be overridden from the pipeline's configuration settings
# (key/value pairs surfaced through spark.conf). The defaults reproduce the
# original demo source, so the pipeline behaves the same when nothing is set.
catalog = spark.conf.get("demo.source_catalog", "data_university")
schema = spark.conf.get("demo.source_schema", "dlt")
sales_source_table = f"{catalog}.{schema}.demo_sales_01"

print(f"Reading from source: {sales_source_table}")

# ===================================================================
# STREAMING TABLE: Uses spark.readStream for incremental processing
# ===================================================================

@dlt.table(
    name="bronze_streaming_sales",
    comment=f"DLT Streaming Table - Incremental processing from {sales_source_table}"
)
def bronze_streaming_sales():
    """
    Define the bronze STREAMING TABLE over the demo sales source.

    Returning a streaming DataFrame (spark.readStream) from a @dlt.table
    function is what makes DLT treat this dataset as a streaming table.
    - Each pipeline update ingests only the rows appended to the source
      since the previous update.
    - The Delta streaming source is append-only: rows that are updated or
      deleted in the source are NOT picked up as changes and, by default,
      make the stream fail. Use a batch read (materialized view) or a
      change feed when the source is modified in place.
    - Stream progress between updates is tracked by the pipeline.
    - Typical use: Bronze layer and continuous ingestion.

    Returns:
        A streaming DataFrame reading `sales_source_table`.
    """
    return (
        spark.readStream
        .format("delta")
        .table(sales_source_table)
    )

# ===================================================================
# MATERIALIZED VIEW: Uses spark.read for batch processing
# ===================================================================

@dlt.table(
    name="bronze_materialized_sales",
    comment=f"DLT Materialized View - Batch processing from {sales_source_table}"
)
def bronze_materialized_sales():
    """
    Bronze copy of the demo sales source, defined as a MATERIALIZED VIEW.

    The function returns a batch DataFrame (spark.read), so every pipeline
    update derives the result from the current contents of the source table
    instead of consuming it incrementally as a stream.

    Returns:
        A batch DataFrame reading `sales_source_table`.
    """
    batch_reader = spark.read.format("delta")
    source_snapshot = batch_reader.table(sales_source_table)
    return source_snapshot

# ===================================================================
# SILVER LAYER: Compare downstream processing from both sources
# ===================================================================

def _summarize_by_customer(orders):
    """
    Aggregate an orders DataFrame into one row per customer.

    Args:
        orders: DataFrame with at least `customer_id`, `order_id` and
            `amount` columns.

    Returns:
        DataFrame with `customer_id`, `total_orders`, `total_spent` and
        `avg_order_value`, sorted by `total_spent` descending.
    """
    return (
        orders
        .groupBy("customer_id")
        .agg(
            count("order_id").alias("total_orders"),
            pysum("amount").alias("total_spent"),
            avg("amount").alias("avg_order_value"),
        )
        .orderBy(desc("total_spent"))
    )


@dlt.table(
    name="silver_customer_summary_streaming",
    comment="Customer summary derived from streaming table"
)
def silver_customer_summary_streaming():
    """
    Customer summary computed from the bronze streaming table.

    The upstream table is read with a batch read (spark.read), so this
    dataset is a materialized view: the aggregate is derived from the full
    contents of bronze_streaming_sales rather than maintained as a stream.
    Only the upstream ingestion is incremental.
    """
    return _summarize_by_customer(spark.read.table("LIVE.bronze_streaming_sales"))


@dlt.table(
    name="silver_customer_summary_materialized",
    comment="Customer summary derived from materialized view"
)
def silver_customer_summary_materialized():
    """
    Customer summary computed from the bronze materialized view.

    Uses the same aggregation as silver_customer_summary_streaming so the two
    results can be compared side by side; only the upstream dataset differs.
    """
    return _summarize_by_customer(spark.read.table("LIVE.bronze_materialized_sales"))

# ===================================================================
# GOLD LAYER: Product performance analysis
# ===================================================================

@dlt.table(
    name="gold_product_performance",
    comment="Product performance metrics from streaming data"
)
def gold_product_performance():
    """
    Per-product performance metrics built on the bronze streaming table.

    Produces one row per product with its order count, total revenue and
    average order value, sorted by total revenue from highest to lowest.
    """
    bronze_sales = spark.read.table("LIVE.bronze_streaming_sales")

    # One output row per product with its order volume and revenue figures.
    per_product = bronze_sales.groupBy("product").agg(
        count("order_id").alias("order_count"),
        pysum("amount").alias("total_revenue"),
        avg("amount").alias("avg_order_value"),
    )

    return per_product.orderBy(desc("total_revenue"))




In [0]:
# ===================================================================
# VIEW: Non-materialized intermediate processing
# ===================================================================

@dlt.view(
    name="high_value_orders",
    comment="View of orders above $100 - demonstrates DLT views"
)
def high_value_orders():
    """
    Non-materialized DLT view of orders whose amount exceeds 100.

    Reads bronze_streaming_sales as a batch, keeps only the order columns
    listed below, and sorts the most expensive orders first. Declared with
    @dlt.view to contrast views with tables in the same pipeline.
    """
    order_columns = ["order_id", "customer_id", "product", "amount", "order_date"]

    bronze_sales = spark.read.table("LIVE.bronze_streaming_sales")
    expensive_orders = bronze_sales.where(col("amount") > 100.00)

    return expensive_orders.select(*order_columns).orderBy(desc("amount"))

In [0]:
# # ===================================================================
# # SIMULATE NEW DATA ARRIVAL
# # Run this after the first DLT pipeline execution
# # ===================================================================

# from pyspark.sql import Row

# print("Adding new sales data to simulate incremental ingestion...")

# # Add new sales records
# new_sales_data = [
#     Row(order_id=5, customer_id=104, product="Tablet", amount=450.0, order_date="2024-01-05"),
#     Row(order_id=6, customer_id=102, product="Headphones", amount=150.0, order_date="2024-01-06"),
#     Row(order_id=7, customer_id=101, product="Webcam", amount=80.0, order_date="2024-01-07"),
# ]

# new_sales_df = spark.createDataFrame(new_sales_data)
# new_sales_df.write.format("delta").mode("append").saveAsTable("data_university.dlt.demo_sales_01")

# print("New data added! Source table now contains:")
# spark.sql("SELECT COUNT(*) as total_records FROM data_university.dlt.demo_sales_01").show()

In [0]:
%sql
-- Verify both tables have the same final data
-- SELECT 'Streaming Table' as table_type, COUNT(*) as record_count 
-- FROM data_university.dlt.bronze_streaming_sales
-- UNION ALL
-- SELECT 'Materialized View' as table_type, COUNT(*) as record_count 
-- FROM data_university.dlt.bronze_materialized_sales;

-- Compare customer summaries (should be identical)
-- SELECT 'From Streaming' as source, * FROM data_university.dlt.silver_customer_summary_streaming
-- UNION ALL  
-- SELECT 'From Materialized' as source, * FROM data_university.dlt.silver_customer_summary_materialized
-- ORDER BY source, total_spent DESC;

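-- Check the high-value orders view
-- NOTE: datasets defined with @dlt.view exist only inside the pipeline and are not
-- published to the catalog, so this query works only if high_value_orders is
-- redefined with @dlt.table.
-- SELECT * FROM data_university.dlt.high_value_orders;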

-- View product performance
-- SELECT * FROM data_university.dlt.gold_product_performance;
