In [0]:
from pyspark.sql import Row
from pyspark.sql.functions import *
from datetime import datetime

In [0]:

# # ===================================================================
# # CREATE DEMO SOURCE TABLES FOR SCD OPERATIONS
# # Run this in a standard notebook before creating the DLT pipeline
# # ===================================================================

# from pyspark.sql import Row
# from pyspark.sql.functions import *
# from datetime import datetime

# print("Creating demo source tables for SCD operations...")

# # ===================================================================
# # Customer Master Data (Main Source)
# # ===================================================================

# initial_customers = [
#     Row(customer_id=1, name="Alice Johnson", email="alice@email.com", city="New York", 
#         tier="Gold", status="Active", last_updated="2024-01-01 10:00:00"),
#     Row(customer_id=2, name="Bob Smith", email="bob@email.com", city="Chicago", 
#         tier="Silver", status="Active", last_updated="2024-01-01 10:00:00"),
#     Row(customer_id=3, name="Carol Davis", email="carol@email.com", city="Miami", 
#         tier="Bronze", status="Active", last_updated="2024-01-01 10:00:00"),
#     Row(customer_id=4, name="David Wilson", email="david@email.com", city="Seattle", 
#         tier="Gold", status="Active", last_updated="2024-01-01 10:00:00"),
# ]

# customers_df = spark.createDataFrame(initial_customers)
# customers_df.write.format("delta").mode("overwrite").saveAsTable("data_university.dlt.scd_customers_source")

# # ===================================================================
# # Change Data Capture (CDC) Events Table
# # ===================================================================

# # This table simulates incoming change events for realistic SCD processing
# initial_cdc_events = [
#     Row(customer_id=1, name="Alice Johnson", email="alice@email.com", city="New York", 
#         tier="Gold", operation="INSERT", event_timestamp="2024-01-01 10:00:00", change_sequence=1),
#     Row(customer_id=2, name="Bob Smith", email="bob@email.com", city="Chicago", 
#         tier="Silver", operation="INSERT", event_timestamp="2024-01-01 10:00:00", change_sequence=2),
#     Row(customer_id=3, name="Carol Davis", email="carol@email.com", city="Miami", 
#         tier="Bronze", operation="INSERT", event_timestamp="2024-01-01 10:00:00", change_sequence=3),
#     Row(customer_id=4, name="David Wilson", email="david@email.com", city="Seattle", 
#         tier="Gold", operation="INSERT", event_timestamp="2024-01-01 10:00:00", change_sequence=4),
# ]

# cdc_df = spark.createDataFrame(initial_cdc_events)
# cdc_df.write.format("delta").mode("overwrite").saveAsTable("data_university.dlt.scd_cdc_events")

# print("Demo source tables created successfully!")
# print("\nCustomers Source:")
# spark.sql("SELECT * FROM data_university.dlt.scd_customers_source").show()
# print("\nCDC Events:")
# spark.sql("SELECT * FROM data_university.dlt.scd_cdc_events").show()


In [0]:
# ===================================================================
# DLT PIPELINE: COMPREHENSIVE SCD OPERATIONS DEMO
# Use this notebook as source code for your DLT pipeline
# ===================================================================

import dlt
from pyspark.sql.functions import *

# Pipeline parameters.
# Catalog and schema are looked up in the pipeline configuration so the same
# source can be pointed at another environment without editing code. The
# defaults reproduce the original hard-coded location when no configuration
# value is supplied.
catalog = spark.conf.get("scd_demo.catalog", "data_university")
schema = spark.conf.get("scd_demo.schema", "dlt")

# Fully qualified names of the two source tables read by the bronze layer.
customers_source = f"{catalog}.{schema}.scd_customers_source"
cdc_source = f"{catalog}.{schema}.scd_cdc_events"

print(f"Reading from customers source: {customers_source}")
print(f"Reading from CDC source: {cdc_source}")

# ===================================================================
# BRONZE LAYER: Raw Data Ingestion
# ===================================================================

@dlt.table(
    name="bronze_customers_raw",
    comment="Raw customer data for SCD processing",
)
def bronze_customers_raw():
    """Stream the customers source table into the bronze layer unchanged."""
    delta_stream_reader = spark.readStream.format("delta")
    # The source table name is built from the pipeline parameters above.
    return delta_stream_reader.table(customers_source)

@dlt.table(
    name="bronze_cdc_events",
    comment="Raw CDC events for change data capture",
)
def bronze_cdc_events():
    """Stream the CDC events source table into the bronze layer unchanged."""
    delta_stream_reader = spark.readStream.format("delta")
    # This table is the source for both apply_changes flows defined below.
    return delta_stream_reader.table(cdc_source)

# ===================================================================
# SCD TYPE 1: Overwrite Changes (No History Preservation)
# ===================================================================

# Declare the streaming table that the SCD Type 1 flow writes into.
dlt.create_streaming_table(
    comment="SCD Type 1 - Overwrites changes, no history tracking",
    name="silver_customers_scd1",
)

# Route CDC events into the SCD1 target, keyed by customer_id and ordered by
# change_sequence.
dlt.apply_changes(
    source="bronze_cdc_events",
    target="silver_customers_scd1",
    stored_as_scd_type="1",
    keys=["customer_id"],
    sequence_by=col("change_sequence"),
    # Events whose operation column matches are applied as deletes / truncates
    # instead of upserts.
    apply_as_deletes=expr("operation = 'DELETE'"),
    apply_as_truncates=expr("operation = 'TRUNCATE'"),
    # CDC bookkeeping columns are left out of the target table.
    except_column_list=["operation", "event_timestamp", "change_sequence"],
)

  

# ===================================================================
# SCD TYPE 2: Preserve Complete History
# ===================================================================

# Declare the streaming table that the SCD Type 2 flow writes into.
dlt.create_streaming_table(
    comment="SCD Type 2 - Preserves complete history with effective dates",
    name="silver_customers_scd2",
)

# Route the same CDC events into the SCD2 target. The gold tables below read
# the __START_AT / __END_AT columns of this table.
dlt.apply_changes(
    source="bronze_cdc_events",
    target="silver_customers_scd2",
    stored_as_scd_type="2",
    keys=["customer_id"],
    sequence_by=col("change_sequence"),
    # Events whose operation column is DELETE are applied as deletes.
    apply_as_deletes=expr("operation = 'DELETE'"),
    # CDC bookkeeping columns are left out of the target table.
    except_column_list=["operation", "event_timestamp", "change_sequence"],
)

# ===================================================================
# GOLD LAYER: Business Analytics on SCD Data
# ===================================================================

@dlt.table(
    name="gold_customer_current_state",
    comment="Current active customer snapshot from SCD2",
)
def gold_customer_current_state():
    """Return the open (not end-dated) version of each customer from the SCD2 table."""
    snapshot_columns = ["customer_id", "name", "email", "city", "tier", "__START_AT"]
    scd2_history = spark.read.table("LIVE.silver_customers_scd2")
    # A version counts as current while its __END_AT is still null.
    open_versions = scd2_history.filter(col("__END_AT").isNull())
    return open_versions.select(*snapshot_columns).orderBy("customer_id")

@dlt.table(
    name="gold_customer_history_analysis",
    comment="Analysis of customer changes over time"
)
def gold_customer_history_analysis():
    """Summarise, per customer, the versions recorded in the SCD2 table.

    Output columns:
        customer_id     -- grouping key.
        customer_name   -- name on the version with the greatest __START_AT.
        total_versions  -- number of rows stored for the customer.
        first_seen      -- smallest __START_AT.
        last_changed    -- greatest __START_AT.
        active_versions -- number of rows whose __END_AT is null.
    """
    # Namespaced import: the module-level star import rebinds min/max/sum, so
    # the F. prefix makes it explicit that Spark aggregates are intended.
    from pyspark.sql import functions as F

    return (
        spark.read.table("LIVE.silver_customers_scd2")
        .groupBy("customer_id")
        .agg(
            # first() may return the name from any version of the customer and
            # can differ between runs; max_by pins it to the latest version.
            F.max_by("name", "__START_AT").alias("customer_name"),
            F.count("*").alias("total_versions"),
            F.min("__START_AT").alias("first_seen"),
            F.max("__START_AT").alias("last_changed"),
            F.sum(
                F.when(F.col("__END_AT").isNull(), 1).otherwise(0)
            ).alias("active_versions"),
        )
        .orderBy("customer_id")
    )

# ===================================================================
# DEMONSTRATION TABLES: Delete and Truncate Operations
# ===================================================================

@dlt.table(
    name="demo_scd1_with_deletes",
    comment="SCD1 table demonstrating delete operations",
)
def demo_scd1_with_deletes():
    """Return the SCD1 table minus one hard-coded customer to imitate a delete."""
    # Customer id that this demo pretends has been deleted.
    simulated_deleted_id = 999
    scd1_rows = spark.read.table("LIVE.silver_customers_scd1")
    return scd1_rows.filter(col("customer_id") != simulated_deleted_id)

@dlt.table(
    name="demo_scd2_soft_deletes",
    comment="SCD2 table showing soft delete behavior"
)
def demo_scd2_soft_deletes():
    """Label every SCD2 version as Active, Superseded or Deleted.

    record_status values:
        Active     -- __END_AT is null (the version is still open).
        Superseded -- the version was closed by a newer version of the same
                      customer, i.e. the next version starts at this __END_AT.
        Deleted    -- the version was closed and no version starts at its
                      __END_AT.

    Previously every closed version was labelled "Deleted", which reported
    ordinary updates as deletes.
    """
    # Local import: Window is not provided by the pyspark.sql.functions star import.
    from pyspark.sql.window import Window

    versions_by_customer = Window.partitionBy("customer_id").orderBy("__START_AT")
    next_version_start = lead("__START_AT").over(versions_by_customer)

    # NOTE(review): relies on apply_changes closing a superseded version at the
    # sequence value where the next version starts -- confirm against the
    # pipeline output.
    record_status = (
        when(col("__END_AT").isNull(), "Active")
        # A null next_version_start makes this comparison null, so the last
        # closed version of a customer falls through to "Deleted".
        .when(next_version_start == col("__END_AT"), "Superseded")
        .otherwise("Deleted")
        .alias("record_status")
    )

    return (
        spark.read.table("LIVE.silver_customers_scd2")
        .select(
            "customer_id", "name", "city", "tier",
            "__START_AT", "__END_AT",
            record_status
        )
        .orderBy("customer_id", "__START_AT")
    )

# ===================================================================
# UTILITY VIEWS
# ===================================================================

@dlt.view(
    name="view_change_summary",
    comment="Summary of all change events processed",
)
def view_change_summary():
    """Count bronze CDC events per operation, with first and last event timestamps."""
    cdc_events = spark.read.table("LIVE.bronze_cdc_events")
    events_by_operation = cdc_events.groupBy("operation")
    # count/min/max here are the Spark aggregates brought in by the star import.
    operation_summary = events_by_operation.agg(
        count("*").alias("event_count"),
        min("event_timestamp").alias("first_event"),
        max("event_timestamp").alias("last_event"),
    )
    return operation_summary.orderBy("operation")


### SCD1 Changes - Updates and New Records

In [0]:
# # ===================================================================
# # DEMONSTRATE SCD1 BEHAVIOR
# # Run this after initial pipeline execution
# # ===================================================================

# print("Adding SCD1 changes - updates and new records...")

# # Add change events for SCD1 demonstration
# scd1_changes = [
#     # Update Alice's email, city and tier (SCD1 will overwrite)
#     Row(customer_id=1, name="Alice Johnson", email="alice.johnson@newemail.com", city="Boston", 
#         tier="Platinum", operation="UPDATE", event_timestamp="2024-01-15 09:00:00", change_sequence=5),
    
#     # Update Bob's city and tier (SCD1 will overwrite)
#     Row(customer_id=2, name="Bob Smith", email="bob@email.com", city="Denver", 
#         tier="Gold", operation="UPDATE", event_timestamp="2024-01-15 10:00:00", change_sequence=6),
    
#     # New customer
#     Row(customer_id=5, name="Eva Brown", email="eva@email.com", city="Portland", 
#         tier="Silver", operation="INSERT", event_timestamp="2024-01-15 11:00:00", change_sequence=7),
# ]

# scd1_df = spark.createDataFrame(scd1_changes)
# scd1_df.write.format("delta").mode("append").saveAsTable("data_university.dlt.scd_cdc_events")

# print("SCD1 changes added. Re-run DLT pipeline to see overwrite behavior.")

### SCD2 Changes - Historical Tracking

In [0]:
# # ===================================================================
# # DEMONSTRATE SCD2 BEHAVIOR
# # Run this after SCD1 changes and pipeline update
# # ===================================================================

# print("Adding SCD2 changes - historical tracking...")

# # Add more change events to demonstrate SCD2 history preservation
# scd2_changes = [
#     # Multiple changes for Carol to show history
#     Row(customer_id=3, name="Carol Davis", email="carol@email.com", city="Orlando", 
#         tier="Silver", operation="UPDATE", event_timestamp="2024-01-20 09:00:00", change_sequence=8),
    
#     Row(customer_id=3, name="Carol Davis-Smith", email="carol.smith@email.com", city="Orlando", 
#         tier="Gold", operation="UPDATE", event_timestamp="2024-01-25 10:00:00", change_sequence=9),
    
#     # Change for David
#     Row(customer_id=4, name="David Wilson", email="david.wilson@email.com", city="Portland", 
#         tier="Platinum", operation="UPDATE", event_timestamp="2024-01-22 14:00:00", change_sequence=10),
# ]

# scd2_df = spark.createDataFrame(scd2_changes)
# scd2_df.write.format("delta").mode("append").saveAsTable("data_university.dlt.scd_cdc_events")

# print("SCD2 changes added. Re-run DLT pipeline to see history preservation.")


### Delete Operations

In [0]:
# # ===================================================================
# # DEMONSTRATE DELETE OPERATIONS
# # ===================================================================

# print("Adding delete operations...")

# # Add delete events
# delete_events = [
#     # Soft delete for customer 4
#     Row(customer_id=4, name="David Wilson", email="david.wilson@email.com", city="Portland", 
#         tier="Platinum", operation="DELETE", event_timestamp="2024-01-30 15:00:00", change_sequence=11),
# ]

# delete_df = spark.createDataFrame(delete_events)
# delete_df.write.format("delta").mode("append").saveAsTable("data_university.dlt.scd_cdc_events")

# print("Delete events added. Re-run DLT pipeline to see delete handling.")
# print("- SCD1: Record will be removed")
# print("- SCD2: Record will have __END_AT populated (soft delete)")


### Backfilling Historical Data

In [0]:
# # ===================================================================
# # DEMONSTRATE BACKFILLING FOR SCD2
# # ===================================================================

# print("Adding historical data for backfilling...")

# # Add historical events with earlier timestamps
# backfill_events = [
#     # Historical data for Alice before her first recorded change
#     Row(customer_id=1, name="Alice Smith", email="alice.smith@oldmail.com", city="Philadelphia", 
#         tier="Silver", operation="INSERT", event_timestamp="2023-12-01 08:00:00", change_sequence=0),
    
#     # Historical change for Bob
#     Row(customer_id=2, name="Robert Smith", email="robert@email.com", city="Milwaukee", 
#         tier="Bronze", operation="INSERT", event_timestamp="2023-12-15 09:00:00", change_sequence=1),
# ]

# backfill_df = spark.createDataFrame(backfill_events)
# backfill_df.write.format("delta").mode("append").saveAsTable("data_university.dlt.scd_cdc_events")

# print("Historical events added for backfilling!")
# print("Re-run DLT pipeline to see how SCD2 handles out-of-order data.")


### Add a TRUNCATE Event

In [0]:
# from pyspark.sql import Row

# truncate_event = [
#     Row(customer_id=1, name="Sourav", email="sourav@oldmail.com", city="Bangalore", tier="Silver",
#         operation="TRUNCATE", event_timestamp="2024-02-02 09:00:00", change_sequence=4)
# ]
# spark.createDataFrame(truncate_event).write.format("delta").mode("append").saveAsTable("data_university.dlt.scd_cdc_events")


In [0]:
# new_data = [
#     Row(customer_id=4, name="David Wilson", email="david@email.com", city="Seattle", tier="Gold",
#         operation="INSERT", event_timestamp="2024-02-01 09:01:00", change_sequence=5),
#     Row(customer_id=5, name="Eva Brown", email="eva@email.com", city="Portland", tier="Silver",
#         operation="INSERT", event_timestamp="2024-02-01 09:02:00", change_sequence=6)
# ]
# spark.createDataFrame(new_data).write.format("delta").mode("append").saveAsTable("data_university.dlt.scd_cdc_events")
