In [0]:
# ===================================================================
# CREATE DEMO SOURCE TABLES FOR SCD OPERATIONS
# Run this in a standard notebook before creating the DLT pipeline
# ===================================================================

from pyspark.sql import Row
from pyspark.sql.functions import *
from datetime import datetime

print("Creating demo source tables for SCD operations...")

# ===================================================================
# Customer Master Data (Main Source)
# ===================================================================

# Every seed customer shares the same status and last-updated value, so only
# the per-customer attributes are listed here; the shared values are filled in
# when the rows are built.
_seed_status = "Active"
_seed_last_updated = "2024-01-01 10:00:00"
_customer_seed = (
    (1, "Alice Johnson", "alice@email.com", "New York", "Gold"),
    (2, "Bob Smith", "bob@email.com", "Chicago", "Silver"),
    (3, "Carol Davis", "carol@email.com", "Miami", "Bronze"),
    (4, "David Wilson", "david@email.com", "Seattle", "Gold"),
)

initial_customers = [
    Row(
        customer_id=cust_id,
        name=cust_name,
        email=cust_email,
        city=cust_city,
        tier=cust_tier,
        status=_seed_status,
        last_updated=_seed_last_updated,
    )
    for cust_id, cust_name, cust_email, cust_city, cust_tier in _customer_seed
]

# Column types are inferred from the Row values (no explicit schema is given).
customers_df = spark.createDataFrame(initial_customers)

# Overwrite mode replaces the table contents on every run of this cell.
(
    customers_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("data_university.dlt.scd_customers_source")
)

# ===================================================================
# Change Data Capture (CDC) Events Table
# ===================================================================

# This table simulates incoming change events for realistic SCD processing.
# The seed load consists of one INSERT event per customer, all carrying the
# same event timestamp; change_sequence numbers the events 1..N in list order.
_seed_operation = "INSERT"
_seed_event_timestamp = "2024-01-01 10:00:00"
_cdc_seed = (
    (1, "Alice Johnson", "alice@email.com", "New York", "Gold"),
    (2, "Bob Smith", "bob@email.com", "Chicago", "Silver"),
    (3, "Carol Davis", "carol@email.com", "Miami", "Bronze"),
    (4, "David Wilson", "david@email.com", "Seattle", "Gold"),
)

initial_cdc_events = [
    Row(
        customer_id=cust_id,
        name=cust_name,
        email=cust_email,
        city=cust_city,
        tier=cust_tier,
        operation=_seed_operation,
        event_timestamp=_seed_event_timestamp,
        change_sequence=sequence_no,
    )
    for sequence_no, (cust_id, cust_name, cust_email, cust_city, cust_tier)
    in enumerate(_cdc_seed, start=1)
]

# Column types are inferred from the Row values (no explicit schema is given).
cdc_df = spark.createDataFrame(initial_cdc_events)

# Overwrite mode replaces the table contents on every run of this cell.
(
    cdc_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("data_university.dlt.scd_cdc_events")
)

print("Demo source tables created successfully!")

# Read each table back and display it under its heading so the written
# contents can be checked by eye.
_table_previews = (
    ("\nCustomers Source:", "SELECT * FROM data_university.dlt.scd_customers_source"),
    ("\nCDC Events:", "SELECT * FROM data_university.dlt.scd_cdc_events"),
)
for _heading, _query in _table_previews:
    print(_heading)
    spark.sql(_query).show()
