In [0]:
# DDL for the raw UPI transactions Delta table. IF NOT EXISTS makes the
# statement safe to re-run, and the table property turns on Delta's
# change data feed for the table.
create_table_ddl = """
CREATE TABLE IF NOT EXISTS incremental_load.default.raw_upi_transactions_v1
(
    transaction_id STRING,
    upi_id STRING,
    merchant_id STRING,
    txn_amount DOUBLE,
    transaction_time TIMESTAMP,
    txn_status STRING
)
USING DELTA
TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""
spark.sql(create_table_ddl)

# NOTE(review): this message is printed even when the table already existed,
# in which case the statement above created nothing.
print("Table 'incremental_load.default.raw_upi_transactions_v1' has been created with CDC enabled.")

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType
from delta.tables import DeltaTable

# Schema of the raw mock rows. transaction_time is declared as a string here
# because the sample tuples below carry it as text; it is cast to a proper
# timestamp when each batch DataFrame is built (see _build_batch_df).
schema = StructType([
    StructField("transaction_id", StringType(), True),
    StructField("upi_id", StringType(), True),
    StructField("merchant_id", StringType(), True),
    StructField("txn_amount", DoubleType(), True),
    StructField("transaction_time", StringType(), True),
    StructField("txn_status", StringType(), True)
])

# Batch 1: three transactions, all in the "initiated" state.
data_batch_1 = [
    ("txn_001", "upi_abc@bank", "m_001", 100.0, "2024-12-21 10:00:00", "initiated"),
    ("txn_002", "upi_xyz@bank", "m_002", 250.5, "2024-12-21 10:05:00", "initiated"),
    ("txn_003", "upi_pqr@bank", "m_003", 75.0, "2024-12-21 10:10:00", "initiated")
]

# Batch 2: later records for the same three ids; txn_001 and txn_002 change
# status, txn_003 only gets a newer transaction_time.
data_batch_2 = [
    ("txn_001", "upi_abc@bank", "m_001", 100.0, "2024-12-21 10:15:00", "completed"),
    ("txn_002", "upi_xyz@bank", "m_002", 250.5, "2024-12-21 10:20:00", "failed"),
    ("txn_003", "upi_pqr@bank", "m_003", 75.0, "2024-12-21 10:25:00", "initiated")
]

# Batch 3: txn_001 is refunded and txn_003 completes; txn_002 is absent.
data_batch_3 = [
    ("txn_001", "upi_abc@bank", "m_001", 100.0, "2024-12-21 10:30:00", "refunded"),
    ("txn_003", "upi_pqr@bank", "m_003", 75.0, "2024-12-21 10:35:00", "completed")
]


def _build_batch_df(rows):
    """Build one mock batch DataFrame with ``transaction_time`` as a timestamp.

    The rows are first loaded with ``schema`` (where ``transaction_time`` is a
    string) and the column is then cast explicitly to ``TimestampType`` so the
    DataFrame's column type matches the TIMESTAMP column of the target table
    instead of relying on an implicit cast during the merge.

    Args:
        rows: List of tuples ordered as the fields of ``schema``.

    Returns:
        A Spark DataFrame with the columns of ``schema`` and
        ``transaction_time`` typed as a timestamp.
    """
    raw_df = spark.createDataFrame(rows, schema)
    return raw_df.withColumn(
        "transaction_time", raw_df["transaction_time"].cast(TimestampType())
    )


mock_batches = [
    # Batch 1: initial records
    _build_batch_df(data_batch_1),
    # Batch 2: newer records for the three ids from batch 1 (no new ids)
    _build_batch_df(data_batch_2),
    # Batch 3: refund for txn_001 and completion of txn_003
    _build_batch_df(data_batch_3),
]


In [0]:
def merge_into_delta_table(delta_table_name: str, batch_df):
    """Upsert ``batch_df`` into a Delta table, matching rows on ``transaction_id``.

    Rows of the target whose ``transaction_id`` equals one in the batch have
    their remaining columns overwritten with the batch values; batch rows with
    no match in the target are inserted with all of their columns.

    Args:
        delta_table_name: Name of the target Delta table, resolved with
            ``DeltaTable.forName``.
        batch_df: DataFrame holding the incoming batch; it is aliased as
            ``source`` in the merge.

    Returns:
        None. The merge is executed for its side effect on the table.
    """
    target_table = DeltaTable.forName(spark, delta_table_name)

    join_condition = "target.transaction_id = source.transaction_id"

    # Every non-key column is taken from the incoming row on a match.
    updated_columns = (
        "upi_id",
        "merchant_id",
        "txn_amount",
        "transaction_time",
        "txn_status",
    )
    assignments = {name: f"source.{name}" for name in updated_columns}

    merge_builder = target_table.alias("target").merge(
        batch_df.alias("source"), join_condition
    )
    merge_builder = merge_builder.whenMatchedUpdate(set=assignments)
    merge_builder = merge_builder.whenNotMatchedInsertAll()
    merge_builder.execute()

In [0]:
# Merge the third mock batch (index 2) into the raw transactions table.
# NOTE(review): only this single batch is applied by this cell; the batches at
# index 0 and 1 are not merged here — confirm whether the index is meant to be
# edited and the cell re-run once per batch.
target_table_name = "incremental_load.default.raw_upi_transactions_v1"
batch_to_apply = mock_batches[2]
merge_into_delta_table(target_table_name, batch_to_apply)
print("Batch processed successfully.")