In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import coalesce, col, from_json
from pyspark.sql.types import StructType, StructField, StringType

In [None]:
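# Optional (disabled): print the PySpark version and ask spark-submit to fetch
# the Kafka connector packages via --packages before the session is created.
# import pyspark
# print(pyspark.__version__)
# import os
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.5.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0 pyspark-shell'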

In [2]:
# Kafka connection settings used by every stream in this notebook.
KAFKA_HOST = "broker:9092"  # alternative: "localhost:9092"
KAFKA_TOPIC = "cdc.public.transactions"

# Alternative session setup (disabled) that requests the Kafka SQL connector
# through spark.jars.packages:
#
# spark = (
#     SparkSession.builder
#     .appName("KafkaIntegration")
#     .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
#     .getOrCreate()
# )

# Create the SparkSession, or reuse the one that already exists.
spark = (
    SparkSession.builder
    .appName("KafkaStructuredStreaming")
    .getOrCreate()
)

In [3]:
# Schema of the "after" object carried in each JSON message.
# Every column is declared as a (nullable) string; the order below is the
# column order of the resulting struct.
after_schema = StructType(
    [
        StructField(column_name, StringType())
        for column_name in (
            "transaction_id",
            "user_id",
            "timestamp",
            "amount",
            "currency",
            "city",
            "country",
            "merchant_name",
            "payment_method",
            "ip_address",
            "voucher_code",
            "affiliateid",
            "modified_by",
            "modified_at",
            "change_info",
        )
    ]
)

#### READ STREAM

In [4]:
# Streaming DataFrame over the Kafka topic, starting at the earliest
# available offsets so already-published messages are read as well.
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(
        **{
            "kafka.bootstrap.servers": KAFKA_HOST,
            "subscribe": KAFKA_TOPIC,
            "startingOffsets": "earliest",
        }
    )
    .load()
)

#### WRITE STREAM - CHECK 1

In [5]:
# Sanity check: stream the raw, unparsed Kafka records to the console sink.
raw_query = (
    kafka_df.writeStream
    .format("console")
    .outputMode("append")
    .start()
)
# Wait at most 20 seconds; the result is displayed as the cell output.
raw_query.awaitTermination(timeout=20)

False

In [6]:
raw_query.stop()  # Stop the query to proceed with the rest of your code

#### WRITE STREAM - CHECK 2

In [12]:
# Name given to the streaming query; the polling cell below reads from a
# table with this name via spark.sql.
queryName = "streaming_table"

# Append the raw Kafka records to the in-memory sink.
query = (
    kafka_df.writeStream
    .queryName(queryName)
    .format("memory")
    .outputMode("append")
    .start()
)

In [15]:
import time
from IPython.display import display, clear_output

# Poll the in-memory table every 5 seconds until the kernel is interrupted.
try:
    while True:
        # Replace the previous snapshot instead of appending below it
        clear_output(wait=True)
        # DataFrame.show() prints the table itself and returns None, so it is
        # called directly; wrapping it in display() rendered a stray "None".
        spark.sql(f"SELECT * FROM {queryName} LIMIT 1").show(5, False)
        # Wait for a few seconds before the next update
        time.sleep(5)
except KeyboardInterrupt:
    print("Streaming stopped.")

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

None

Streaming stopped.


In [16]:
# Stop the memory-sink query before starting the next stream
query.stop()

#### PROCESS & WRITE STREAM

In [None]:
# The message value is a change-event envelope whose "after" member holds the
# row image described by after_schema. Applying after_schema to the whole
# message would match none of the top-level keys and yield only nulls, so the
# envelope is parsed first and "after" is extracted from it.
# Both layouts are accepted: {"after": {...}} and {"payload": {"after": {...}}}
# (Debezium-style with/without the schema wrapper — TODO confirm which one the
# connector in use actually emits).
envelope_schema = StructType(
    [
        StructField("after", after_schema),
        StructField(
            "payload",
            StructType([StructField("after", after_schema)]),
        ),
    ]
)

# Parse the JSON messages from Kafka and select the "after" field
parsed_df = (
    kafka_df.select(
        from_json(col("value").cast("string"), envelope_schema).alias("event")
    )
    .select(
        coalesce(col("event.payload.after"), col("event.after")).alias("after")
    )
    # Events without an "after" image (null values, unparsable JSON, or events
    # that carry no new row state) would only produce all-null rows.
    .where(col("after").isNotNull())
)

# Flatten the "after" structure into individual columns
flattened_df = parsed_df.select("after.*")

# Show the parsed DataFrame on the console (for demonstration purposes)
query = flattened_df.writeStream.outputMode("append").format("console").start()

# Block until the kernel is interrupted, then always stop the query so it does
# not keep running in the background of the Spark session.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    print("Streaming stopped.")
finally:
    query.stop()