In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, IntegerType, TimestampType, FloatType

In [32]:
# Build (or reuse) the Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .master("local[6]")  # local mode, 6 worker threads
    .appName("KafkaStreamingExample")
    # Kafka source package for Structured Streaming, resolved when the session starts.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.8")
    .getOrCreate()
)

In [23]:
# Kafka connection details.
#
# SECURITY: the API key and secret are read from the environment so that they
# are never stored in the notebook. Any key pair that was ever committed in
# this file should be treated as compromised and rotated.
#
# Environment variables:
#   KAFKA_BOOTSTRAP_SERVERS  (optional) broker address, defaults to the value below
#   KAFKA_TOPIC              (optional) topic to subscribe to, defaults to the value below
#   KAFKA_API_KEY            (required) SASL/PLAIN username
#   KAFKA_API_SECRET         (required) SASL/PLAIN password
bootstrap_servers = os.environ.get(
    "KAFKA_BOOTSTRAP_SERVERS", "pkc-56d1g.eastus.azure.confluent.cloud:9092"
)
kafka_topic = os.environ.get("KAFKA_TOPIC", "Hassan_topic")
kafka_username = os.environ.get("KAFKA_API_KEY", "")
kafka_password = os.environ.get("KAFKA_API_SECRET", "")

# Surface a missing credential here rather than as an opaque authentication
# error once the stream is started.
if not (kafka_username and kafka_password):
    print(
        "WARNING: KAFKA_API_KEY and/or KAFKA_API_SECRET are not set; "
        "the Kafka source will not be able to authenticate."
    )

In [16]:
# Schema of the JSON document carried in each Kafka message value.

# Nested "metadata" object, built on its own so the top-level schema stays flat to read.
metadata_schema = (
    StructType()
    .add("category", StringType())
    .add("source", StringType())
)

# Top-level event fields, in the same order as before.
schema = (
    StructType()
    .add("eventType", StringType())
    .add("customerId", StringType())
    .add("productId", StringType())
    .add("timestamp", TimestampType())
    .add("metadata", metadata_schema)
    .add("quantity", IntegerType())
    .add("totalAmount", FloatType())
    .add("paymentMethod", StringType())
    .add("recommendedProductId", StringType())
    .add("algorithm", StringType())
)

In [24]:
# Open the Kafka topic as a streaming DataFrame.

# SASL/PLAIN login module configuration carrying the API key and secret.
jaas_config = f'org.apache.kafka.common.security.plain.PlainLoginModule required username="{kafka_username}" password="{kafka_password}";'

# All source options in one place; keys prefixed with "kafka." are the
# Kafka client settings, the others are options of the Spark Kafka source.
kafka_source_options = {
    "kafka.bootstrap.servers": bootstrap_servers,
    "subscribe": kafka_topic,
    "startingOffsets": "earliest",
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.sasl.jaas.config": jaas_config,
}

df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [25]:
# Decode the Kafka payload step by step:
#   1. cast the message value to a string,
#   2. parse that string as JSON against `schema` into a single struct column "data",
#   3. expand the struct so each field becomes a top-level column.
value_as_text = df.selectExpr("CAST(value AS STRING)")
parsed_events = value_as_text.select(from_json("value", schema).alias("data"))
json_df = parsed_events.select("data.*")

In [19]:
# Redistribute the parsed events over a fixed number of partitions;
# this is the DataFrame intended for the HDFS write.
num_output_partitions = 4
repartitioned_df = json_df.repartition(num_output_partitions)

In [43]:
# Start the streaming query that appends the parsed events to HDFS as Parquet.
#
# The sink is fed from `repartitioned_df` rather than `json_df`: the
# repartition step was defined for this write, and writing `json_df`
# directly leaves it unused.
# NOTE(review): if a checkpoint from a run that wrote `json_df` already exists
# at the checkpoint location, confirm the query restarts cleanly from it, or
# start from a fresh checkpoint directory.
query = (
    repartitioned_df.writeStream
    .outputMode("append")  # only new rows are written
    .format("parquet")
    .option("path", "hdfs://localhost:9000/data/streaming/streaming-output")
    .option("checkpointLocation", "hdfs://localhost:9000/data/streaming/checkpoint")
    .start()
)

* The outputMode is set to append so that only newly arrived rows are added to the existing dataset.
* The format is set to parquet to write the data in Parquet format.
* The path option specifies the HDFS directory where the Parquet files will be written.
* The checkpointLocation option specifies the HDFS directory for checkpointing to ensure fault tolerance.

In [44]:
# Batch-read the Parquet data the streaming query has written so far.
#
# - Uses the same fully qualified HDFS URI as the writer, so the read does not
#   depend on the cluster's default filesystem setting.
# - Plain string: the previous f-string had no placeholders.
# - Reads the sink directory itself instead of a "/*" glob.
#   NOTE(review): reading the directory should let Spark use the sink's
#   _spark_metadata log to pick up only committed files; confirm on this
#   Spark version.
events = spark.read.parquet("hdfs://localhost:9000/data/streaming/streaming-output")
events

eventType,customerId,productId,timestamp,metadata,quantity,totalAmount,paymentMethod,recommendedProductId,algorithm
recommendationClick,71818,4109,2024-07-11 22:55:03,"[,]",,,,2461.0,content_based
purchase,39173,6121,2024-07-10 12:00:24,"[,]",1.0,237.6,PayPal,,
recommendationClick,15996,4534,2024-07-12 02:55:52,"[,]",,,,9629.0,content_based
addToCart,41037,6064,2024-07-12 02:55:14,"[,]",4.0,,,,
productView,25519,6873,2024-07-10 12:00:38,"[Home & Kitchen, ...",,,,,
purchase,29354,7267,2024-07-11 22:55:41,"[,]",3.0,207.42,Debit Card,,
recommendationClick,28824,5021,2024-07-12 02:56:03,"[,]",,,,7370.0,collaborative_fil...
addToCart,17617,1792,2024-07-10 12:00:14,"[,]",2.0,,,,
purchase,77878,9731,2024-07-10 12:00:41,"[,]",4.0,248.52,Debit Card,,
addToCart,47828,9858,2024-07-12 02:55:23,"[,]",1.0,,,,


In [None]:
# query.awaitTermination()

In [45]:
# Stop the streaming write started above so it no longer appends to HDFS.
query.stop()

In [None]:
# spark.stop()

In [46]:
# Register json_df (the parsed Kafka stream) as the temporary view "kafka_data"
# so it can be queried through spark.sql.
json_df.createOrReplaceTempView("kafka_data")

In [47]:
# Query the "kafka_data" view with SQL.
# Every event field is selected; the nested metadata struct is flattened so
# that `category` and `source` come out as top-level columns.
flatten_events_sql = """
    SELECT 
        eventType,
        customerId,
        productId,
        timestamp,
        metadata.category,
        metadata.source,
        quantity,
        totalAmount,
        paymentMethod,
        recommendedProductId,
        algorithm
    FROM kafka_data
"""
result_df = spark.sql(flatten_events_sql)