In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, IntegerType, TimestampType, FloatType

In [2]:
spark = SparkSession.builder \
    .master("local[6]")\
    .appName("KafkaStreaming") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.8") \
    .getOrCreate()

In [3]:
# Kafka connection details
bootstrap_servers = "pkc-56d1g.eastus.azure.confluent.cloud:9092"
kafka_topic = "Hassan_topic"
kafka_username = "JUKQQM4ZM632RECA"
kafka_password = "UUkrPuSttgOC0U9lY3ZansNsKfN9fbxZPFwrGxudDrfv+knTD4rCwK+KdIzVPX0D"

In [4]:
# Define schema for the incoming JSON data
schema = StructType() \
    .add("eventType", StringType()) \
    .add("customerId", StringType()) \
    .add("productId", StringType()) \
    .add("timestamp", TimestampType()) \
    .add("metadata", StructType()
         .add("category", StringType())
         .add("source", StringType())
    ) \
    .add("quantity", IntegerType()) \
    .add("totalAmount", FloatType()) \
    .add("paymentMethod", StringType()) \
    .add("recommendedProductId", StringType()) \
    .add("algorithm", StringType())

In [5]:
# Read data from Kafka topic as a streaming DataFrame
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", bootstrap_servers) \
    .option("subscribe", kafka_topic) \
    .option("startingOffsets", "earliest") \
    .option("kafka.security.protocol", "SASL_SSL") \
    .option("kafka.sasl.mechanism", "PLAIN") \
    .option("kafka.sasl.jaas.config",
            f'org.apache.kafka.common.security.plain.PlainLoginModule required username="{kafka_username}" password="{kafka_password}";') \
    .load()

In [6]:
json_df = df.selectExpr("CAST(value AS STRING)").select(from_json("value", schema).alias("data")).select("data.*")

In [7]:
query = json_df.repartition(1) \
    .writeStream \
    .outputMode("append") \
    .format("parquet") \
    .option("path", "hdfs://localhost:9000/data/streaming/streaming-output") \
    .option("checkpointLocation", "hdfs://localhost:9000/data/streaming/checkpoint") \
    .start()

* The outputMode is set to append to append new data to the existing dataset
* The format is set to parquet to write the data in Parquet format.
* The path option specifies the HDFS directory where the Parquet files will be written.
* The checkpointLocation option specifies the HDFS directory for checkpointing to ensure fault tolerance.

In [8]:
events = spark.read.parquet(f"hdfs:///data/streaming/streaming-output/*")
events

eventType,customerId,productId,timestamp,metadata,quantity,totalAmount,paymentMethod,recommendedProductId,algorithm
addToCart,77429,4408,2024-07-10 12:00:02,"[,]",1.0,,,,
addToCart,29575,9480,2024-07-10 12:00:03,"[,]",3.0,,,,
purchase,91816,7641,2024-07-10 12:00:05,"[,]",3.0,259.46,Debit Card,,
addToCart,55263,3146,2024-07-10 12:00:05,"[,]",5.0,,,,
productView,91972,7018,2024-07-10 12:00:07,"[Home & Kitchen, ...",,,,,
addToCart,10756,1518,2024-07-10 12:00:09,"[,]",5.0,,,,
recommendationClick,56982,5270,2024-07-10 12:00:12,"[,]",,,,2936.0,collaborative_fil...
addToCart,17617,1792,2024-07-10 12:00:14,"[,]",2.0,,,,
productView,91514,8582,2024-07-10 12:00:16,"[Home & Kitchen, ...",,,,,
purchase,74669,4507,2024-07-10 12:00:18,"[,]",1.0,184.78,PayPal,,


In [None]:
# query.awaitTermination()

In [45]:
#query.stop()

In [None]:
# spark.stop()

In [9]:
# Filter and select columns based on eventType
df_recommendation_click = events.filter(col("eventType") == "recommendationClick").select("eventType", "customerId", "productId", "timestamp", "recommendedProductId", "algorithm")
df_purchase = events.filter(col("eventType") == "purchase").select("eventType", "customerId", "productId", "timestamp", "quantity", "totalAmount", "paymentMethod")
df_add_to_cart = events.filter(col("eventType") == "addToCart").select("eventType", "customerId", "productId", "timestamp", "quantity")
df_product_view = events.filter(col("eventType") == "productView").select("eventType", "customerId", "productId", "timestamp", "metadata")

# Show results
df_recommendation_click.show()
df_purchase.show()
df_add_to_cart.show()
df_product_view.show()
df_product_view.printSchema()

+-------------------+----------+---------+-------------------+--------------------+--------------------+
|          eventType|customerId|productId|          timestamp|recommendedProductId|           algorithm|
+-------------------+----------+---------+-------------------+--------------------+--------------------+
|recommendationClick|     56982|     5270|2024-07-10 12:00:12|                2936|collaborative_fil...|
|recommendationClick|     11525|     8945|2024-07-10 12:00:20|                9052|       content_based|
|recommendationClick|     33684|     2755|2024-07-10 12:00:43|                4169|       content_based|
|recommendationClick|     21583|     1446|2024-07-10 12:00:46|                2920|collaborative_fil...|
|recommendationClick|     41597|     2014|2024-07-10 12:01:12|                6237|       content_based|
|recommendationClick|     61395|     2664|2024-07-10 12:01:13|                4250|       content_based|
|recommendationClick|     11690|     2566|2024-07-10 12

In [10]:
name="recommendation_click"
df_recommendation_click.repartition(1).write.mode('overwrite').parquet(f"hdfs:///data/streaming/{name}")

In [11]:
name="purchase"
df_purchase.repartition(1).write.mode('overwrite').parquet(f"hdfs:///data/streaming/{name}")

In [12]:

name="add_to_cart"
df_add_to_cart.repartition(1).write.mode('overwrite').parquet(f"hdfs:///data/streaming/{name}")

In [13]:
name="product_view"
df_product_view.repartition(1).write.mode('overwrite').parquet(f"hdfs:///data/streaming/{name}")

In [46]:
# Create a temporary view from the DataFrame
json_df.createOrReplaceTempView("kafka_data")

In [47]:
# Execute a SQL query on the temporary view
result_df = spark.sql("""
    SELECT 
        eventType,
        customerId,
        productId,
        timestamp,
        metadata.category,
        metadata.source,
        quantity,
        totalAmount,
        paymentMethod,
        recommendedProductId,
        algorithm
    FROM kafka_data
""")

In [14]:
df_purchase.createOrReplaceTempView("topfivecustomer")
query_two=spark.sql("""SELECT 
    customerId, 
    SUM(totalAmount) AS totalPurchaseAmount
    FROM 
        topfivecustomer
    GROUP BY 
        customerId
    ORDER BY 
        totalPurchaseAmount DESC
    LIMIT 5
""")
query_two.show()

+----------+-------------------+
|customerId|totalPurchaseAmount|
+----------+-------------------+
|     49168|  494.6099853515625|
|     89942|  494.3599853515625|
|     17292|   489.760009765625|
|     44273|  485.2799987792969|
|     43561|   481.739990234375|
+----------+-------------------+

