In [None]:
import os
import sys
import pathlib
import findspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

# Resolve the Spark distribution directory, expected two levels above the
# current working directory, and let findspark initialise from it.
prj_dir = pathlib.Path().resolve().parent.parent
spark_home = os.fspath(prj_dir / 'spark-3.5.0-bin-hadoop3')
findspark.init(spark_home)

# Maven coordinates passed via --packages so the Kafka source is available.
_kafka_packages = (
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0',
    'org.apache.kafka:kafka-clients:2.8.1',
)
SUBMIT_ARGS = ' '.join(('--packages', ','.join(_kafka_packages), 'pyspark-shell'))

# Must be exported before the SparkSession is created further down.
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS




# Obtain (or reuse) a local-mode Spark session named "KafkaIntegration".
_session_builder = SparkSession.builder.master("local").appName("KafkaIntegration")
spark = _session_builder.getOrCreate()


# Connection settings for the Kafka source.
kafka_bootstrap_server = "localhost:9092"
kafka_topic = "video-stream-event"


# Streaming DataFrame subscribed to `kafka_topic` on `kafka_bootstrap_server`.
kafka_stream_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_server)
    .option("subscribe", kafka_topic)
    # The source option is spelled "startingOffsets" (plural). The previous
    # "startingOffset" was silently ignored, so a fresh query began at the
    # default "latest" position instead of replaying the topic.
    # "auto.offset.reset" was dropped: it is not a Spark source option and
    # Spark manages offsets itself; "startingOffsets" is its replacement.
    .option("startingOffsets", "earliest")
    .option("includeHeaders", "true")
    # Keep the query running when offsets are no longer available
    # instead of failing it.
    .option("failOnDataLoss", "false")
    .load()
)

# Decode each record's `value` into a single string column named "value".
lines = kafka_stream_df.selectExpr("CAST(value AS STRING)")

# One output row per space-separated token of every message.
_tokens = split(lines["value"], " ")
words = lines.select(explode(_tokens).alias("word"))

# Running number of occurrences of each token.
word_counts = words.groupBy("word").count()

# Print the full counts table to the console sink ("complete" output mode).
_console_writer = word_counts.writeStream.format("console").outputMode("complete")
query = _console_writer.start()

# Block until the streaming query stops; pass a timeout to awaitTermination
# to return control earlier.
query.awaitTermination()

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+
|word|count|
+----+-----+
+----+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----+-----+
| word|count|
+-----+-----+
|salam|    2|
|  bah|    1|
+-----+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-----+-----+
| word|count|
+-----+-----+
|salam|    2|
|  bah|    3|
+-----+-----+

