In [None]:
import os
import sys
import pathlib
import findspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

# Locate the Spark distribution: it is expected two directory levels above
# the current working directory.
prj_dir = pathlib.Path().resolve().parent.parent
spark_home = str(prj_dir / 'spark-3.5.0-bin-hadoop3')
findspark.init(spark_home)

# Only the Kafka connector is requested explicitly. It brings in the
# kafka-clients version it was built against (3.4.1 for connector 3.5.0) as a
# transitive dependency, so pinning a different kafka-clients version here
# would only create a version conflict for the resolver to settle.
SUBMIT_ARGS = (
    '--packages '
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0 '
    'pyspark-shell'
)

# Set before the SparkSession is built so the extra packages are picked up
# when PySpark launches the JVM.
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS




# Create the session, or reuse one that is already running, with the
# "local" master and the application name "KafkaIntegration".
spark = (
    SparkSession.builder
    .master("local")
    .appName("KafkaIntegration")
    .getOrCreate()
)


# Kafka connection settings.
kafka_bootstrap_server = "localhost:9092"
kafka_topic = "video-stream-event"

# Streaming DataFrame backed by the Kafka topic.
kafka_stream_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_server)
    .option("subscribe", kafka_topic)
    # The source option is spelled "startingOffsets" (plural). A misspelled
    # name is silently ignored and the stream then starts from the default
    # "latest" position. Kafka's own "auto.offset.reset" is not honoured by
    # the Spark source, so "startingOffsets" is the only setting needed to
    # read the topic from the beginning.
    .option("startingOffsets", "earliest")
    .option("includeHeaders", "true")
    .option("failOnDataLoss", "false")
    .load()
)


# Keep only the message payload, cast to a string; from here on `lines`
# holds the single "value" column of the received stream.
lines = kafka_stream_df.selectExpr("CAST(value AS STRING)")

# Tokenise each message:
#   * split(..., " ") cuts the payload at every space character, producing
#     an array of words;
#   * explode(...) emits one output row per array element;
#   * alias("word") names the resulting column "word".
words = lines.select(
    explode(split(lines["value"], " ")).alias("word")
)

# Word count: number of occurrences of each distinct word.
word_counts = words.groupBy(words["word"]).count()

# Start the query.
#
# Output modes relevant to this aggregation:
#   1. complete: every trigger prints the full aggregated result
#      (in this example, the counts of all words seen so far).
#   2. update: every trigger prints only the rows that changed
#      (in this example, words that are new or whose count was updated).
query = (
    word_counts.writeStream
    .outputMode("update")
    .format("console")
    .start()
)

# awaitTermination() called without a timeout returns None, so it is invoked
# as a separate statement; chaining it onto start() would leave `query` bound
# to None instead of the StreamingQuery handle.
query.awaitTermination()

:: loading settings :: url = jar:file:/Users/audioworkstation/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/spark-3.5.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/audioworkstation/.ivy2/cache
The jars for the packages stored in: /Users/audioworkstation/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-961f0db3-5347-4533-90b9-7111ee09e476;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+
|word|count|
+----+-----+
+----+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---------+-----+
|     word|count|
+---------+-----+
| bisschen|    1|
|schneller|    1|
|      sie|    1|
|    bitte|    1|
|arbeiten?|    1|
|      ein|    1|
|   Konnen|    1|
+---------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+----------+-----+
|      word|count|
+----------+-----+
|verspatung|    1|
|       ein|    2|
|        Es|    1|
|      gibt|    1|
+----------+-----+

