In [1]:
import os
import pathlib
import findspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, avg
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

prj_dir = pathlib.Path().resolve().parent.parent
spark_home = os.path.join(prj_dir / 'spark-3.5.0-bin-hadoop3')
findspark.init(spark_home)


os.environ['PYSPARK_SUBMIT_ARGS'] = f'--packages ' \
                                    f'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,' \
                                    f'org.apache.kafka:kafka-clients:2.8.1 ' \
                                    f'pyspark-shell'  


spark = SparkSession.builder.master("local").appName('Kafka Exercise').getOrCreate()

:: loading settings :: url = jar:file:/Users/audioworkstation/Documents/WORKSPACE/LEARNING/spark_streaming_using_x/spark-3.5.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/audioworkstation/.ivy2/cache
The jars for the packages stored in: /Users/audioworkstation/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-cf2612b4-3246-408d-921f-cfa0c9daf1b0;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.0 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.

In [2]:
KAFKA_BOOTSTRAP_SERVER = 'localhost:9092'
KAFKA_TOPIC = 'personnal-info'

stream_df = spark \
    .readStream \
    .format('kafka') \
    .option('kafka.bootstrap.servers', KAFKA_BOOTSTRAP_SERVER) \
    .option('subscribe', KAFKA_TOPIC) \
    .option('startingOffset', 'earliest') \
    .option('auto.offset.reset', 'earliest') \
    .option('includeHeaders', 'true') \
    .option('failOnDataLoss', 'false') \
    .load()


schema = StructType([
    StructField("name", StringType()),
    StructField("age", IntegerType()),
    StructField("city", StringType())
])


extracted_data = stream_df.select(
    from_json(
        col('value').cast('string'),
        schema=schema
    ).alias('data')
).select("data.*")
clean_data = extracted_data.filter(col('data').isNotNull())

calculated_age_avg = clean_data.groupBy('city').agg(avg('age')).alias('avg_age')


In [None]:
calculated_age_avg.writeStream.outputMode('complete').format('console').start().awaitTermination()

23/11/23 11:50:01 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/g6/vgc6wxj13x95m3zxhrn480540000gn/T/temporary-82c0edef-130f-4aa5-a0c6-55a07dbc56f3. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/11/23 11:50:01 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/11/23 11:50:01 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+--------+
|city|avg(age)|
+----+--------+
+----+--------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----+--------+
| city|avg(age)|
+-----+--------+
|rasht|    30.0|
+-----+--------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------+--------+
|  city|avg(age)|
+------+--------+
| rasht|    30.0|
|mashad|    35.0|
+------+--------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------+--------+
|  city|avg(age)|
+------+--------+
| rasht|    25.0|
|mashad|    35.0|
+------+--------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-------+--------+
|   city|avg(age)|
+-------+--------+
|  rasht|    25.0|
| mashad|    35.0|
|ardabil|    50.0|
+-------+--------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-------+--------+
|   city|avg(age)|
+-------+--------+
|  rasht|    25.0|
| mashad|    35.0|
|ardabil|    55.0|
+-------+--------+

