In [1]:
from delta import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Build the Spark session for this notebook.
# The session targets the standalone master at spark-master:7077, gives each
# executor 512m of memory, and registers the Delta Lake SQL extension and
# catalog implementation.
builder = SparkSession.builder.appName("transform-filter-streaming")
builder = builder.master("spark://spark-master:7077")
builder = builder.config("spark.executor.memory", "512m")
builder = builder.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
builder = builder.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Maven coordinate of the Kafka source for Structured Streaming; it is passed
# as an extra package next to the Delta artifacts added by
# configure_spark_with_delta_pip.
kafka_connector = 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1'
spark = configure_spark_with_delta_pip(builder, extra_packages=[kafka_connector]).getOrCreate()

# Only log messages at ERROR level from here on.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-007903a3-b9b0-49f1-ae26-e9223331af98;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central

In [3]:
# Open a streaming read on the Kafka topic "users" via the broker at
# kafka:9092, starting from the earliest available offsets.
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "users",
    "startingOffsets": "earliest",
}
df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Schema used to parse the JSON payload: five nullable fields, built from
# (field name, Spark type) pairs in the order they should appear.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ('id', IntegerType),
        ('name', StringType),
        ('age', IntegerType),
        ('gender', StringType),
        ('country', StringType),
    )
])

# Cast the `value` column to STRING, parse it against `schema`, and replace
# the original `value` column with the parsed struct.
payload_text = col('value').cast("STRING")
df = df.withColumn('value', from_json(payload_text, schema))

In [5]:
from pyspark.sql.functions import col

# Flatten the parsed `value` struct: each nested field becomes a top-level
# column carrying the same name, in the order listed here.
user_fields = ('id', 'name', 'age', 'gender', 'country')
df = df.select(*[col(f'value.{field}').alias(field) for field in user_fields])

In [6]:
from pyspark.sql.functions import avg

# Keep only the columns the aggregation needs, then drop rows with age below 21.
adult_users = df.select('age', 'country', 'gender').filter("age >= 21")

# Average age for every (country, gender) pair, exposed as `avg_age`.
grouped_users = adult_users.groupBy('country', 'gender')
df = grouped_users.agg(avg('age').alias('avg_age'))

In [7]:
# Start the streaming query: write the aggregated result to the console sink
# in 'complete' output mode. `query` is the handle used later to stop it.
query = df.writeStream.start(format='console', outputMode='complete')

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|             43.75|
|   Brazil|     M| 44.10112359550562|
|      USA|     F| 43.29281767955801|
|Australia|     F| 44.05142857142857|
|   Canada|     M|43.795180722891565|
|       UK|     M|42.011428571428574|
|      USA|     M| 42.93069306930693|
|    India|     M| 43.69886363636363|
|    China|     M| 42.49230769230769|
|    China|     F| 44.51829268292683|
|   Canada|     F|43.016216216216215|
|Australia|     M| 42.25837320574163|
|    India|     F| 42.29943502824859|
|       UK|     F|  43.3072625698324|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|             43.75|
|   Brazil|     M| 44.10112359550562|
|      USA|     F| 43.29281767955801|
|Australia|     F| 44.05142857142857|
|   Canada|     M|43.795180722891565|
|       UK|     M|42.011428571428574|
|      USA|     M| 42.93069306930693|
|    India|     M| 43.69886363636363|
|    China|     M|42.505102040816325|
|    China|     F| 44.51829268292683|
|   Canada|     F|43.016216216216215|
|Australia|     M| 42.25837320574163|
|    India|     F| 42.29943502824859|
|       UK|     F|43.333333333333336|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|             43.75|
|   Brazil|     M| 44.10112359550562|
|      USA|     F| 43.29281767955801|
|Australia|     F| 44.05142857142857|
|   Canada|     M|43.795180722891565|
|       UK|     M|42.011428571428574|
|      USA|     M| 42.93069306930693|
|    India|     M| 43.69886363636363|
|    China|     M|42.505102040816325|
|    China|     F| 44.51829268292683|
|   Canada|     F|43.016216216216215|
|Australia|     M|42.319047619047616|
|    India|     F| 42.29943502824859|
|       UK|     F|43.333333333333336|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|             43.75|
|   Brazil|     M| 44.10112359550562|
|      USA|     F| 43.29281767955801|
|Australia|     F| 44.05142857142857|
|   Canada|     M|43.795180722891565|
|       UK|     M|42.011428571428574|
|      USA|     M| 42.93069306930693|
|    India|     M| 43.69886363636363|
|    China|     M| 42.61421319796954|
|    China|     F| 44.51829268292683|
|   Canada|     F|43.016216216216215|
|Australia|     M|42.319047619047616|
|    India|     F| 42.29943502824859|
|       UK|     F|43.333333333333336|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|             43.75|
|   Brazil|     M| 44.10112359550562|
|      USA|     F| 43.29281767955801|
|Australia|     F| 43.96590909090909|
|   Canada|     M|43.795180722891565|
|       UK|     M|42.011428571428574|
|      USA|     M| 42.93069306930693|
|    India|     M| 43.69886363636363|
|    China|     M| 42.61421319796954|
|    China|     F|44.624242424242425|
|   Canada|     F|43.016216216216215|
|Australia|     M|42.319047619047616|
|    India|     F| 42.29943502824859|
|       UK|     F|43.333333333333336|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|             43.75|
|   Brazil|     M| 44.10112359550562|
|      USA|     F| 43.29281767955801|
|Australia|     F| 43.96590909090909|
|   Canada|     M|43.795180722891565|
|       UK|     M|42.011428571428574|
|      USA|     M| 42.93069306930693|
|    India|     M| 43.69886363636363|
|    China|     M| 42.61421319796954|
|    China|     F|44.624242424242425|
|   Canada|     F| 42.90860215053763|
|Australia|     M|42.319047619047616|
|    India|     F|42.353932584269664|
|       UK|     F|43.333333333333336|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|             43.75|
|   Brazil|     M| 44.10112359550562|
|      USA|     F| 43.29281767955801|
|Australia|     F| 43.89265536723164|
|   Canada|     M|43.795180722891565|
|       UK|     M|42.011428571428574|
|      USA|     M| 42.93069306930693|
|    India|     M| 43.69886363636363|
|    China|     M| 42.61421319796954|
|    China|     F|44.624242424242425|
|   Canada|     F| 42.90860215053763|
|Australia|     M|42.319047619047616|
|    India|     F|42.353932584269664|
|       UK|     F|43.333333333333336|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|             43.75|
|   Brazil|     M| 44.10112359550562|
|      USA|     F| 43.29281767955801|
|Australia|     F| 43.89265536723164|
|   Canada|     M|43.795180722891565|
|       UK|     M|42.011428571428574|
|      USA|     M| 42.98029556650246|
|    India|     M| 43.69886363636363|
|    China|     M| 42.61421319796954|
|    China|     F|44.624242424242425|
|   Canada|     F| 42.90860215053763|
|Australia|     M|42.319047619047616|
|    India|     F|42.353932584269664|
|       UK|     F|43.333333333333336|
+---------+------+------------------+





In [None]:
# Stop the streaming query that was started by the `writeStream ... start()` cell.
query.stop()

In [None]:
# Shut the session down cleanly.
# Streaming queries keep running in the background until they are stopped, so
# stop every query still active on this session first (this also covers the
# case where the `query.stop()` cell was skipped), then stop the session.
for active_query in spark.streams.active:
    active_query.stop()
spark.stop()

                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|             43.75|
|   Brazil|     M| 44.10112359550562|
|      USA|     F| 43.29281767955801|
|Australia|     F| 43.89265536723164|
|   Canada|     M|43.795180722891565|
|       UK|     M|42.011428571428574|
|      USA|     M| 42.98029556650246|
|    India|     M| 43.69886363636363|
|    China|     M| 42.61421319796954|
|    China|     F| 44.62048192771084|
|   Canada|     F| 42.90860215053763|
|Australia|     M|42.319047619047616|
|    India|     F|42.353932584269664|
|       UK|     F|43.324175824175825|
+---------+------+------------------+



                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+---------+------+------------------+
|  country|gender|           avg_age|
+---------+------+------------------+
|   Brazil|     F|             43.75|
|   Brazil|     M| 44.10112359550562|
|      USA|     F| 43.29281767955801|
|Australia|     F| 43.78651685393258|
|   Canada|     M|43.795180722891565|
|       UK|     M|42.011428571428574|
|      USA|     M| 42.98029556650246|
|    India|     M| 43.69886363636363|
|    China|     M| 42.61421319796954|
|    China|     F| 44.62048192771084|
|   Canada|     F| 42.90860215053763|
|Australia|     M|42.319047619047616|
|    India|     F|42.353932584269664|
|       UK|     F|43.324175824175825|
+---------+------+------------------+

