Before we start, we need to make sure that we have a Kafka cluster running and a topic that receives some streaming data. For simplicity, we will use a single-node Kafka cluster and a topic named `users`. Open the `4.0 user-gen-kafka.ipynb` notebook and execute the cell. That notebook produces a user record every few seconds and puts it on the Kafka topic called `users`.

In [1]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [2]:
# Session builder: application name, the spark://spark-master:7077 master,
# executor memory, and the two settings that register Delta Lake's SQL
# extension and catalog implementation.
builder = (
    SparkSession.builder
    .appName("config-checkpoints")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .config(
        "spark.sql.extensions",
        "io.delta.sql.DeltaSparkSessionExtension",
    )
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# The Spark Kafka connector is requested as an extra package next to the
# Delta dependency that configure_spark_with_delta_pip adds itself.
spark = configure_spark_with_delta_pip(
    builder,
    extra_packages=['org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1'],
).getOrCreate()

# Only log messages at ERROR level from here on.
spark.sparkContext.setLogLevel("ERROR")

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-88cc301f-1b47-43a6-b9a3-2206b14a3a2a;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.4.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.4.1 in central
	found org.apache.kafka#kafka-clients;3.3.2 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.1 in central
	found org.slf4j#slf4j-api;2.0.6 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in centra

In [3]:
# Load the sparksql_magic extension, then set its SparkSql.limit option to 20.
_ipython_shell = get_ipython()
_ipython_shell.run_line_magic('load_ext', 'sparksql_magic')
_ipython_shell.run_line_magic('config', 'SparkSql.limit=20')

In [5]:
# Streaming DataFrame over the `users` Kafka topic, read through the
# kafka:9092 bootstrap server with startingOffsets set to "earliest".
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka:9092",
        "subscribe": "users",
        "startingOffsets": "earliest",
    })
    .load()
)

In [6]:
# Schema of the JSON user record; every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('id', IntegerType()),
        ('name', StringType()),
        ('age', IntegerType()),
        ('gender', StringType()),
        ('country', StringType()),
    )
])

# Replace the `value` column with the struct obtained by casting it to a
# string and parsing that string as JSON against the schema.
# NOTE: this reassigns `df` from itself, so the cell is meant to be run once
# per fresh `df`.
df = df.withColumn(
    'value',
    from_json(col('value').cast("STRING"), schema),
)

In [7]:
# Flatten the parsed `value` struct: each nested field becomes a top-level
# column carrying the field's own name.
df = df.select(*[
    col(f'value.{field_name}').alias(field_name)
    for field_name in ('id', 'name', 'age', 'gender', 'country')
])

In [8]:
# Start a streaming query that writes to the console sink in append mode,
# with its checkpoint stored under /opt/workspace/data/checkpoint.
query = df.writeStream.start(
    format="console",
    outputMode="append",
    checkpointLocation="/opt/workspace/data/checkpoint",
)

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---+------+---+------+---------+
| id|  name|age|gender|  country|
+---+------+---+------+---------+
| 23|user13| 25|     F|    China|
| 64|user35| 60|     F|      USA|
| 20|user81| 41|     F|   Canada|
| 55|user18| 35|     M|    India|
| 63|user23| 41|     F|   Brazil|
| 78|user74| 59|     F|      USA|
| 84|user34| 18|     F|       UK|
| 56|user64| 32|     M|    China|
| 85|user13| 61|     M|      USA|
| 44|user86| 44|     F|    India|
|  5|user48| 32|     M|Australia|
| 50|user73| 60|     F|Australia|
|  7|user93| 38|     F|    China|
| 20|user45| 45|     M|   Canada|
| 57|user22| 43|     F|    China|
| 71|user13| 27|     F|    China|
| 92|user39| 20|     F|   Brazil|
| 53|user21| 64|     F|    India|
| 47| user2| 21|     M|       UK|
| 88|user86| 44|     M|    India|
+---+------+---+------+---------+
only showing top 20 rows



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+---+-----+---+------+-------+
| id| name|age|gender|country|
+---+-----+---+------+-------+
| 60|user8| 48|     F|    USA|
+---+-----+---+------+-------+

-------------------------------------------
Batch: 2
-------------------------------------------
+---+------+---+------+-------+
| id|  name|age|gender|country|
+---+------+---+------+-------+
| 12|user41| 29|     F| Canada|
+---+------+---+------+-------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+---+------+---+------+-------+
| id|  name|age|gender|country|
+---+------+---+------+-------+
| 12|user78| 28|     F|  China|
+---+------+---+------+-------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+---+------+---+------+---------+
| id|  name|age|gender|  country|
+---+------+---+------+---------+
| 63|user39| 57|     M|Australia|
+---+------+---+------+---------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+---+------+---+------+-------+
| id|  name|age|gender|country|
+---+------+---+------+-------+
| 32|user23| 54|     M|  China|
+---+------+---+------+-------+

-------------------------------------------
Batch: 6
-------------------------------------------
+---+------+---+------+-------+
| id|  name|age|gender|country|
+---+------+---+------+-------+
| 32|user53| 19|     M|    USA|
+---+------+---+------+-------+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+---+------+---+------+-------+
| id|  name|age|gender|country|
+---+------+---+------+-------+
| 58|user52| 48|     F|  India|
+---+------+---+------+-------+

-------------------------------------------
Batch: 8
-------------------------------------------
+---+------+---+------+---------+
| id|  name|age|gender|  country|
+---+------+---+------+---------+
| 65|user65| 44|     F|Australia|
+---+------+---+------+---------+

-------------------------------------------
Batch: 9
-------------------------------------------
+---+------+---+------+-------+
| id|  name|age|gender|country|
+---+------+---+------+-------+
| 60|user20| 33|     F|    USA|
+---+------+---+------+-------+



In [9]:
# Stop the running streaming query. The next cell starts a new query that
# uses the same checkpointLocation.
query.stop()

In [10]:
# Start the console query again with the same checkpointLocation as before.
# In the recorded output below, batch numbering continues at Batch 10
# instead of starting over at Batch 0.
query = df.writeStream.start(
    format="console",
    outputMode="append",
    checkpointLocation="/opt/workspace/data/checkpoint",
)

                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+---+------+---+------+-------+
| id|  name|age|gender|country|
+---+------+---+------+-------+
| 71|user61| 26|     F| Canada|
+---+------+---+------+-------+



                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+---+------+---+------+-------+
| id|  name|age|gender|country|
+---+------+---+------+-------+
| 69|user74| 28|     F|  India|
+---+------+---+------+-------+



                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+---+------+---+------+-------+
| id|  name|age|gender|country|
+---+------+---+------+-------+
| 85|user29| 56|     F| Brazil|
+---+------+---+------+-------+



                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+---+------+---+------+-------+
| id|  name|age|gender|country|
+---+------+---+------+-------+
| 90|user18| 29|     F|     UK|
+---+------+---+------+-------+



In [11]:
# Stop the restarted streaming query.
# NOTE(review): the two ERROR lines recorded below report the console write
# for epoch 14 being aborted; they appear to be a consequence of stopping
# while that micro-batch was in flight — confirm before treating as a failure.
query.stop()

24/02/04 18:00:17 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 14, writer: ConsoleWriter[numRows=20, truncate=true]] is aborting.
24/02/04 18:00:17 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 14, writer: ConsoleWriter[numRows=20, truncate=true]] aborted.


In [12]:
# Shut down the SparkSession created at the top of the notebook.
spark.stop() 