Before we start, we need to make sure that we have a Kafka cluster running and a topic that receives some streaming data. For simplicity, we will use a single-node Kafka cluster and a topic named `users`. Open the `5.0 user-gen-kafka.ipynb` notebook and execute its cell. That notebook produces a user record every few seconds and puts it on a Kafka topic called `users`.

In [17]:
from delta import configure_spark_with_delta_pip, DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [18]:
# Session-level settings: executor memory plus the two entries that register
# the Delta SQL extension and the Delta catalog implementation.
spark_settings = {
    "spark.executor.memory": "512m",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Maven coordinate of the Kafka source connector, passed as an extra package.
kafka_connector = 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1'

builder = SparkSession.builder.appName("idempotent-stream-write-delta").master("spark://spark-master:7077")
for setting_name, setting_value in spark_settings.items():
    builder = builder.config(setting_name, setting_value)

# Create (or reuse) the session, then restrict logging to errors only.
spark = configure_spark_with_delta_pip(builder, [kafka_connector]).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [19]:
# Load the sparksql_magic IPython extension, which the %%sparksql cells in
# this notebook rely on.
get_ipython().run_line_magic('load_ext', 'sparksql_magic')
# Set the extension's SparkSql.limit option to 20
# (presumably the maximum number of rows displayed per query - confirm).
get_ipython().run_line_magic('config', 'SparkSql.limit=20')

The sparksql_magic extension is already loaded. To reload it, use:
  %reload_ext sparksql_magic


In [20]:
%%sparksql
-- Create (or replace) the Delta table default.users at an explicit location.
-- The columns match the schema used to parse the Kafka messages in this notebook.
CREATE OR REPLACE TABLE default.users (
    id INT,
    name STRING,
    age INT,
    gender STRING,
    country STRING 
) USING DELTA LOCATION '/opt/workspace/data/delta_lake/idempotent-stream-write-delta/users';

                                                                                

In [21]:
# Kafka source settings: broker address, topic to subscribe to, and the
# offset position to start reading from.
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "users",
    "startingOffsets": "earliest",
}

# Streaming DataFrame over the `users` topic.
df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [22]:
# Expected layout of the JSON payload: (field name, Spark type); all nullable.
user_fields = [
    ('id', IntegerType()),
    ('name', StringType()),
    ('age', IntegerType()),
    ('gender', StringType()),
    ('country', StringType()),
]
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in user_fields
])

# Replace the Kafka `value` column with the struct parsed from its JSON text.
# NOTE: this overwrites `df`, so run it once, right after the read cell.
raw_json = col('value').cast("STRING")
df = df.withColumn('value', from_json(raw_json, schema))

In [23]:
# Promote each field of the parsed `value` struct to a top-level column
# carrying the same name.
df = df.select(*[
    col(f'value.{field_name}').alias(field_name)
    for field_name in ('id', 'name', 'age', 'gender', 'country')
])

In [24]:
# Target Delta table and the checkpoint directory kept underneath it.
users_table_path = "/opt/workspace/data/delta_lake/idempotent-stream-write-delta/users"
users_checkpoint_path = "/opt/workspace/data/delta_lake/idempotent-stream-write-delta/users/_checkpoints/"

# Continuously append the parsed user records to the users Delta table.
users_stream_writer = df.writeStream.format("delta").outputMode("append")
users_stream_writer = users_stream_writer.option("checkpointLocation", users_checkpoint_path)
query = users_stream_writer.start(users_table_path)

In [25]:
# Define a function writing each micro-batch to two destinations.
# `app_id` identifies this writer to Delta Lake; together with the batch id it
# lets Delta recognise a write it has already committed.
app_id = 'idempotent-stream-write-delta'

# (row filter, target Delta table path) for every regional destination.
_REGION_DESTINATIONS = (
    ("country IN ('India','China')",
     "/opt/workspace/data/delta_lake/idempotent-stream-write-delta/user_asia"),
    ("country IN ('USA','Canada','Brazil')",
     "/opt/workspace/data/delta_lake/idempotent-stream-write-delta/user_americas"),
)


def writeToDeltaLakeTableIdempotent(batch_df, batch_id):
    """Append one micro-batch to the regional Delta tables idempotently.

    Each destination receives the rows matching its country filter. Every
    write is tagged with ``txnAppId`` (the module-level ``app_id``) and
    ``txnVersion`` (the micro-batch id) so that a batch replayed after a
    failure is not appended a second time to a table that already has it.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch, supplied by ``foreachBatch``.
    """
    # The batch is consumed once per destination; cache it so the second
    # write does not recompute it (and potentially re-read the source).
    batch_df.persist()
    try:
        for region_filter, table_path in _REGION_DESTINATIONS:
            (batch_df.filter(region_filter)
             .write
             .format("delta")
             .mode("append")
             .option("txnVersion", batch_id)
             .option("txnAppId", app_id)
             .save(table_path))
    finally:
        # Always release the cached batch, even if one of the writes failed.
        batch_df.unpersist()

[Stage 1:>                                                          (0 + 2) / 2]

In [26]:
# Apply the function against the micro-batches using `foreachBatch`.
#
# A durable checkpoint location is required for the idempotent writes to be
# meaningful: the batch ids passed to the function (used as `txnVersion`) are
# tracked in the checkpoint. Without one, a restarted query begins again at
# batch 0 under the same `txnAppId`, and Delta skips those writes as
# already-applied. If this checkpoint is ever deleted, also change `app_id`
# (or clear the target tables) before restarting.
#
# No `.format(...)` is set here because `foreachBatch` replaces the sink; the
# output format is chosen inside the batch function.
write_query = (df
 .writeStream
 .queryName("Users By Region")
 .option("checkpointLocation", "/opt/workspace/data/delta_lake/idempotent-stream-write-delta/_checkpoints/users_by_region")
 .foreachBatch(writeToDeltaLakeTableIdempotent)
 .start())

[Stage 2:>                                                         (0 + 2) / 50]

In [27]:
%%sparksql
-- Count the rows currently in the user_asia Delta table.
SELECT COUNT(*) FROM delta.`/opt/workspace/data/delta_lake/idempotent-stream-write-delta/user_asia`;

                                                                                

0
count(1)
44


In [28]:
%%sparksql
-- Count the rows currently in the user_americas Delta table.
SELECT COUNT(*) FROM delta.`/opt/workspace/data/delta_lake/idempotent-stream-write-delta/user_americas`;

                                                                                

0
count(1)
65


In [37]:
%%sparksql
-- Same count on user_asia, run again later while the stream is still writing.
SELECT COUNT(*) FROM delta.`/opt/workspace/data/delta_lake/idempotent-stream-write-delta/user_asia`;

                                                                                

0
count(1)
45


[Stage 147:====>                                                   (4 + 2) / 50]

In [38]:
%%sparksql
-- Same count on user_americas, run again later while the stream is still writing.
SELECT COUNT(*) FROM delta.`/opt/workspace/data/delta_lake/idempotent-stream-write-delta/user_americas`;

                                                                                

0
count(1)
68


In [39]:
# Shut down both streaming queries, in the order they were started.
for streaming_query in (query, write_query):
    streaming_query.stop()

In [40]:
# Stop the Spark session now that the streaming queries have been stopped.
spark.stop() 