# PySpark consumer from Kafka setup

Mostly based on [this Medium article](https://medium.com/@mrugankray/real-time-avro-data-analysis-with-spark-streaming-and-confluent-kafka-in-python-426f5e05392d)

In [None]:
from confluent_kafka.schema_registry import SchemaRegistryClient
import pyspark.sql.functions as func
from pyspark.sql.avro.functions import from_avro
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [None]:
# Connector packages Spark must fetch at startup: the Kafka source for
# Structured Streaming and the Avro functions module (Scala 2.12, Spark 3.5.1).
spark_packages = ",".join(
    [
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1",
        "org.apache.spark:spark-avro_2.12:3.5.1",
    ]
)

session_builder = SparkSession.builder.appName("kafka_test")
session_builder = session_builder.config("spark.jars.packages", spark_packages)
spark = session_builder.getOrCreate()

# Kafka topic to consume; its value schema is looked up under "<topic>-value".
topic = "postgres.public.resources_01_2014"
subject = f"{topic}-value"
kafka_bootstrap_servers = "localhost:9091,localhost:9092,localhost:9093"

# NOTE: the schema registry was not reachable through localhost from outside
# the container. If that is the case for you too, run `docker network inspect`
# on the default network and put the schema-registry-1 IPv4 address here.
schema_registry_url = "http://172.19.0.13:8096"

# Fetch the most recently registered schema version for the subject.
registry_conf = {"url": schema_registry_url}
schema_registry_client = SchemaRegistryClient(registry_conf)
avro_schema = schema_registry_client.get_latest_version(subject)

In [None]:
# Streaming source: subscribe to the topic and replay it from the earliest
# available offset.
kafka_source_options = {
    "kafka.bootstrap.servers": kafka_bootstrap_servers,
    "subscribe": topic,
    "startingOffsets": "earliest",
}
df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [None]:
# Split the raw Kafka `value` bytes into the pieces of the Confluent wire
# format: byte 1 is the magic byte, bytes 2-5 hold the schema id, and the
# rest is the Avro-encoded payload. Spark SQL `substring` is 1-based.
wire_format_fields = {
    "magicByte": "substring(value, 1, 1)",
    "valueSchemaId": "substring(value, 2, 4)",
    "fixedValue": "substring(value, 6, len(value)-5)",
}
for field_name, sql_expr in wire_format_fields.items():
    df = df.withColumn(field_name, func.expr(sql_expr))

# Keep only the derived columns, in the order they were defined above.
v_df = df.select(*wire_format_fields)

In [None]:
# Decode the Avro payload with the schema fetched from the registry.
# NOTE(review): per the Spark Avro guide, PERMISSIVE mode yields null for
# records that cannot be decoded instead of failing the query — confirm this
# is the desired behavior.
avro_opts = {"mode": "PERMISSIVE"}
schema_json = avro_schema.schema.schema_str
decoded_output = v_df.select(
    from_avro(func.col("fixedValue"), schema_json, avro_opts).alias("resources")
)

# Promote the fields of the decoded struct to top-level columns.
v_df = decoded_output.select("resources.*")

In [None]:
# Print the column tree of the decoded DataFrame to check the Avro decoding result.
v_df.printSchema()

In [None]:
# Flatten the `after` struct into top-level columns.
# NOTE(review): `after` is presumably the post-change row image of a CDC
# envelope — confirm against the printed schema.
df = v_df.select("after.*")

# Stream every new row to the console sink in append mode.
query = (
    df.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

# Block this cell until the streaming query stops or fails.
query.awaitTermination()

24/08/05 10:13:04 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 6 (localhost/127.0.0.1:9093) could not be established. Broker may not be available.
24/08/05 10:13:04 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 5 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
24/08/05 10:13:05 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 4 (localhost/127.0.0.1:9091) could not be established. Broker may not be available.
24/08/05 10:13:05 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 5 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
24/08/05 10:13:05 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 6 (localhost/127.0.0.1:9093) could not be established. Broker may not be available.
24/08/05 10:13:06 WARN NetworkClient: [AdminClient clientId=adminclient-1] Connection to node 5 (localhost/127