In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.avro.functions import from_avro, to_avro

# Build (or reuse) the notebook's Spark session. The spark-avro artifact is
# requested through spark.jars.packages so that the to_avro / from_avro
# functions imported above have their JVM implementation available.
spark = (
    SparkSession.builder
    .appName("Test")
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.5.0")
    .getOrCreate()
)

In [4]:
# One-row frame whose "value" column is a struct (age, name); to_avro
# serialises that struct column into a single binary column named "avro".
data = [(1, Row(age=2, name='Alice'))]
df = spark.createDataFrame(data, ("key", "value"))
avroDf = df.select(to_avro(df.value).alias("avro"))

# Only show() is needed here: a collect() whose result is neither assigned nor
# the cell's last expression just runs an extra Spark job and discards it.
avroDf.show()

+--------------------+
|                avro|
+--------------------+
|[00 00 04 00 0A 4...|
+--------------------+



In [3]:
# Avro schema (as JSON text) handed to from_avro to decode the "avro" column.
# It describes a top-level record with a single nullable field "avro" that
# holds a record of nullable age (long) and name (string).
jsonFormatSchema = '''{"type":"record","name":"topLevelRecord","fields":
    [{"name":"avro","type":[{"type":"record","name":"value","namespace":"topLevelRecord",
    "fields":[{"name":"age","type":["long","null"]},
    {"name":"name","type":["string","null"]}]},"null"]}]}'''

# Decode the binary column back into a struct and expose it as "value".
decoded_value = from_avro(avroDf.avro, jsonFormatSchema).alias("value")
avroDf.select(decoded_value).collect()


[Row(value=Row(avro=Row(age=2, name='Alice')))]

In [None]:
# Hand-written stand-in for a serialised Avro record. It was NOT produced by an
# Avro library, so treat the bytes as illustrative only.
# NOTE(review): the leading 0x0e / 0x10 bytes look like string length prefixes,
# but Avro zigzag-encodes lengths, so the 9-byte strings 'pmt_12345' and
# 'ord_67890' would each be prefixed with 0x12. Confirm the payload against the
# real writer schema before trying to decode it.
binary_emulated = b'\x0epmt_12345\x10ord_67890\x80\xa4\xa8\xb2\xb8\x01\x80\xcc\x88\xec\xb8\x01\x20\x8a\x0f'

# Each row must be a tuple: `(x)` is just `x`, so the trailing comma is required.
data = [(binary_emulated,)]
# Column names must be given as a sequence; a bare string would be parsed as a
# datatype string rather than as a column name.
df = spark.createDataFrame(data, ["data"])

# NOTE(review): to_avro wraps the binary column as an Avro value; it does not
# decode it. If the goal is to read the record, use from_avro with the writer
# schema instead - confirm intent.
avroDf = df.select(to_avro(df.data).alias("avro"))
avroDf.show()
