In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession via the builder (reuses an existing session if there is one).
spark = SparkSession.builder.getOrCreate()

In [3]:
import pyspark.sql.functions as F

# Show the stored records, newest first.
#
# Column layout of each record (Kafka source schema):
#   key           - partitioning key
#   value         - the data in binary format; this is our JSON payload and must be cast to STRING
#   topic         - the topic we are subscribing to
#   partition     - the Kafka partition the record belongs to
#   offset        - position of the record within its topic partition
#   timestamp     - the record timestamp
#   timestampType - whether the timestamp is created time or log append time (created time by default)
(
    spark.read
    .parquet("output/AwsomeAPIRecords.parquet")
    .orderBy(F.desc("timestamp"))
    .show()
)

+----+--------------------+---------------+---------+------+--------------------+-------------+
| key|               value|          topic|partition|offset|           timestamp|timestampType|
+----+--------------------+---------------+---------+------+--------------------+-------------+
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     8|2022-08-04 12:51:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     7|2022-08-04 12:51:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     6|2022-08-04 12:49:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     5|2022-08-04 12:49:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     4|2022-08-04 12:49:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     3|2022-08-04 12:48:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     2|2022-08-04 12:47:...|            0|
|null|[5B 7B 27 56 61 6...|AwsomeTopicNa

In [7]:
# Register the stored records as a temp view so the following cells can query them with SQL.
# The dataset directory is read directly (the same path the earlier read cell uses) rather than
# globbing "*.parquet" part files, so both cells load the data the same way and Spark performs
# its own file/partition discovery.
# NOTE(review): if the directory was written by a streaming sink, a glob would also bypass the
# sink's metadata log - confirm how the data is produced.
(
    spark.read
    .parquet("output/AwsomeAPIRecords.parquet")
    .createOrReplaceTempView("vw_AwsomeAPIRecords")
)

In [8]:
# List every record in the temp view, highest offset first.
(
    spark
    .sql("SELECT * FROM vw_AwsomeAPIRecords ORDER BY offset desc")
    .show()
)

+----+--------------------+---------------+---------+------+--------------------+-------------+
| key|               value|          topic|partition|offset|           timestamp|timestampType|
+----+--------------------+---------------+---------+------+--------------------+-------------+
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     8|2022-08-04 12:51:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     7|2022-08-04 12:51:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     6|2022-08-04 12:49:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     5|2022-08-04 12:49:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     4|2022-08-04 12:49:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     3|2022-08-04 12:48:...|            0|
|null|[7B 22 52 65 63 6...|AwsomeTopicName|        2|     2|2022-08-04 12:47:...|            0|
|null|[5B 7B 27 56 61 6...|AwsomeTopicNa

In [9]:
# Decode the binary key/value columns to strings and show them next to the record timestamp.
# Up to 20 rows are printed, with each cell truncated to 50 characters.
(
    spark
    .sql("SELECT CAST(key AS STRING) key, CAST(value AS STRING) value, timestamp AS ts FROM vw_AwsomeAPIRecords")
    .show(n=20, truncate=50)
)

+----+--------------------------------------------------+-----------------------+
| key|                                             value|                     ts|
+----+--------------------------------------------------+-----------------------+
|null|{"RecordedAtTime": "2022-08-04T14:31:40.864+02:...|2022-08-04 12:45:51.122|
|null|[{'ValidUntilTime': '2022-08-04T14:40:00+02:00'...|2022-08-04 12:45:56.467|
|null|{"RecordedAtTime": "2022-08-04T14:31:40.864+02:...|2022-08-04 12:47:14.896|
|null|{"RecordedAtTime": "2022-08-04T14:31:40.864+02:...|2022-08-04 12:48:37.007|
|null|{"RecordedAtTime": "2022-08-04T14:31:40.864+02:...|2022-08-04 12:49:03.472|
|null|{"RecordedAtTime": "2022-08-04T14:31:40.864+02:...|2022-08-04 12:49:30.167|
|null|{"RecordedAtTime": "2022-08-04T14:47:50.474+02:...|2022-08-04 12:49:30.245|
|null|{"RecordedAtTime": "2022-08-04T14:50:11.919+02:...|2022-08-04 12:51:15.274|
|null|{"RecordedAtTime": "2022-08-04T14:41:04.665+02:...|2022-08-04 12:51:15.318|
+----+----------

In [None]:
# DDL-style STRUCT schema for the JSON payload. It is interpolated verbatim into the
# FROM_JSON(...) call of the SQL query in the next cell, so the text must remain valid
# Spark SQL type syntax and must not contain single quotes.
# NOTE(review): the fields declared here (gender, name, location, login, ...) do not appear
# in the payload samples printed earlier in this notebook, which begin with "RecordedAtTime"
# and 'ValidUntilTime'. Presumably this schema was written for a different payload - confirm
# it matches the topic's JSON, otherwise the parsed columns will likely come back null.
json_schema = """
STRUCT<gender: STRING,
name: STRUCT<title: STRING,
            first: STRING,
            last: STRING>,
location: STRUCT<street: STRUCT<number: INT,
                                name: STRING>,
                 city: STRING,
                state: STRING,
                country: STRING,
                postcode: INT,
                coordinates: STRUCT<latitude: STRING,
                                    longitude: STRING>,
                timezone: STRUCT<offset: STRING,
                                description: STRING>
                >,
email: STRING,
login: STRUCT< uuid: STRING,
            username: STRING,
            password: STRING,
            salt: STRING,
            md5: STRING,
            sha1: STRING,
            sha256: STRING>,
dob: STRUCT<date: STRING,
            age: INT>,
registered: STRUCT<date: STRING,
                    age: INT>,
phone: STRING,
cell: STRING,
id: STRUCT<name: STRING,
            value: STRING>,
picture: STRUCT<large: STRING,
                medium: STRING,
                thumbnail: STRING>,
nat: STRING,
timestamp: STRING>
"""

In [None]:
# Parse each record's JSON payload into typed columns using `json_schema` and show the
# flattened fields next to the record timestamp.
# The query reads from vw_AwsomeAPIRecords, the temp view registered earlier in this
# notebook; no view named vw_kafka_0 is created anywhere in the notebook, so querying it
# fails on a fresh kernel.
# NOTE(review): the sample output earlier in the notebook only shows offsets up to 8, so
# `offset > 30` may return no rows - confirm the intended threshold.
spark.sql(f"""
SELECT ts, json.* FROM (
SELECT FROM_JSON(CAST(value AS STRING),'{json_schema}') AS json,
timestamp as ts
FROM vw_AwsomeAPIRecords
WHERE offset > 30
)
""").show()