In [1]:
from pyspark.sql import SparkSession

spark = (SparkSession.builder.getOrCreate())

In [2]:
import pyspark.sql.functions as F

spark.read.parquet("output/UsefullAPIResults.parquet").orderBy(F.desc("timestamp")).show()

#key - partitioning key
#value - the data, in binary format. This is our JSON payload. We'll need to cast it to STRING.
#topic - the topic we are subscribing to
#partition 
#offset - the offset value. This is per topic, partition, and consumer group
#timestamp - the timestamp
#timestampType - whether timestamp is created time or log append time (by default created time)

+----+--------------------+-----------------+---------+------+--------------------+-------------+
| key|               value|            topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----------------+---------+------+--------------------+-------------+
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   199|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   198|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   197|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   196|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   195|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   194|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   193|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 6

In [3]:
spark.read.parquet("output/UsefullAPIResults.parquet/*.parquet").createOrReplaceTempView("vw_UsefullAPIResults")

In [4]:
spark.sql("SELECT * FROM vw_UsefullAPIResults ORDER BY offset desc").show()

+----+--------------------+-----------------+---------+------+--------------------+-------------+
| key|               value|            topic|partition|offset|           timestamp|timestampType|
+----+--------------------+-----------------+---------+------+--------------------+-------------+
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   199|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   198|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   197|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   196|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   195|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   194|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 63 6...|UsefullAPIResults|        2|   193|2022-08-05 07:23:...|            0|
|null|[7B 22 52 65 6

In [16]:
spark.sql("SELECT CAST(key AS STRING) key, CAST(value AS STRING) value, timestamp AS ts FROM vw_UsefullAPIResults").show(20, 50)

+----+--------------------------------------------------+-----------------------+
| key|                                             value|                     ts|
+----+--------------------------------------------------+-----------------------+
|null|{"RecordedAtTime": "2022-08-05T09:20:19.7140615...|2022-08-05 07:22:06.227|
|null|{"RecordedAtTime": "2022-08-05T09:11:58.706+02:...|2022-08-05 07:22:06.276|
|null|{"RecordedAtTime": "2022-08-05T09:20:13.079+02:...|2022-08-05 07:22:06.303|
|null|{"RecordedAtTime": "2022-08-05T09:21:53.072+02:...|2022-08-05 07:22:06.339|
|null|{"RecordedAtTime": "2022-08-05T08:59:05.8+02:00...|2022-08-05 07:22:06.394|
|null|{"RecordedAtTime": "2022-08-05T09:22:04+02:00",...|2022-08-05 07:22:06.423|
|null|{"RecordedAtTime": "2022-08-05T09:21:17+02:00",...|2022-08-05 07:22:07.004|
|null|{"RecordedAtTime": "2022-08-05T09:21:52.105+02:...|2022-08-05 07:22:07.173|
|null|{"RecordedAtTime": "2022-08-05T09:21:03.128+02:...|2022-08-05 07:22:07.249|
|null|{"Recorded

In [24]:
json_schema = """
STRUCT<
RecordedAtTime: STRING,
VehicleMode: STRING,
OperatorRef: STRING,
VehicleRef: STRING,
ArrivalStatus: STRING,
VehicleAtStop: STRING,
VehicleLatitude: FLOAT,
VehicleLongitude: FLOAT
>
"""

In [25]:
spark.sql(f"""
SELECT ts, json.* FROM (
SELECT FROM_JSON(CAST(value AS STRING),'{json_schema}') AS json,
timestamp as ts
FROM vw_UsefullAPIResults
WHERE offset > 30
)
""").show()

+--------------------+--------------------+-----------+-----------------+-------------+--------------------+-------------+---------------+----------------+
|                  ts|      RecordedAtTime|VehicleMode|      OperatorRef|   VehicleRef|       ArrivalStatus|VehicleAtStop|VehicleLatitude|VehicleLongitude|
+--------------------+--------------------+-----------+-----------------+-------------+--------------------+-------------+---------------+----------------+
|2022-08-05 07:22:...|2022-08-05T09:22:...|        BUS|              160|         2133|             DELAYED|         null|      63.284897|       10.284815|
|2022-08-05 07:22:...|2022-08-05T09:19:...|       null|          Unibuss|       103081|{"OnwardCall":[{"...|         true|        59.8956|       10.802183|
|2022-08-05 07:22:...|2022-08-05T09:18:...|        BUS|               80|       811222|             ON_TIME|         null|       59.00076|         9.74867|
|2022-08-05 07:22:...|2022-08-05T09:18:...|        BUS|         