In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, get_json_object
from pyspark.sql.types import StructType, StructField, StringType, LongType

In [3]:
spark = SparkSession.builder.appName("ReadingJson").getOrCreate()

#### CONSTRUCTORS JSON

In [4]:
constructors_schema = "constructorId INT, constructorRef STRING, name STRING, nationality STRING, url STRING"

In [6]:
df = spark.read.schema(constructors_schema).json("constructors.json")

In [7]:
display(df)

DataFrame[constructorId: int, constructorRef: string, name: string, nationality: string, url: string]

In [10]:
df.take(5)

[Row(constructorId=1, constructorRef='mclaren', name='McLaren', nationality='British', url='http://en.wikipedia.org/wiki/McLaren'),
 Row(constructorId=2, constructorRef='bmw_sauber', name='BMW Sauber', nationality='German', url='http://en.wikipedia.org/wiki/BMW_Sauber'),
 Row(constructorId=3, constructorRef='williams', name='Williams', nationality='British', url='http://en.wikipedia.org/wiki/Williams_Grand_Prix_Engineering'),
 Row(constructorId=4, constructorRef='renault', name='Renault', nationality='French', url='http://en.wikipedia.org/wiki/Renault_in_Formula_One'),
 Row(constructorId=5, constructorRef='toro_rosso', name='Toro Rosso', nationality='Italian', url='http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso')]

In [9]:
df.toPandas()

Unnamed: 0,constructorId,constructorRef,name,nationality,url
0,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
1,2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
2,3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Pr...
3,4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formul...
4,5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso
...,...,...,...,...,...
206,209,manor,Manor Marussia,British,http://en.wikipedia.org/wiki/Manor_Motorsport
207,210,haas,Haas F1 Team,American,http://en.wikipedia.org/wiki/Haas_F1_Team
208,211,racing_point,Racing Point,British,http://en.wikipedia.org/wiki/Racing_Point_F1_Team
209,213,alphatauri,AlphaTauri,Italian,http://en.wikipedia.org/wiki/Scuderia_AlphaTauri


#### DEBEZIUM JSON

In [42]:
db = spark.read.option("multiline", "true").json("sample.json")

In [43]:
db.printSchema()

root
 |-- payload: struct (nullable = true)
 |    |-- after: struct (nullable = true)
 |    |    |-- affiliateid: string (nullable = true)
 |    |    |-- amount: string (nullable = true)
 |    |    |-- city: string (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- currency: string (nullable = true)
 |    |    |-- ip_address: string (nullable = true)
 |    |    |-- merchant_name: string (nullable = true)
 |    |    |-- payment_method: string (nullable = true)
 |    |    |-- timestamp: long (nullable = true)
 |    |    |-- transaction_id: string (nullable = true)
 |    |    |-- user_id: string (nullable = true)
 |    |    |-- voucher_code: string (nullable = true)
 |    |-- before: string (nullable = true)
 |    |-- op: string (nullable = true)
 |    |-- source: struct (nullable = true)
 |    |    |-- connector: string (nullable = true)
 |    |    |-- db: string (nullable = true)
 |    |    |-- lsn: long (nullable = true)
 |    |    |-- name: string (nulla

In [49]:
db.select("payload.after.*").toPandas()

Unnamed: 0,affiliateid,amount,city,country,currency,ip_address,merchant_name,payment_method,timestamp,transaction_id,user_id,voucher_code
0,ef7b99bb-5be4-47f9-8136-75cae8d24331,18.04,Adamchester,Ghana,USD,125.111.176.120,"Perry, Miller and Zimmerman",debit_card,1709846057000000,fd48ad7b-f02a-4c6b-8bb0-56adbd677838,robert38,
