In [0]:
# Write a small nested JSON sample to the volume: a one-element array whose record
# holds a struct containing an array (`batters.batter`) and an array of structs
# (`topping`). The trailing True asks dbutils.fs.put to overwrite an existing file.
dbutils.fs.put("/Volumes/workspace/default/stream_vol/json/input/complex_json.json", 
               """ [{
	"id": "0001",
	"type": "donut",
	"name": "Cake",
	"ppu": 0.55,
	"batters":
		{
			"batter":
				[
					{ "id": "1001", "type": "Regular" },
					{ "id": "1002", "type": "Chocolate" },
					{ "id": "1003", "type": "Blueberry" },
					{ "id": "1004", "type": "Devil's Food" }
				]
		},
	"topping":
		[
			{ "id": "5001", "type": "None" },
			{ "id": "5002", "type": "Glazed" },
			{ "id": "5005", "type": "Sugar" },
			{ "id": "5007", "type": "Powdered Sugar" },
			{ "id": "5006", "type": "Chocolate with Sprinkles" },
			{ "id": "5003", "type": "Chocolate" },
			{ "id": "5004", "type": "Maple" }
		]
}]""" , True)

In [0]:
# Location of the sample file written by the previous cell.
complex_json_path = "/Volumes/workspace/default/stream_vol/json/input/complex_json.json"

# The file is a pretty-printed JSON array spread over many lines, so the reader
# needs multiline mode; by default Spark expects one JSON document per line.
json_reader = spark.read.option("multiline", "true")
df_json = json_reader.json(complex_json_path)

display(df_json)
df_json.printSchema()



How do we read and handle complex JSON files?

How do we read multiline complex JSON files?

The complex data types are:
1. Array
2. Struct
3. Map


In [0]:
# Dot notation reaches into the `batters` struct and returns its `batter` array
# as a single column (still one row per input record).
batter_array_df = df_json.select("batters.batter")
display(batter_array_df)

In [0]:
# Print the schema of just the nested `batter` column to see its element type.
batter_schema_view = df_json.select("batters.batter")
batter_schema_view.printSchema()

In [0]:
from pyspark.sql.functions import explode , col 

# explode() emits one output row per element of the `batter` array; with no
# alias the generated column takes explode's default name, `col`.
batter_rows_df = df_json.select(explode("batters.batter"))
batter_rows_df.display()

In [0]:
# Explode both nested arrays. Each explode() produces one row per array element,
# so after both steps there is one row per (topping, batter) combination.
df_exploded = (
    df_json
    .withColumn("topping_explode", explode("topping"))
    .withColumn("batters_explode", explode("batters.batter"))
)

# Lift the struct fields to top-level columns and leave the helper structs behind.
df_final = df_exploded.select(
    "id",
    "type",
    "name",
    col("topping_explode.id").alias("topping_id"),
    col("topping_explode.type").alias("topping_type"),
    col("batters_explode.id").alias("batter_id"),
    col("batters_explode.type").alias("batter_type"),
)

df_final.display()


In [0]:
# Print the schema of the flattened DataFrame built in the previous cell.
df_final.printSchema()


In [0]:
# In-memory sample data: two customer records. Each record has two struct-like
# fields (`name`, `address`) and an `orders` list whose entries each carry a
# nested `items` list, i.e. an array of structs inside an array of structs.
json_data = [
    {
        "customer_id": 101,
        "status": "active",
        "name": {"first": "John", "last": "Doe"},
        "address": {"street": "123 Elm St", "city": "Springfield", "zip": "62704"},
        "orders": [
            {
                "order_id": "A001",
                "amount": 250,
                "items": [
                    {"product": "Laptop", "qty": 1},
                    {"product": "Mouse", "qty": 2}
                ]
            },
            {
                "order_id": "A002",
                "amount": 150,
                "items": [
                    {"product": "Keyboard", "qty": 1}
                ]
            }
        ]
    },
    {
        "customer_id": 102,
        "status": "active",
        "name": {"first": "John1", "last": "Doe1"},
        "address": {"street": "123 Elm St", "city": "Springfield", "zip": "62704"},
        "orders": [
            {
                "order_id": "A003",
                "amount": 250,
                "items": [
                    {"product": "Laptop", "qty": 1},
                    {"product": "Mouse", "qty": 2}
                ]
            },
            {
                "order_id": "A004",
                "amount": 150,
                "items": [
                    {"product": "Keyboard", "qty": 1}
                ]
            }
        ]
    }
]

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType


# Explicit schema for the customer records, assembled from the innermost struct
# outwards so that every nesting level has a readable name.

# One line item inside an order.
item_struct = StructType([
    StructField("product", StringType(), True),
    StructField("qty", IntegerType(), True),
])

# One order, holding an array of line items.
order_struct = StructType([
    StructField("order_id", StringType(), True),
    StructField("amount", IntegerType(), True),
    StructField("items", ArrayType(item_struct), True),
])

name_struct = StructType([
    StructField("first", StringType(), True),
    StructField("last", StringType(), True),
])

address_struct = StructType([
    StructField("street", StringType(), True),
    StructField("city", StringType(), True),
    StructField("zip", StringType(), True),
])

# Top-level record: two scalars, two structs and an array of orders.
schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("status", StringType(), True),
    StructField("name", name_struct, True),
    StructField("address", address_struct, True),
    StructField("orders", ArrayType(order_struct), True),
])

# Build the DataFrame from the Python list of dicts, applying the explicit
# schema instead of letting Spark infer the types.
df = spark.createDataFrame(json_data, schema=schema)

In [0]:
# Plain-text preview of the DataFrame (show() defaults to the first 20 rows and
# truncates long cell values).
df.show()

In [0]:
# Print the schema tree of df, including the nested struct and array fields.
df.printSchema()

In [0]:
# Step 1: promote the fields of the `name` and `address` structs to top-level
# columns. The `orders` array is carried through untouched for the next step.
struct_flattened_cols = [
    col("customer_id"),
    col("status"),
    col("name.first"),
    col("name.last"),
    col("address.street"),
    col("address.city"),
    col("address.zip"),
    col("orders"),
]
df_orders = df.select(*struct_flattened_cols)

df_orders.display()
df_orders.printSchema()





In [0]:
# Step 2: explode the `orders` array into one row per order. The already-flat
# customer columns are passed through unchanged, and the exploded element keeps
# the name `orders` via the alias.
customer_passthrough = [
    col(name)
    for name in ("customer_id", "status", "first", "last", "street", "city", "zip")
]
df_orders_explode = df_orders.select(
    *customer_passthrough,
    explode("orders").alias("orders"),
)
df_orders_explode.display()
df_orders_explode.printSchema()

In [0]:
# Step 3: lift `order_id` and `amount` out of the order struct and explode its
# `items` array, giving one row per (customer, order, item).
order_passthrough = [
    col(name)
    for name in ("customer_id", "status", "first", "last", "street", "city", "zip")
]
df_items = df_orders_explode.select(
    *order_passthrough,
    col("orders.order_id"),
    col("orders.amount"),
    explode("orders.items").alias("items"),
)

display(df_items)


In [0]:
# Step 4: lift `product` and `qty` out of the `items` struct so that every
# column in the displayed result is a top-level scalar.
flat_column_names = [
    "customer_id",
    "status",
    "first",
    "last",
    "street",
    "city",
    "zip",
    "order_id",
    "amount",
    "items.product",
    "items.qty",
]
df_items.select(*[col(name) for name in flat_column_names]).display()

# NOTE(review): this prints the schema of df_items, where `items` is still a
# struct column, not of the flattened selection displayed above. Confirm which
# one is intended.
df_items.printSchema()
