In [0]:
# Load the raw JSON in this cell so it runs on a fresh kernel: within the visible
# notebook, `df` was otherwise first assigned only in a later cell, so running the
# cells top to bottom raised NameError here.
df = spark.read.option("multiline", "true").json("/mnt/jsondata/data.json")

# Print the nested schema of the raw frame before any flattening is applied.
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street: string (nullable = true)
 |    |-- zip: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- email: string (nullable = true)
 |-- name: string (nullable = true)
 |-- orders: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- items: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- item_id: string (nullable = true)
 |    |    |    |    |-- price: long (nullable = true)
 |    |    |    |    |-- product: string (nullable = true)
 |    |    |    |    |-- quantity: long (nullable = true)
 |    |    |-- order_date: string (nullable = true)
 |    |    |-- order_id: string (nullable = true)



In [0]:
from pyspark.sql.functions import explode, col

# Read the source file with the reader's "multiline" option switched on.
df = (
    spark.read
    .option("multiline", "true")
    .json("/mnt/jsondata/data.json")
)

# Column groups, listed in the order they appear in the final output.
customer_fields = ["customer_id", "name", "email"]
address_fields = ["street", "city", "state", "zip"]
order_fields = ["order_id", "order_date"]
item_fields = ["item_id", "product", "quantity", "price"]

# Stage 1 - promote address.* to top-level columns and explode `orders`
# into one row per order.
# NOTE(review): per the PySpark docs, explode (unlike explode_outer) emits no row
# for a null/empty array - confirm that dropping customers without orders, and
# orders without items, is intended.
df_flat = df.select(
    *customer_fields,
    *[col("address." + field).alias(field) for field in address_fields],
    explode("orders").alias("order"),
)

# Stage 2 - promote the order attributes and explode `order.items`
# into one row per item.
df_order_flat = df_flat.select(
    *customer_fields,
    *address_fields,
    *[col("order." + field).alias(field) for field in order_fields],
    explode("order.items").alias("item"),
)

# Stage 3 - promote the item attributes to top-level columns.
final_df = df_order_flat.select(
    *customer_fields,
    *address_fields,
    *order_fields,
    *[col("item." + field).alias(field) for field in item_fields],
)

final_df.show(truncate=False)


+-----------+-----------+-----------------------+-------------+----------+------+------+--------+----------+-------+--------+--------+-----+
|customer_id|name       |email                  |street       |city      |state |zip   |order_id|order_date|item_id|product |quantity|price|
+-----------+-----------+-----------------------+-------------+----------+------+------+--------+----------+-------+--------+--------+-----+
|101        |Rahul Gupta|rahul.gupta@example.com|123 Sector 17|Chandigarh|Punjab|160017|ORD001  |2025-06-15|A1     |Laptop  |1       |75000|
|101        |Rahul Gupta|rahul.gupta@example.com|123 Sector 17|Chandigarh|Punjab|160017|ORD001  |2025-06-15|A2     |Mouse   |2       |700  |
|101        |Rahul Gupta|rahul.gupta@example.com|123 Sector 17|Chandigarh|Punjab|160017|ORD002  |2025-06-30|B1     |Keyboard|1       |1200 |
+-----------+-----------+-----------------------+-------------+----------+------+------+--------+----------+-------+--------+--------+-----+



In [0]:
# Configure the session with the storage account's access key so the abfss://
# paths used in this notebook can be read and written.
# The key is fetched at run time from a Databricks secret scope instead of being
# pasted into the notebook source.
# SECURITY: the literal key that used to live in this cell should be treated as
# compromised and rotated on the storage account.
# TODO(review): the scope and key names below are placeholders - create them, or
# point at the existing secret, before running.
spark.conf.set(
  "fs.azure.account.key.jsonstorage2025.dfs.core.windows.net",
  dbutils.secrets.get(scope="jsonstorage2025", key="storage-account-key")
)


In [0]:
# Persist the flattened frame as Parquet to both destinations, replacing any
# previous output at each path.
output_paths = [
    "/mnt/jsondata/flattened_output",
    "abfss://jsondata@jsonstorage2025.dfs.core.windows.net/output/flattened_output",
]
for output_path in output_paths:
    final_df.write.mode("overwrite").parquet(output_path)

In [0]:
# Sanity check: read the Parquet output back from ADLS and display it untruncated.
# NOTE(review): this rebinds `df`, which the load cell assigned to the raw JSON
# frame - cells re-run after this one will see the flattened data instead.
output_uri = "abfss://jsondata@jsonstorage2025.dfs.core.windows.net/output/flattened_output/"
df = spark.read.parquet(output_uri)
df.show(truncate=False)


+-----------+-----------+-----------------------+-------------+----------+------+------+--------+----------+-------+--------+--------+-----+
|customer_id|name       |email                  |street       |city      |state |zip   |order_id|order_date|item_id|product |quantity|price|
+-----------+-----------+-----------------------+-------------+----------+------+------+--------+----------+-------+--------+--------+-----+
|101        |Rahul Gupta|rahul.gupta@example.com|123 Sector 17|Chandigarh|Punjab|160017|ORD001  |2025-06-15|A1     |Laptop  |1       |75000|
|101        |Rahul Gupta|rahul.gupta@example.com|123 Sector 17|Chandigarh|Punjab|160017|ORD001  |2025-06-15|A2     |Mouse   |2       |700  |
|101        |Rahul Gupta|rahul.gupta@example.com|123 Sector 17|Chandigarh|Punjab|160017|ORD002  |2025-06-30|B1     |Keyboard|1       |1200 |
+-----------+-----------+-----------------------+-------------+----------+------+------+--------+----------+-------+--------+--------+-----+



In [0]:
%sql
-- Register the Parquet directory written by the earlier write cell as a table
-- named flattened_output, so it can be queried with SQL.
-- IF NOT EXISTS: nothing is created when a table with this name already exists.
-- NOTE(review): in that case an existing definition (possibly with a different
-- path) is presumably kept as-is - confirm that is acceptable on re-runs.
CREATE TABLE IF NOT EXISTS flattened_output
USING PARQUET
OPTIONS (
  path "abfss://jsondata@jsonstorage2025.dfs.core.windows.net/output/flattened_output/"
);

In [0]:
%sql
-- List the tables to confirm that flattened_output has been registered.
SHOW TABLES;

database,tableName,isTemporary
default,flattened_output,False
,_sqldf,True


In [0]:
%sql
-- Preview at most 10 rows of the registered table, with every column.
SELECT * FROM flattened_output LIMIT 10;

customer_id,name,email,street,city,state,zip,order_id,order_date,item_id,product,quantity,price
101,Rahul Gupta,rahul.gupta@example.com,123 Sector 17,Chandigarh,Punjab,160017,ORD001,2025-06-15,A1,Laptop,1,75000
101,Rahul Gupta,rahul.gupta@example.com,123 Sector 17,Chandigarh,Punjab,160017,ORD001,2025-06-15,A2,Mouse,2,700
101,Rahul Gupta,rahul.gupta@example.com,123 Sector 17,Chandigarh,Punjab,160017,ORD002,2025-06-30,B1,Keyboard,1,1200
