### Read RAW JSON file

In [0]:
import os
# Show which raw JSON samples are present in the local input directory.
input_dir_entries = os.listdir("/data/input/")
print(input_dir_entries)

['order_singleline.json', 'order_multiline.json']


In [0]:
# Copy the multi-line sample from the local filesystem (file:) into DBFS (dbfs:).
multiline_local_uri = "file:/data/input/order_multiline.json"
multiline_dbfs_uri = "dbfs:/data/input/order_multiline.json"
dbutils.fs.cp(multiline_local_uri, multiline_dbfs_uri)

Out[11]: True

In [0]:
# Copy the single-line sample from the local filesystem (file:) into DBFS (dbfs:).
singleline_local_uri = "file:/data/input/order_singleline.json"
singleline_dbfs_uri = "dbfs:/data/input/order_singleline.json"
dbutils.fs.cp(singleline_local_uri, singleline_dbfs_uri)

Out[12]: True

In [0]:

# The file holds JSON records that span several lines, so the reader's
# "multiline" option is switched on before loading it.
df = (
    spark.read
    .option("multiline", True)
    .json("/data/input/order_multiline.json")
)
display(df)

customer,items,order_id,total
Alice,"List(List(1200, Laptop, 1), List(25, Mouse, 2))",1,1250
Bob,"List(List(800, Phone, 1))",2,800


### Define Python Function to flatten deeply nested JSON files

In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, explode_outer
from pyspark.sql.types import StructType, ArrayType

def _quote_column_name(name: str) -> str:
    """Wrap a column name in backticks so Spark reads it as one literal identifier."""
    # Unquoted, col("a.b") is parsed as "field b of column a" instead of the
    # column literally named "a.b". A backtick inside the name is escaped by doubling it.
    return "`" + name.replace("`", "``") + "`"


def flatten_any_json(df: DataFrame, prefix: str = "") -> DataFrame:
    """
    Flatten every StructType column and explode every ArrayType column of a DataFrame.

    The DataFrame is rewritten in passes until its schema contains no struct or
    array columns:

    * A struct column ``parent`` is replaced by one column per field, named
      ``parent_<field>``. The new columns are appended after the existing ones
      and the struct column itself is dropped.
    * An array column is replaced, under the same name and in the same position,
      by ``explode_outer`` of itself: one row per element, while rows whose
      array is null or empty are kept with a null value.

    Arrays of structs and arrays of arrays are handled because the element type
    left behind by an explode is picked up again on the next pass.

    Args:
        df: DataFrame to flatten.
        prefix: Currently unused; accepted only so existing callers that pass it
            keep working.

    Returns:
        A DataFrame without StructType or ArrayType columns.

    Note:
        Generated names are not checked against existing columns, so a struct
        field ``a.b`` and a pre-existing column ``a_b`` would end up sharing a name.
    """
    while True:
        # Re-read the schema on every pass: expanding a struct or exploding an
        # array can expose further nested columns.
        complex_fields = [(field.name, field.dataType) for field in df.schema.fields
                          if isinstance(field.dataType, (StructType, ArrayType))]
        if not complex_fields:
            break
        for col_name, dtype in complex_fields:
            # Quote the name so columns containing dots or other special
            # characters resolve to the column itself, not to a nested path.
            parent = col(_quote_column_name(col_name))
            if isinstance(dtype, StructType):
                # getField takes the field name literally, so sub-field names
                # with dots are safe as well.
                expanded = [parent.getField(subfield.name).alias(f"{col_name}_{subfield.name}")
                            for subfield in dtype.fields]
                df = df.select("*", *expanded).drop(col_name)
            elif isinstance(dtype, ArrayType):
                # Outer explode keeps rows whose array is null or empty.
                df = df.withColumn(col_name, explode_outer(parent))
    return df

# Usage:
# df = spark.read.option("multiline", True).json("path/to/your.json")
# flat_df = flatten_any_json(df)
# flat_df.show(truncate=False)

### Apply Flattening function and display JSON data

In [0]:
# Flatten the nested order data with the helper defined in this notebook
# (flatten_any_json); calling the undefined name `flatten` raises NameError
# on a fresh kernel.
df_flatten = flatten_any_json(df)
display(df_flatten)

customer,items_price,items_product,items_quantity,order_id,total
Alice,1200,Laptop,1,1,1250
Alice,25,Mouse,2,1,1250
Bob,800,Phone,1,2,800


**How it works:**

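- Expands every nested struct into top-level columns named `<parent>_<field>`.
- Explodes every array into one row per element (including arrays of arrays), using `explode_outer` so rows with a null or empty array are kept.
- Repeats both steps until no struct or array columns remain, so it handles any depth of nesting and any combination of structs and arrays in the JSON.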


> **Note:**
>
> - For very large or deeply nested JSON, the resulting DataFrame may be very wide, and it may have many rows: exploding several arrays in the same record produces the Cartesian product of their elements.
> - This function is generic and works for almost any JSON structure you encounter in Spark, as long as the nesting consists of structs and arrays.