In [0]:
# Build the notebook's SparkSession, or reuse one that is already running.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Reading And Parsing JSON files/data").getOrCreate()

In [0]:
# Echo the SparkSession object so the notebook renders its summary.
spark

In [0]:
# Sample order documents used throughout the notebook. Both describe the same
# nested shape (order -> id / details / contact) so they can be parsed and
# flattened with the same logic.

# Single-line JSON (compact).
# Braces must balance exactly: a stray trailing "}" makes the document invalid
# for strict JSON parsers even if a lenient reader happens to tolerate it.
s1 = '{"order":{"id": "ORD1001", "details": {"customer": {"id": "CUST123", "name": "John Doe"}, "items": {"product": {"item_id": "ITM001", "name": "Product 1"}, "quantity": 2}}, "contact": {"email": "john.doe@example.com", "phone": "5551234567"}}}'

# Multi-line JSON (pretty printed)
m1 = '''
{
  "order": {
    "id": "ORD1002",
    "details": {
      "customer": {
        "id": "CUST456",
        "name": "Jane Smith"
      },
      "items": {
        "product": {
          "item_id": "ITM003",
          "name": "Product 3"
        },
        "quantity": 5
      }
    },
    "contact": {
      "email": "jane.smith@example.com",
      "phone": "5559876543"
    }
  }
}
'''

In [0]:
# Wrap each JSON string in its own single-element RDD.
rdd_s1, rdd_m1 = (
    spark.sparkContext.parallelize([payload]) for payload in (s1, m1)
)


In [0]:
# Turn each RDD of JSON strings into its own DataFrame.
df_s1, df_m1 = [spark.read.json(source) for source in (rdd_s1, rdd_m1)]

# When reading from a file instead:
# spark.read.option("multiline", True).json("file_path")

In [0]:
# Print both parsed DataFrames without truncating cell contents.
for parsed in (df_s1, df_m1):
    parsed.show(truncate=False)

+----------------------------------------------------------------------------------------------+
|order                                                                                         |
+----------------------------------------------------------------------------------------------+
|{{john.doe@example.com, 5551234567}, {{CUST123, John Doe}, {{ITM001, Product 1}, 2}}, ORD1001}|
+----------------------------------------------------------------------------------------------+

+--------------------------------------------------------------------------------------------------+
|order                                                                                             |
+--------------------------------------------------------------------------------------------------+
|{{jane.smith@example.com, 5559876543}, {{CUST456, Jane Smith}, {{ITM003, Product 3}, 5}}, ORD1002}|
+--------------------------------------------------------------------------------------------------+



- When you read JSON with spark.read.json(), Spark infers the schema from the JSON structure. Nested objects are kept as nested struct types, so a document whose only top-level key is `order` results in a single `order` column with a complex (struct) type.
- To get multiple top-level columns like contact, customer_id, order_id, order_line_items, you need to flatten the nested JSON structure by selecting the nested fields explicitly.

In [0]:
# Flatten df_m1: pull each nested field out of the `order` struct with
# dot-notation and give it a flat, top-level column name.
# `col` must be imported explicitly; without it this cell fails with a
# NameError on a fresh kernel.
from pyspark.sql.functions import col

df_flat_m1 = df_m1.select(
    col("order.id").alias("order_id"),
    col("order.details.customer.id").alias("customer_id"),
    col("order.details.customer.name").alias("customer_name"),
    col("order.details.items.product.item_id").alias("product_id"),
    col("order.details.items.product.name").alias("product_name"),
    col("order.details.items.quantity").alias("quantity"),
    col("order.contact.email").alias("contact_email"),
    col("order.contact.phone").alias("contact_phone")
)
df_flat_m1.show(truncate=False)

# The same select will work for the single-line DataFrame (df_s1) as long as
# its schema and nesting levels are the same.

+--------+-----------+-------------+----------+------------+--------+----------------------+-------------+
|order_id|customer_id|customer_name|product_id|product_name|quantity|contact_email         |contact_phone|
+--------+-----------+-------------+----------+------------+--------+----------------------+-------------+
|ORD1002 |CUST456    |Jane Smith   |ITM003    |Product 3   |5       |jane.smith@example.com|5559876543   |
+--------+-----------+-------------+----------+------------+--------+----------------------+-------------+



In [0]:
# With Schema (enforcing a schema and returning only the required columns).
# The schema is given as a DDL string and handed to the reader before the
# JSON source is loaded.
# NOTE(review): "Json file path" is a placeholder; replace it with a real path.
# NOTE(review): the schema names presumably have to match top-level keys in the
# target file; for the nested `order` documents used earlier in this notebook
# these three columns would likely come back null. Confirm against the file.
_schema = "order_id string, customer_id string, contact_phone string"
df_schema = spark.read.schema(_schema).json("Json file path")  # schema() is set on the reader; this cannot be done on an existing DataFrame

In [0]:
# Print the schema Spark inferred for df_m1 as an indented tree.
df_m1.printSchema()

# The same structure written as a DDL string, for enforcing/changing the schema when reading a JSON file:
# _schema = "order struct<contact struct<email string, phone string>, details struct<customer struct<id string, name string>, items struct<product struct<item_id string, name string>, quantity long>>>, id string>"

root
 |-- order: struct (nullable = true)
 |    |-- contact: struct (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- phone: string (nullable = true)
 |    |-- details: struct (nullable = true)
 |    |    |-- customer: struct (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |-- items: struct (nullable = true)
 |    |    |    |-- product: struct (nullable = true)
 |    |    |    |    |-- item_id: string (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- quantity: long (nullable = true)
 |    |-- id: string (nullable = true)



IMPORTANT NOTE
- Use `from_json`, `to_json`, and `explode` from pyspark.sql.functions.
- from_json: Parses a JSON string column into a StructType (i.e., a structured Spark DataFrame column), using a schema you supply. It's useful when you have JSON data stored as strings and want to extract its fields into separate columns.
- --> JSON string → StructType
- to_json: Converts a complex column (like a StructType or ArrayType) into a JSON string. This is useful for serializing structured data back into JSON format.
- --> StructType → JSON string
- explode: Use it to flatten/expand JSON data. It turns each element of an array (or map) column into its own row; nested struct fields are flattened by selecting them with dot notation instead.
