In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, explode, size
from pyspark.sql.types import StructType, StringType, IntegerType, ArrayType
# Bind the session to `spark`: every later cell calls spark.createDataFrame / spark.read,
# so the name must exist even where the runtime does not pre-define it.
# getOrCreate() hands back the already-running session when there is one.
spark = SparkSession.builder.appName("Spark DF").getOrCreate()

In [0]:
# Flat records: every value is a scalar, so each key is a plain top-level field.
sampleDf = [
    dict(id=1, name="Alice", age=30),
    dict(id=2, name="Bob", age=25),
    dict(id=3, name="Charlie", age=35),
]

> To access nested fields in data like the example below, use dot notation: `col("address.city")` or `select("address.city")`.

In [0]:
# Nested records: each person carries an `address` dict holding `city` and `zip`.
sampleDf = [
    {"id": 1, "name": "Alice", "address": {"city": "New York", "zip": "10001"}},
    {"id": 2, "name": "Bob", "address": {"city": "Los Angeles", "zip": "90001"}},
]


In [0]:
# Build a DataFrame from the list of dicts; no schema is passed, so column types are inferred.
df = spark.createDataFrame(sampleDf)

In [0]:
# Print the inferred schema as a tree to check column names and types.
df.printSchema()

root
 |-- address: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)



In [0]:
# Preview the rows; long cell values are cut short in the default display.
df.show()

+--------------------+---+-----+
|             address| id| name|
+--------------------+---+-----+
|{zip -> 10001, ci...|  1|Alice|
|{zip -> 90001, ci...|  2|  Bob|
+--------------------+---+-----+



In [0]:
# Dot notation reaches inside the nested `address` value and pulls out `zip`.
(
    df
    .select(col("address.zip"))
    .show()
)

+-----+
|  zip|
+-----+
|10001|
|90001|
+-----+



JSON with Arrays

- `explode("skills")` turns each array element into its own row

- Array functions such as `size()` and `array_contains()` work directly on the array column

In [0]:
# Records with an array-valued field: `skills` is a list of strings of varying length.
sampleDf = [
    {"id": 1, "name": "Alice", "skills": ["Python", "Spark", "SQL"]},
    {"id": 2, "name": "Bob", "skills": ["Java", "Scala"]},
]



In [0]:
# Rebuild `df` from the array example; `skills` is passed as a Python list per record.
df = spark.createDataFrame(sampleDf)

In [0]:
# explode() emits one row per array element, repeating `name` for each skill.
(
    df
    .select("name", explode("skills").alias("skills"))
    .show()
)

+-----+------+
| name|skills|
+-----+------+
|Alice|Python|
|Alice| Spark|
|Alice|   SQL|
|  Bob|  Java|
|  Bob| Scala|
+-----+------+



In [0]:
# size() counts the elements of the array; store it next to the original columns.
(
    df
    .withColumn("skill_count", size(col("skills")))
    .show()
)

+---+-----+--------------------+-----------+
| id| name|              skills|skill_count|
+---+-----+--------------------+-----------+
|  1|Alice|[Python, Spark, SQL]|          3|
|  2|  Bob|       [Java, Scala]|          2|
+---+-----+--------------------+-----------+



 Parsing JSON Strings with from_json()

In [0]:
# `json_data` is a plain string that happens to contain JSON; it is not parsed yet.
sampleDf = [
    dict(id=1, json_data='{"product":"Laptop","price":900}'),
]

# Load the records as-is; `json_data` stays a string column until it is parsed with from_json().
df = spark.createDataFrame(sampleDf)


In [0]:
# Show the raw frame: the JSON payload is still a single (truncated) string value.
df.show()

+---+--------------------+
| id|           json_data|
+---+--------------------+
|  1|{"product":"Lapto...|
+---+--------------------+



In [0]:
# Describe the shape of the JSON payload so from_json() knows which fields to extract.
schema = (
    StructType()
    .add("product", StringType())
    .add("price", IntegerType())
)

# Parse the string column into a struct column named `data`.
df_parsed = df.withColumn("data", from_json(col("json_data"), schema))

# Pull the struct's fields up next to `id` using dot notation.
(
    df_parsed
    .select("id", "data.product", "data.price")
    .show()
)

+---+-------+-----+
| id|product|price|
+---+-------+-----+
|  1| Laptop|  900|
+---+-------+-----+



 Handling Nulls & Missing Fields

In [0]:
# Three flavours of "missing": a complete record, an explicit None, and absent keys.
sampleDf = [
    dict(id=1, name="Alice", age=30),
    dict(id=2, name=None),
    dict(id=3),
]

# Records have differing key sets here; the resulting frame is inspected in the next cell.
df = spark.createDataFrame(sampleDf)


In [0]:
# Raw frame first, so the nulls are visible before any clean-up.
df.show()

# Replace nulls column by column: a placeholder for `name`, zero for `age`.
df.fillna({"name": "Unknown", "age": 0}).show()

# Keep only the rows in which no column is null.
df.dropna().show()


+----+---+-----+
| age| id| name|
+----+---+-----+
|  30|  1|Alice|
|null|  2| null|
|null|  3| null|
+----+---+-----+

+---+---+-------+
|age| id|   name|
+---+---+-------+
| 30|  1|  Alice|
|  0|  2|Unknown|
|  0|  3|Unknown|
+---+---+-------+

+---+---+-----+
|age| id| name|
+---+---+-----+
| 30|  1|Alice|
+---+---+-----+



 Reading Multiline JSON
 
 Below are two approaches: building the DataFrame with **`createDataFrame`** and **reading the file from DBFS**.

 Problem
 - By default, Spark assumes each line of the file is a separate JSON record, so a record that spans several lines will fail to parse or be parsed incorrectly unless the `multiline` option is set.

In [0]:
# A single record (a dict, not a list) with two levels of nesting under `details`.
sampleDf = dict(
    id=1,
    name="Alice",
    details=dict(
        hobbies=["reading", "cycling"],
        education=dict(degree="Masters", field="Computer Science"),
    ),
)
# createDataFrame is given a list of records here, so the single dict is wrapped in a list.
df = spark.createDataFrame([sampleDf])


In [0]:
# Preview the one-row frame; the nested `details` value is truncated in the default display.
df.show()

+--------------------+---+-----+
|             details| id| name|
+--------------------+---+-----+
|{hobbies -> [read...|  1|Alice|
+--------------------+---+-----+



In [0]:
# Reach the nested array with dot notation, then emit one row per hobby.
(
    df
    .select("name", explode(col("details.hobbies")).alias("hobby"))
    .show()
)


+-----+-------+
| name|  hobby|
+-----+-------+
|Alice|reading|
|Alice|cycling|
+-----+-------+



To read a JSON document that spans multiple lines from a file, set the `multiline` option:


In [0]:
# Read the file with the multiline option turned on.
df = (
    spark.read
    .option("multiline", "true")
    .json("dbfs:/FileStore/tables/multiline.json")
)

# Walk two levels down (details -> education -> field) with a dotted path.
(
    df
    .select("name", "details.education.field")
    .show()
)

+-----+----------------+
| name|           field|
+-----+----------------+
|Alice|Computer Science|
+-----+----------------+



Handling Malformed JSON

Problem
- You’re reading a file where some records are corrupted or incomplete. By default, Spark tries to read every line as valid JSON.


| Mode            | Behavior                                                  |
| --------------- | --------------------------------------------------------- |
| `PERMISSIVE`    | Keeps corrupt records in `_corrupt_record` column         |
| `DROPMALFORMED` | Skips bad rows altogether                                 |
| `FAILFAST`      | Stops execution immediately if any malformed row is found |


In [0]:
# PERMISSIVE: load every line and keep the unparseable ones for inspection.
df = (
    spark.read
    .option("mode", "PERMISSIVE")
    .json("dbfs:/FileStore/tables/malformed.json")
)
df.show()

# DROPMALFORMED: read the same file again, this time discarding the bad records.
df_drop = (
    spark.read
    .option("mode", "DROPMALFORMED")
    .json("dbfs:/FileStore/tables/malformed.json")
)
df_drop.show()


+--------------------+----+-----+
|     _corrupt_record|  id| name|
+--------------------+----+-----+
|                null|   1|Alice|
|                null|   2|  Bob|
|{ "id": 3, "name"...|null| null|
+--------------------+----+-----+

+---+-----+
| id| name|
+---+-----+
|  1|Alice|
|  2|  Bob|
+---+-----+



 Deeply Nested JSON

In [0]:
# Load the nested document with the multiline option and inspect its structure.
df = (
    spark.read
    .option("multiline", "true")
    .json("dbfs:/FileStore/tables/deeplyNested.json")
)
df.printSchema()

# Each dotted path names the fields to walk through on the way down to a leaf.
df.select("user.profile.name", "user.profile.contacts.email").show(truncate=False)


root
 |-- user: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- profile: struct (nullable = true)
 |    |    |-- contacts: struct (nullable = true)
 |    |    |    |-- email: string (nullable = true)
 |    |    |    |-- phone: string (nullable = true)
 |    |    |-- name: string (nullable = true)

+-----+-----------------+
|name |email            |
+-----+-----------------+
|Alice|alice@example.com|
+-----+-----------------+

