In [1]:
# Spark Session
from pyspark.sql import SparkSession

# Build (or fetch an already-running) session for this notebook on the
# "local[*]" master.
spark = SparkSession.builder.appName(
    "Reading and Parsing JSON Files/Data"
).master("local[*]").getOrCreate()

# Echo the session object as the cell's output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/24 14:44:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read Single line JSON file
# Uses the same project-relative location as the other reads of
# order_singleline.json in this notebook, rather than a machine-specific
# absolute path, so the cell also runs on other checkouts.

df_single = spark.read.format("json").load("data/input/order_singleline.json")

In [4]:
# Inspect the schema Spark inferred for the single-line JSON read (no schema was supplied).
df_single.printSchema()

root
 |-- _corrupt_record: string (nullable = true)



In [5]:
# NOTE(review): in the recorded run this call raised AnalysisException because the
# only column inferred for df_single was _corrupt_record.
# TODO: confirm the input file really holds one complete JSON object per line.
df_single.show()

AnalysisException: Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the
referenced columns only include the internal corrupt record column
(named _corrupt_record by default). For example:
spark.read.schema(schema).csv(file).filter($"_corrupt_record".isNotNull).count()
and spark.read.schema(schema).csv(file).select("_corrupt_record").show().
Instead, you can cache or save the parsed results and then send the same query.
For example, val df = spark.read.schema(schema).csv(file).cache() and then
df.filter($"_corrupt_record".isNotNull).count().

In [None]:
# Read Multiline JSON file
# The multiLine option is switched on for this read because the records in
# this file are not confined to a single line each.

df_multi = (
    spark.read
    .format("json")
    .option("multiLine", True)
    .load("data/input/order_multiline.json")
)

In [None]:
# Inspect the schema inferred for the multi-line JSON read.
df_multi.printSchema()

In [None]:
# Preview the rows read from the multi-line JSON file.
df_multi.show()

In [None]:
# Load the single-line JSON file through the plain "text" source instead of
# the JSON source, so its content arrives unparsed.
df = (
    spark.read
    .format("text")
    .load("data/input/order_singleline.json")
)

In [None]:
# Inspect the schema produced by the text-format read.
df.printSchema()

In [None]:
# truncate=False so the text of each row is displayed in full.
df.show(truncate=False)

In [None]:
# With Schema

# Explicit schema string covering customer_id, order_id and contact only.
_schema = "customer_id string, order_id string, contact array<long>"

# Read the single-line JSON file with that schema applied instead of inference.
df_schema = (
    spark.read
    .format("json")
    .schema(_schema)
    .load("data/input/order_singleline.json")
)

In [None]:
# Preview the rows read with the explicit three-column schema.
df_schema.show()

In [None]:
root
 |-- contact: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_line_items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- amount: double (nullable = true)
 |    |    |-- item_id: string (nullable = true)
 |    |    |-- qty: long (nullable = true)

In [None]:
# Full schema string, including the nested order_line_items array of structs.
# Note that contact is declared here as array<string>, whereas the schema
# listing in this notebook shows its elements as long.
_schema = "contact array<string>, customer_id string, order_id string, order_line_items array<struct<amount double, item_id string, qty long>>"

In [None]:
# Read the single-line JSON file again, applying the current value of _schema.
df_schema_new = (
    spark.read
    .format("json")
    .schema(_schema)
    .load("data/input/order_singleline.json")
)

In [None]:
# Check the schema of the read that used the explicit _schema string.
df_schema_new.printSchema()

In [None]:
# Preview the rows read with the explicit _schema string.
df_schema_new.show()

In [None]:
# Function from_json to read from a column
from pyspark.sql.functions import from_json

# Schema string describing the JSON held in each row of the text column.
_schema = "contact array<string>, customer_id string, order_id string, order_line_items array<struct<amount double, item_id string, qty long>>"

# Add a "parsed" column by applying from_json with _schema to df's "value" column.
df_expanded = df.withColumn("parsed", from_json(df["value"], _schema))


In [None]:
# Inspect the schema after adding the "parsed" column.
df_expanded.printSchema()

In [None]:
# Preview df_expanded, which now carries the "parsed" column next to the original text.
df_expanded.show()

In [None]:
# Function to_json to serialize the parsed column back into a JSON string
from pyspark.sql.functions import to_json

# Add an "unparsed" column holding the JSON text produced from "parsed".
df_unparsed = df_expanded.withColumn("unparsed", to_json(df_expanded["parsed"]))

In [None]:
# Inspect the schema after adding the "unparsed" column.
df_unparsed.printSchema()

In [None]:
# Show only the "unparsed" column; truncate=False so its text is displayed in full.
df_unparsed.select("unparsed").show(truncate=False)

In [None]:
# Get values from Parsed JSON
# "parsed.*" selects every field nested under the "parsed" column.

df_1 = df_expanded.select("parsed.*")

In [None]:
from pyspark.sql.functions import explode

# Add "expanded_line_items" by exploding the order_line_items array column.
df_2 = df_1.withColumn(
    "expanded_line_items",
    explode(df_1["order_line_items"]),
)

In [None]:
# Preview the result of exploding order_line_items.
df_2.show()

In [None]:
# Keep the order-level columns and pull every field nested under
# expanded_line_items out alongside them.
df_3 = df_2.select(
    "contact",
    "customer_id",
    "order_id",
    "expanded_line_items.*",
)

In [None]:
# Preview the order columns together with the selected line-item fields.
df_3.show()

In [None]:
# Explode Array fields
# Add "contact_expanded" by exploding the contact array column.
df_final = df_3.withColumn(
    "contact_expanded",
    explode(df_3["contact"]),
)


In [None]:
# Inspect the schema after adding contact_expanded.
df_final.printSchema()

In [None]:
# Display the result without the original contact array column;
# contact_expanded, derived from it, stays in the output.
df_final.drop("contact").show()