In [42]:
# Spark Session
from pyspark.sql import SparkSession

# Create (or reuse, via getOrCreate) a session with this notebook's app name,
# using the "local[*]" master.
spark = SparkSession.builder.appName("Reading and Parsing JSON Files/Data").master("local[*]").getOrCreate()

# Bare expression so the notebook renders the session object as cell output.
spark

ConnectionRefusedError: [Errno 61] Connection refused

In [36]:
# Load the single-line order file through the JSON reader (no explicit schema
# is supplied here), then print the resulting schema and rows.
df_single = spark.read.json("data/order_singleline.json")
df_single.printSchema()
df_single.show()

ConnectionRefusedError: [Errno 61] Connection refused

In [37]:
# Load the multi-line order file; the "multiline" reader option is switched on
# for this file, unlike the single-line read.
df_multiline = (
    spark.read
    .format("json")
    .option("multiline", True)
    .load("data/order_multiline.json")
)
df_multiline.printSchema()
df_multiline.show()

ConnectionRefusedError: [Errno 61] Connection refused

In [38]:
# Read the same single-line JSON file with the plain-text reader instead of
# the JSON reader, so the content is loaded as text rather than parsed.
# df_single_text is reused later as the input to from_json.
df_single_text = spark.read.text("data/order_singleline.json")
df_single_text.printSchema()
df_single_text.show()

ConnectionRefusedError: [Errno 61] Connection refused

In [39]:
# Read with an explicit schema (DDL string) that lists only three fields:
# contact, customer_id and order_id.
_schema = "contact array<long>, customer_id string, order_id string"

df_singleline_schema_1 = (
    spark.read
    .format("json")
    .schema(_schema)
    .load("data/order_singleline.json")
)
df_singleline_schema_1.printSchema()
df_singleline_schema_1.show()


ConnectionRefusedError: [Errno 61] Connection refused

In [40]:
# Read with an explicit schema that additionally declares order_line_items as
# an array of structs (amount, item_id, qty).
_schema = "contact array<long>, customer_id string, order_id string, order_line_items array<struct<amount double, item_id string, qty long>> "

df_singleline_schema_2 = (
    spark.read
    .format("json")
    .schema(_schema)
    .load("data/order_singleline.json")
)
df_singleline_schema_2.printSchema()
df_singleline_schema_2.show()

ConnectionRefusedError: [Errno 61] Connection refused

In [41]:
# Parse the text column with from_json, using an explicit schema (DDL string)
_schema = "contact array<long>, customer_id string, order_id string, order_line_items array<struct<amount double, item_id string, qty long>> "

from pyspark.sql.functions import from_json

# df_single_text is the DataFrame loaded with the text reader in an earlier
# cell; its string column "value" is parsed into a new column named "parsed".
df_expanded = df_single_text.withColumn(
    "parsed",
    from_json(df_single_text["value"], _schema),
)
df_expanded.show(truncate=False)
df_expanded.printSchema()

ConnectionRefusedError: [Errno 61] Connection refused

In [24]:
# Function to_json: convert the "parsed" struct column back into a JSON string
# column ("unparsed") -- the reverse direction of the from_json step.

from pyspark.sql.functions import to_json

df_unparsed = df_expanded.withColumn(
    "unparsed",
    to_json(df_expanded["parsed"]),
)
df_unparsed.printSchema()
df_unparsed.show()

root
 |-- value: string (nullable = true)
 |-- parsed: struct (nullable = true)
 |    |-- contact: array (nullable = true)
 |    |    |-- element: long (containsNull = true)
 |    |-- customer_id: string (nullable = true)
 |    |-- order_id: string (nullable = true)
 |    |-- order_line_items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- amount: double (nullable = true)
 |    |    |    |-- item_id: string (nullable = true)
 |    |    |    |-- qty: long (nullable = true)
 |-- unparsed: string (nullable = true)

+--------------------+--------------------+--------------------+
|               value|              parsed|            unparsed|
+--------------------+--------------------+--------------------+
|{"order_id":"O101...|{[9000010000, 900...|{"contact":[90000...|
+--------------------+--------------------+--------------------+



In [27]:
# Get values from Parsed JSON

from pyspark.sql.functions import explode

# Promote every field of the "parsed" struct to a top-level column.
df_pre_exploded = df_expanded.select("parsed.*")

# Add one row per element of the order_line_items array; the element goes into
# the new struct column "exploded_order_items" while the original array column
# is kept alongside it.
df_exploded = df_pre_exploded.withColumn(
    "exploded_order_items",
    explode(df_pre_exploded["order_line_items"]),
)
df_exploded.printSchema()
df_exploded.show()

root
 |-- contact: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_line_items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- amount: double (nullable = true)
 |    |    |-- item_id: string (nullable = true)
 |    |    |-- qty: long (nullable = true)
 |-- exploded_order_items: struct (nullable = true)
 |    |-- amount: double (nullable = true)
 |    |-- item_id: string (nullable = true)
 |    |-- qty: long (nullable = true)

+--------------------+-----------+--------+--------------------+--------------------+
|             contact|customer_id|order_id|    order_line_items|exploded_order_items|
+--------------------+-----------+--------+--------------------+--------------------+
|[9000010000, 9000...|       C001|    O101|[{102.45, I001, 6...|   {102.45, I001, 6}|
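|[9000010000, 9000...|       C001|    O101|[{102.45, I001, 6...|     {2.01, I003, 2}|
+--------------------+-----------+--------+--------------------+--------------------+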

In [29]:
# Keep the order-level columns and flatten the exploded struct so that its
# fields (amount, item_id, qty) become top-level columns.
df_exploded_order_items = df_exploded.select(
    "contact",
    "customer_id",
    "order_id",
    "exploded_order_items.*",
)
df_exploded_order_items.printSchema()
df_exploded_order_items.show()

root
 |-- contact: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- item_id: string (nullable = true)
 |-- qty: long (nullable = true)

+--------------------+-----------+--------+------+-------+---+
|             contact|customer_id|order_id|amount|item_id|qty|
+--------------------+-----------+--------+------+-------+---+
|[9000010000, 9000...|       C001|    O101|102.45|   I001|  6|
|[9000010000, 9000...|       C001|    O101|  2.01|   I003|  2|
+--------------------+-----------+--------+------+-------+---+



In [31]:
# Explode Contact Array fields
# Starts from df_exploded (already one row per order line item), so each line
# item is repeated once per contact number.
# NOTE(review): "comtact_exploded" looks like a typo for "contact_exploded";
# left unchanged because it is the column name shown in the recorded output.
df_exploded_contact = df_exploded.withColumn(
    "comtact_exploded",
    explode(df_exploded["contact"]),
)
df_exploded_contact.show()

+--------------------+-----------+--------+--------------------+--------------------+----------------+
|             contact|customer_id|order_id|    order_line_items|exploded_order_items|comtact_exploded|
+--------------------+-----------+--------+--------------------+--------------------+----------------+
|[9000010000, 9000...|       C001|    O101|[{102.45, I001, 6...|   {102.45, I001, 6}|      9000010000|
|[9000010000, 9000...|       C001|    O101|[{102.45, I001, 6...|   {102.45, I001, 6}|      9000010001|
|[9000010000, 9000...|       C001|    O101|[{102.45, I001, 6...|     {2.01, I003, 2}|      9000010000|
|[9000010000, 9000...|       C001|    O101|[{102.45, I001, 6...|     {2.01, I003, 2}|      9000010001|
+--------------------+-----------+--------+--------------------+--------------------+----------------+



In [33]:
# Show the exploded result without the original "contact" array column.
# Nothing is assigned, so df_exploded_contact itself is not rebound.
(
    df_exploded_contact
    .drop("contact")
    .show()
)

ConnectionRefusedError: [Errno 61] Connection refused

In [34]:
# Stop the Spark session created in the first cell. Cells that use `spark`
# after this point need the session cell to be run again first.
spark.stop()

ConnectionRefusedError: [Errno 61] Connection refused