In [None]:
# Install PySpark into the kernel's environment (%pip targets the running kernel).
# NOTE(review): the version is not pinned, so re-runs may pick up a different PySpark release.
%pip install pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# Obtain the notebook's Spark session: reuses an active one if present,
# otherwise starts a new session registered under this application name.
spark = (
    SparkSession.builder
    .appName("DataJsonProcessing")
    .getOrCreate()
)

In [10]:
# Read the JSON Lines input (one JSON document per line).
# For a single multi-line JSON document, use
# spark.read.option("multiLine", "true").json(...) instead.
df = spark.read.json("file:///data/data.jsonl")


# User attributes, one row per input record. `id` is the key that the
# contact and order datasets below reference as `user_id`.
user_information = df.select(
    col("id"),
    col("user_info.name").alias("user_name"),
    col("user_info.age").alias("user_age")
)

# `orders` is an array of structs: explode it into one row per order.
# The owning user's id is carried along as `user_id`; without it the order
# rows could not be joined back to user_information.
# explode() emits no row for a null or empty array, so users without orders
# do not appear in this dataset.
df_orders = df.select(
    col("id").alias("user_id"),
    explode(col("orders")).alias("order")
).select(
    col("user_id"),
    col("order.order_id").alias("order_id"),
    col("order.product").alias("product"),
    col("order.quantity").alias("quantity")
)

# Contact details, one row per input record, keyed by `user_id` so each
# email/phone pair can be linked back to its user.
contact_information = df.select(
    col("id").alias("user_id"),
    col("user_info.contact.email").alias("email"),
    col("user_info.contact.phone").alias("phone")
)

# Write the dataframes to Parquet, replacing any previous output.
# NOTE(review): the input path uses an explicit file:// scheme while these
# output paths rely on the session's default filesystem - confirm both
# resolve to the same filesystem in the target environment.
# Write user_information
user_information.write.mode("overwrite").parquet("/data/user_information")

# Write contact_information
contact_information.write.mode("overwrite").parquet("/data/contact_information")

# Write orders
df_orders.write.mode("overwrite").parquet("/data/orders")



25/07/08 20:06:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/07/08 20:06:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/07/08 20:06:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
25/07/08 20:06:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
25/07/08 20:06:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
25/07/08 20:06:23 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58.46% for 13 writers
25/07/08 20:06:24 WARN MemoryManager: Total allocation exceeds 95.

In [11]:
# Compare the row count of the source DataFrame with the row counts of the
# three Parquet datasets read back from disk.
source_row_count = df.count()
print("Original DataFrame rows count: %d" % source_row_count)

user_row_count = spark.read.parquet("/data/user_information").count()
print("User Information parquet rows count: %d" % user_row_count)

contact_row_count = spark.read.parquet("/data/contact_information").count()
print("Contact Information parquet rows count:  %d" % contact_row_count)

order_row_count = spark.read.parquet("/data/orders").count()
print("Orders parquet file  rows count: %d" % order_row_count)

Original DataFrame rows count: 1000000
User Information parquet rows count: 1000000
Contact Information parquet rows count:  1000000
Orders parquet file  rows count: 2501346
