# Scenario 3
An e-commerce platform receives customer order details from its mobile application in JSON format through a streaming pipeline. The JSON contains **nested fields** such as customer information, payment details, and a list of purchased items. To store and analyse this data efficiently in a data warehouse, the nested structure must be flattened into a tabular format using PySpark, ensuring all relevant attributes are readily accessible for reporting and analytics.

In [0]:
df = spark.read.format("json")\
  .option("inferSchema", True)\
  .option("multiLine", True)\
  .load("/Volumes/pyspark_cata/source/db_volume/jsonData/")

display(df)

customer,delivery_updates,items,order_id,order_timestamp,payment
"List(CUST101, john.doe@example.com, List(Toronto, Canada), John Doe)","List(Order Placed, Packed, Shipped, Out for Delivery)","List(List(ITEM1001, 25.5, Wireless Mouse, 2), List(ITEM1002, 199.75, Mechanical Keyboard, 1))",ORD001,2025-08-15T10:45:30Z,"List(250.75, CAD, Credit Card)"
"List(CUST102, jane.smith@example.com, List(Vancouver, Canada), Jane Smith)","List(Order Placed, Packed, Shipped)","List(List(ITEM1003, 89.99, USB-C Hub, 1))",ORD002,2025-08-15T11:10:15Z,"List(89.99, CAD, PayPal)"


In [0]:
df.schema

StructType([StructField('customer', StructType([StructField('customer_id', StringType(), True), StructField('email', StringType(), True), StructField('location', StructType([StructField('city', StringType(), True), StructField('country', StringType(), True)]), True), StructField('name', StringType(), True)]), True), StructField('delivery_updates', ArrayType(StringType(), True), True), StructField('items', ArrayType(StructType([StructField('item_id', StringType(), True), StructField('price_per_unit', DoubleType(), True), StructField('product_name', StringType(), True), StructField('quantity', LongType(), True)]), True), True), StructField('order_id', StringType(), True), StructField('order_timestamp', StringType(), True), StructField('payment', StructType([StructField('amount', DoubleType(), True), StructField('currency', StringType(), True), StructField('method', StringType(), True)]), True)])

In [0]:
from pyspark.sql.functions import *

In [0]:
df_cust = df.select("customer.customer_id", "customer.email", "customer.location.city", "customer.location.country", "*")\
    .drop("customer")

df_cust_upd = df_cust.withColumn("delivery_updates", explode("delivery_updates"))\
    .withColumn("items", explode("items"))\
    .select("*")

display(df_cust_upd)

customer_id,email,city,country,delivery_updates,items,order_id,order_timestamp,payment
CUST101,john.doe@example.com,Toronto,Canada,Order Placed,"List(ITEM1001, 25.5, Wireless Mouse, 2)",ORD001,2025-08-15T10:45:30Z,"List(250.75, CAD, Credit Card)"
CUST101,john.doe@example.com,Toronto,Canada,Order Placed,"List(ITEM1002, 199.75, Mechanical Keyboard, 1)",ORD001,2025-08-15T10:45:30Z,"List(250.75, CAD, Credit Card)"
CUST101,john.doe@example.com,Toronto,Canada,Packed,"List(ITEM1001, 25.5, Wireless Mouse, 2)",ORD001,2025-08-15T10:45:30Z,"List(250.75, CAD, Credit Card)"
CUST101,john.doe@example.com,Toronto,Canada,Packed,"List(ITEM1002, 199.75, Mechanical Keyboard, 1)",ORD001,2025-08-15T10:45:30Z,"List(250.75, CAD, Credit Card)"
CUST101,john.doe@example.com,Toronto,Canada,Shipped,"List(ITEM1001, 25.5, Wireless Mouse, 2)",ORD001,2025-08-15T10:45:30Z,"List(250.75, CAD, Credit Card)"
CUST101,john.doe@example.com,Toronto,Canada,Shipped,"List(ITEM1002, 199.75, Mechanical Keyboard, 1)",ORD001,2025-08-15T10:45:30Z,"List(250.75, CAD, Credit Card)"
CUST101,john.doe@example.com,Toronto,Canada,Out for Delivery,"List(ITEM1001, 25.5, Wireless Mouse, 2)",ORD001,2025-08-15T10:45:30Z,"List(250.75, CAD, Credit Card)"
CUST101,john.doe@example.com,Toronto,Canada,Out for Delivery,"List(ITEM1002, 199.75, Mechanical Keyboard, 1)",ORD001,2025-08-15T10:45:30Z,"List(250.75, CAD, Credit Card)"
CUST102,jane.smith@example.com,Vancouver,Canada,Order Placed,"List(ITEM1003, 89.99, USB-C Hub, 1)",ORD002,2025-08-15T11:10:15Z,"List(89.99, CAD, PayPal)"
CUST102,jane.smith@example.com,Vancouver,Canada,Packed,"List(ITEM1003, 89.99, USB-C Hub, 1)",ORD002,2025-08-15T11:10:15Z,"List(89.99, CAD, PayPal)"
