In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, sum as _sum, count, when

spark = SparkSession.builder.getOrCreate()

from pyspark.sql import Row
data = [
    Row(OrderID=101, Customer="Ali", Items=[{"Product":"Laptop", "Qty":1}, {"Product":"Mouse", "Qty":2}], Region="Asia", Amount=1200.0),
    Row(OrderID=102, Customer="Zara", Items=[{"Product":"Tablet", "Qty":1}], Region="Europe", Amount=650.0),
    Row(OrderID=103, Customer="Mohan", Items=[{"Product":"Phone", "Qty":2}, {"Product":"Charger", "Qty":1}], Region="Asia", Amount=890.0),
    Row(OrderID=104, Customer="Sara", Items=[{"Product":"Desk", "Qty":1}], Region="US", Amount=450.0)
]

df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)


+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+



In [0]:
# Working with JSON & Nested Fields
# 1. Flatten the Items array using explode() to create one row per product.

from pyspark.sql.functions import explode, col

df_flat = df_sales.withColumn("Item", explode("Items")).withColumn("Product", col("Item.Product")) \
                  .withColumn("Qty", col("Item.Qty")) \
                  .drop("Items", "Item")

df_flat.show(truncate=False)



+-------+--------+------+------+-------+---+
|OrderID|Customer|Region|Amount|Product|Qty|
+-------+--------+------+------+-------+---+
|101    |Ali     |Asia  |1200.0|Laptop |1  |
|101    |Ali     |Asia  |1200.0|Mouse  |2  |
|102    |Zara    |Europe|650.0 |Tablet |1  |
|103    |Mohan   |Asia  |890.0 |Phone  |2  |
|103    |Mohan   |Asia  |890.0 |Charger|1  |
|104    |Sara    |US    |450.0 |Desk   |1  |
+-------+--------+------+------+-------+---+



In [0]:
# 2. Count total quantity sold per product.
df_flat.groupBy("Product").agg(_sum("Qty").alias("TotalQty")).show()


+-------+--------+
|Product|TotalQty|
+-------+--------+
| Laptop|     1.0|
|  Mouse|     2.0|
| Tablet|     1.0|
|  Phone|     2.0|
|Charger|     1.0|
|   Desk|     1.0|
+-------+--------+



In [0]:
# 3. Count number of orders per region.
df_sales.groupBy("Region").agg(count("OrderID").alias("NumberOfOrders")).show()


+------+--------------+
|Region|NumberOfOrders|
+------+--------------+
|  Asia|             2|
|Europe|             1|
|    US|             1|
+------+--------------+



In [0]:
# Using when and otherwise
# 4. Create a new column HighValueOrder :
# "Yes" if Amount > 1000
# "No" otherwise
df_sales_with_flag = df_sales.withColumn(
    "HighValueOrder",
    when(col("Amount") > 1000, "Yes").otherwise("No")
)

df_sales_with_flag.select("OrderID", "Amount", "HighValueOrder").show()


+-------+------+--------------+
|OrderID|Amount|HighValueOrder|
+-------+------+--------------+
|    101|1200.0|           Yes|
|    102| 650.0|            No|
|    103| 890.0|            No|
|    104| 450.0|            No|
+-------+------+--------------+



In [0]:
# 5. Add a column ShippingZone :
# Asia → "Zone A", Europe → "Zone B", US → "Zone C"
df_sales_with_zone = df_sales_with_flag.withColumn(
    "ShippingZone",
    when(col("Region") == "Asia", "Zone A")
    .when(col("Region") == "Europe", "Zone B")
    .when(col("Region") == "US", "Zone C")
    .otherwise("Other")
)

df_sales_with_zone.select("OrderID", "Region", "ShippingZone").show()


+-------+------+------------+
|OrderID|Region|ShippingZone|
+-------+------+------------+
|    101|  Asia|      Zone A|
|    102|Europe|      Zone B|
|    103|  Asia|      Zone A|
|    104|    US|      Zone C|
+-------+------+------------+



In [0]:
# Temporary & Permanent Views
# 6. Register df_sales as a temporary view named sales_view .
df_sales.createOrReplaceTempView("sales_view")

# 7. Write a SQL query to:
# Count orders by Region
spark.sql("""
    SELECT Region, COUNT(*) AS OrderCount
    FROM sales_view
    GROUP BY Region
""").show()

# Find average amount per region
spark.sql("""
    SELECT Region, AVG(Amount) AS AvgAmount
    FROM sales_view
    GROUP BY Region
""").show()

# 8. Create a permanent view using saveAsTable() .
df_sales.write.mode("overwrite").saveAsTable("sales_permanent_view")

+------+----------+
|Region|OrderCount|
+------+----------+
|  Asia|         2|
|Europe|         1|
|    US|         1|
+------+----------+

+------+---------+
|Region|AvgAmount|
+------+---------+
|  Asia|   1045.0|
|Europe|    650.0|
|    US|    450.0|
+------+---------+



In [0]:
# 9. Use SQL to filter all orders with more than 1 item.
spark.sql("""
    SELECT OrderID, Customer, Size(Items) AS NumItems, Region, Amount
    FROM sales_view
    WHERE Size(Items) > 1
""").show()

# 10. Use SQL to extract customer names where Amount > 800.
spark.sql("""
    SELECT Customer, Amount
    FROM sales_view
    WHERE Amount > 800
""").show()


+-------+--------+--------+------+------+
|OrderID|Customer|NumItems|Region|Amount|
+-------+--------+--------+------+------+
|    101|     Ali|       2|  Asia|1200.0|
|    103|   Mohan|       2|  Asia| 890.0|
+-------+--------+--------+------+------+

+--------+------+
|Customer|Amount|
+--------+------+
|     Ali|1200.0|
|   Mohan| 890.0|
+--------+------+



In [0]:
# Saving as Parquet and Reading Again
# 11. Save the exploded product-level DataFrame as a partitioned Parquet file by
# Region .
from pyspark.sql.functions import explode, col

df_flat = df_sales.withColumn("Item", explode("Items")).withColumn("Product", col("Item.Product")).withColumn("Qty", col("Item.Qty")).drop("Items", "Item")

df_flat.write.mode("overwrite").partitionBy("Region").parquet("/tmp/product_sales_by_region")

# 12. Read the parquet back and perform a group-by on Product .

df_read = spark.read.parquet("/tmp/product_sales_by_region")

df_read.groupBy("Product").sum("Qty").withColumnRenamed("sum(Qty)", "TotalQty").show()
