In [0]:
from pyspark.sql import Row
from pyspark.sql.functions import explode, col, when
# Sample orders. Every order carries a nested "Items" list (one dict per line
# item) so that later cells have something to explode.
ORDER_FIELDS = ("OrderID", "Customer", "Items", "Region", "Amount")
order_records = [
    (101, "Ali", [{"Product": "Laptop", "Qty": 1}, {"Product": "Mouse", "Qty": 2}], "Asia", 1200.0),
    (102, "Zara", [{"Product": "Tablet", "Qty": 1}], "Europe", 650.0),
    (103, "Mohan", [{"Product": "Phone", "Qty": 2}, {"Product": "Charger", "Qty": 1}], "Asia", 890.0),
    (104, "Sara", [{"Product": "Desk", "Qty": 1}], "US", 450.0),
]
# Pair each value with its field name and build one Row per order.
data = [Row(**dict(zip(ORDER_FIELDS, record))) for record in order_records]

# No schema is passed, so Spark infers the column types from the rows.
# NOTE(review): the item dicts mix str and int values; a later cell casts
# "Qty" to an integer, so it is presumably not inferred as one - confirm.
df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)


+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+



Working with JSON & Nested Fields

In [0]:
# explode() emits one output row per element of the "Items" array.
order_items = df_sales.withColumn("Item", explode(col("Items")))

# Lift the two nested fields to top-level columns, then discard the nested
# originals so only flat columns remain.
df_exploded = (
    order_items
    .select(
        "*",
        col("Item")["Product"].alias("Product"),
        col("Item")["Qty"].alias("Qty"),
    )
    .drop("Items", "Item")
)

df_exploded.show(truncate=False)


+-------+--------+------+------+-------+---+
|OrderID|Customer|Region|Amount|Product|Qty|
+-------+--------+------+------+-------+---+
|101    |Ali     |Asia  |1200.0|Laptop |1  |
|101    |Ali     |Asia  |1200.0|Mouse  |2  |
|102    |Zara    |Europe|650.0 |Tablet |1  |
|103    |Mohan   |Asia  |890.0 |Phone  |2  |
|103    |Mohan   |Asia  |890.0 |Charger|1  |
|104    |Sara    |US    |450.0 |Desk   |1  |
+-------+--------+------+------+-------+---+



In [0]:
from pyspark.sql.types import IntegerType
# Make "Qty" an integer column before summing it.
qty_as_int = col("Qty").cast(IntegerType())
df_exploded = df_exploded.withColumn("Qty", qty_as_int)

# Total quantity per product; the dict form of agg() names its result
# "sum(Qty)", which is then renamed.
qty_per_product = (
    df_exploded
    .groupBy("Product")
    .agg({"Qty": "sum"})
    .withColumnRenamed("sum(Qty)", "TotalQty")
)
qty_per_product.show()

# Number of orders per region, counted on the un-exploded frame so each order
# is counted once.
orders_per_region = df_sales.groupBy("Region").count().toDF("Region", "OrderCount")
orders_per_region.show()


+-------+--------+
|Product|TotalQty|
+-------+--------+
| Laptop|       1|
|  Mouse|       2|
| Tablet|       1|
|  Phone|       2|
|Charger|       1|
|   Desk|       1|
+-------+--------+

+------+----------+
|Region|OrderCount|
+------+----------+
|  Asia|         2|
|Europe|         1|
|    US|         1|
+------+----------+



Using when and otherwise

In [0]:
# "Yes" only when Amount is strictly greater than 1000; everything else
# falls through to "No".
high_value_flag = when(col("Amount") > 1000, "Yes").otherwise("No")

# when() branches are tested top to bottom; regions not listed here end up as
# "Unknown".
region = col("Region")
shipping_zone = (
    when(region == "Asia", "Zone A")
    .when(region == "Europe", "Zone B")
    .when(region == "US", "Zone C")
    .otherwise("Unknown")
)

df_sales = (
    df_sales
    .withColumn("HighValueOrder", high_value_flag)
    .withColumn("ShippingZone", shipping_zone)
)

df_sales.select("OrderID", "Amount", "HighValueOrder", "ShippingZone").show()


+-------+------+--------------+------------+
|OrderID|Amount|HighValueOrder|ShippingZone|
+-------+------+--------------+------------+
|    101|1200.0|           Yes|      Zone A|
|    102| 650.0|            No|      Zone B|
|    103| 890.0|            No|      Zone A|
|    104| 450.0|            No|      Zone C|
+-------+------+--------------+------------+



Temporary Views & Permanent Tables

In [0]:
# Register the DataFrame under a name that SQL can reference; the view is
# created or replaced on every run of this cell.
df_sales.createOrReplaceTempView("sales_view")

# Order count and average amount per region, expressed in SQL.
region_summary_sql = """
SELECT Region, COUNT(*) AS OrderCount, AVG(Amount) AS AvgAmount
FROM sales_view
GROUP BY Region
"""
region_summary = spark.sql(region_summary_sql)
region_summary.show()

# saveAsTable() writes the data out as a table (despite the "_view" suffix in
# its name); "overwrite" replaces any previous contents.
sales_writer = df_sales.write.mode("overwrite")
sales_writer.saveAsTable("sales_permanent_view")

saved_sales = spark.sql("SELECT * FROM sales_permanent_view")
saved_sales.show()


+------+----------+---------+
|Region|OrderCount|AvgAmount|
+------+----------+---------+
|  Asia|         2|   1045.0|
|Europe|         1|    650.0|
|    US|         1|    450.0|
+------+----------+---------+

+-------+--------+--------------------+------+------+--------------+------------+
|OrderID|Customer|               Items|Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+--------------------+------+------+--------------+------------+
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|      Zone A|
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|      Zone A|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|            No|      Zone B|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|            No|      Zone C|
+-------+--------+--------------------+------+------+--------------+------------+



SQL Queries via Spark

In [0]:
# Orders whose "Items" array holds more than one element.
multi_item_sql = """
SELECT *
FROM sales_view
WHERE size(Items) > 1
"""
multi_item_orders = spark.sql(multi_item_sql)
multi_item_orders.show(truncate=False)


+-------+--------+--------------------------------------------------------------+------+------+--------------+------------+
|OrderID|Customer|Items                                                         |Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+--------------------------------------------------------------+------+------+--------------+------------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|Yes           |Zone A      |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |No            |Zone A      |
+-------+--------+--------------------------------------------------------------+------+------+--------------+------------+



In [0]:
# Customers with an order amount strictly above 800.
big_spender_sql = """
SELECT Customer
FROM sales_view
WHERE Amount > 800
"""
big_spenders = spark.sql(big_spender_sql)
big_spenders.show()


+--------+
|Customer|
+--------+
|     Ali|
|   Mohan|
+--------+



Saving as Parquet and Reading Again

In [0]:

# Spark readers/writers address DBFS with the dbfs:/ scheme. The /dbfs/...
# form is the local mount intended for non-Spark file APIs; handing it to
# Spark places the data under dbfs:/dbfs/FileStore/... instead of FileStore.
# The path is defined once so the write and the read cannot drift apart.
SALES_PARQUET_PATH = "dbfs:/FileStore/sales_data_parquet"

# partitionBy("Region") writes one sub-directory per Region value;
# "overwrite" replaces whatever a previous run left at this path.
df_exploded.write.mode("overwrite").partitionBy("Region").parquet(SALES_PARQUET_PATH)

# Read the files back and repeat the per-product total as a round-trip check.
df_loaded = spark.read.parquet(SALES_PARQUET_PATH)
df_loaded.groupBy("Product").sum("Qty").withColumnRenamed("sum(Qty)", "TotalQty").show()


+-------+--------+
|Product|TotalQty|
+-------+--------+
|  Phone|       2|
|Charger|       1|
| Laptop|       1|
|  Mouse|       2|
| Tablet|       1|
|   Desk|       1|
+-------+--------+

