In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Set-1").getOrCreate()

spark 

In [0]:
from pyspark.sql import Row
data = [
Row(OrderID=101, Customer="Ali", Items=[{"Product":"Laptop", "Qty":1},
{"Product":"Mouse", "Qty":2}], Region="Asia", Amount=1200.0),
Row(OrderID=102, Customer="Zara", Items=[{"Product":"Tablet", "Qty":1}],
Region="Europe", Amount=650.0),
Row(OrderID=103, Customer="Mohan", Items=[{"Product":"Phone", "Qty":2},
{"Product":"Charger", "Qty":1}], Region="Asia", Amount=890.0),
Row(OrderID=104, Customer="Sara", Items=[{"Product":"Desk", "Qty":1}],
Region="US", Amount=450.0)
]
df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)

df_sales.show()

+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+

+-------+--------+--------------------+------+------+
|OrderID|Customer|               Items|Region|Amount|
+-------+--------+--------------------+------+------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|
|    102|    Za

**PySpark Exercises – Set 4 (SQL, JSON, AdvancedFunctions)
 Working with JSON & Nested Fields**

In [0]:
# 1. Flatten the Items array using explode() to create one row per product.

from pyspark.sql.functions import explode

df_flat = df_sales.withColumn("Item",explode("Items"))
df_flat = df_flat.select(df_flat["Item.Product"].alias("Product"), df_flat["Item.Qty"].alias("Quantity"))
df_flat.show(truncate=False)

+-------+--------+
|Product|Quantity|
+-------+--------+
|Laptop |1       |
|Mouse  |2       |
|Tablet |1       |
|Phone  |2       |
|Charger|1       |
|Desk   |1       |
+-------+--------+



In [0]:
# 2. Count total quantity sold per product.
from pyspark.sql.functions import sum

df_count = df_flat.groupBy("Product").agg(sum("Quantity").alias("SummedQuantity"))
df_count.show()

+-------+--------------+
|Product|SummedQuantity|
+-------+--------------+
| Laptop|           1.0|
|  Mouse|           2.0|
| Tablet|           1.0|
|  Phone|           2.0|
|Charger|           1.0|
|   Desk|           1.0|
+-------+--------------+



In [0]:
# 3. Count number of orders per region.
from pyspark.sql.functions import countDistinct

df_count = df_sales.groupBy("Region").agg(countDistinct("OrderID").alias("DistinctOrders"))
df_count.show()

+------+--------------+
|Region|DistinctOrders|
+------+--------------+
|Europe|             1|
|    US|             1|
|  Asia|             2|
+------+--------------+



**Using when and otherwise**

In [0]:
# 4. Create a new column HighValueOrder :
# "Yes" if Amount > 1000
# "No" otherwise

from pyspark.sql.functions import when

df_sales = df_sales.withColumn("HighValueOrder",when(df_sales["Amount"] > 1000, "Yes").otherwise("No"))
df_sales.show()

+-------+--------+--------------------+------+------+--------------+
|OrderID|Customer|               Items|Region|Amount|HighValueOrder|
+-------+--------+--------------------+------+------+--------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|            No|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|            No|
+-------+--------+--------------------+------+------+--------------+



In [0]:
# 5. Add a column ShippingZone :
# Asia → "Zone A", Europe → "Zone B", US → "Zone C"
from pyspark.sql.functions import when

df_sales = df_sales.withColumn("ShippingZone",when(df_sales["Region"] == "Asia", "Zone A").when(df_sales["Region"] == "Europe", "Zone B").when(df_sales["Region"] == "US", "Zone C").otherwise("Zone D"))
df_sales.show()

+-------+--------+--------------------+------+------+--------------+------------+
|OrderID|Customer|               Items|Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+--------------------+------+------+--------------+------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|      Zone A|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|            No|      Zone B|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|      Zone A|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|            No|      Zone C|
+-------+--------+--------------------+------+------+--------------+------------+



**Temporary & Permanent Views**

In [0]:
# 6. Register df_sales as a temporary view named sales_view .

df_sales.createOrReplaceTempView("sales_view")

In [0]:
# 7. Write a SQL query to:
# Count orders by Region
# Find average amount per region

spark.sql("""SELECT Region, count(OrderID) AS countorders, avg(Amount) as avgamt from sales_view GROUP BY Region""").show()

+------+-----------+------+
|Region|countorders|avgamt|
+------+-----------+------+
|  Asia|          2|1045.0|
|Europe|          1| 650.0|
|    US|          1| 450.0|
+------+-----------+------+



In [0]:
# 8. Create a permanent view using saveAsTable() .
df_sales.write.saveAsTable("sales_permanent_view")

**SQL Queries via Spark**

In [0]:
# spark.sql("SELECT Region, COUNT(*) as OrderCount FROM sales_view GROUP BYRegion").show()
# 9. Use SQL to filter all orders with more than 1 item.

spark.sql("SELECT Region, COUNT(*) as OrderCount FROM sales_view GROUP BY Region").show()

spark.sql("Select * from sales_view where size(Items) > 1").show()

+------+----------+
|Region|OrderCount|
+------+----------+
|  Asia|         2|
|Europe|         1|
|    US|         1|
+------+----------+

+-------+--------+--------------------+------+------+--------------+------------+
|OrderID|Customer|               Items|Region|Amount|HighValueOrder|ShippingZone|
+-------+--------+--------------------+------+------+--------------+------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|      Zone A|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|      Zone A|
+-------+--------+--------------------+------+------+--------------+------------+



In [0]:
# 10. Use SQL to extract customer names where Amount > 800.

spark.sql("select Customer from sales_view where Amount > 800").show()

+--------+
|Customer|
+--------+
|     Ali|
|   Mohan|
+--------+



**Saving as Parquet and Reading Again**

In [0]:
# 11. Save the exploded product-level DataFrame as a partitioned Parquet file by Region .

# Save to DBFS path, partitioned by Region
df_flat.write.mode("overwrite").parquet("/tmp/product_sales_by_region")

In [0]:
# 12. Read the parquet back and perform a group-by on Product .

df_parquet = spark.read.parquet("/tmp/product_sales_by_region")

from pyspark.sql.functions import sum

df_parquet.groupBy("Product").agg(sum("Quantity").alias("TotalQty")).show()

+-------+--------+
|Product|TotalQty|
+-------+--------+
|  Phone|     2.0|
|Charger|     1.0|
| Laptop|     1.0|
|  Mouse|     2.0|
| Tablet|     1.0|
|   Desk|     1.0|
+-------+--------+

