In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql.window import Window as W

In [0]:
spark = SparkSession.builder.appName("NB-1").getOrCreate()
spark

#**Dataset**


In [0]:
data = [
Row(OrderID=101, Customer="Ali", Items=[{"Product":"Laptop", "Qty":1},
{"Product":"Mouse", "Qty":2}], Region="Asia", Amount=1200.0),
Row(OrderID=102, Customer="Zara", Items=[{"Product":"Tablet", "Qty":1}],
Region="Europe", Amount=650.0),
Row(OrderID=103, Customer="Mohan", Items=[{"Product":"Phone", "Qty":2},
{"Product":"Charger", "Qty":1}], Region="Asia", Amount=890.0),
Row(OrderID=104, Customer="Sara", Items=[{"Product":"Desk", "Qty":1}],
Region="US", Amount=450.0)
]
df_sales = spark.createDataFrame(data)
df_sales.show(truncate=False)

+-------+--------+--------------------------------------------------------------+------+------+
|OrderID|Customer|Items                                                         |Region|Amount|
+-------+--------+--------------------------------------------------------------+------+------+
|101    |Ali     |[{Product -> Laptop, Qty -> 1}, {Product -> Mouse, Qty -> 2}] |Asia  |1200.0|
|102    |Zara    |[{Product -> Tablet, Qty -> 1}]                               |Europe|650.0 |
|103    |Mohan   |[{Product -> Phone, Qty -> 2}, {Product -> Charger, Qty -> 1}]|Asia  |890.0 |
|104    |Sara    |[{Product -> Desk, Qty -> 1}]                                 |US    |450.0 |
+-------+--------+--------------------------------------------------------------+------+------+



#**Working with JSON & Nested Fields**

In [0]:
# 1. Flatten the Items array using explode() to create one row per product.
df_sales_exploded = df_sales.select('*', F.explode("Items").alias("Products")).drop("Items")
df_sales.show()

+-------+--------+--------------------+------+------+
|OrderID|Customer|               Items|Region|Amount|
+-------+--------+--------------------+------+------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|
+-------+--------+--------------------+------+------+



In [0]:
# 2. Count total quantity sold per product.
df_sales_exploded.select("Products.Product", "Products.Qty").groupby("Product").agg(
    F.sum("Qty").alias("TotalQuantity")
).show()

+-------+-------------+
|Product|TotalQuantity|
+-------+-------------+
| Laptop|          1.0|
|  Mouse|          2.0|
| Tablet|          1.0|
|  Phone|          2.0|
|Charger|          1.0|
|   Desk|          1.0|
+-------+-------------+



In [0]:
# 3. Count number of orders per region.
df_sales_exploded.groupby("Region").agg(
    F.countDistinct(df_sales_exploded.OrderID).alias("OrderCount")
).show()

+------+----------+
|Region|OrderCount|
+------+----------+
|Europe|         1|
|    US|         1|
|  Asia|         2|
+------+----------+



#**Using when and otherwise**

In [0]:
# 4. Create a new column HighValueOrder :
# "Yes" if Amount > 1000
# "No" otherwise
df_sales.withColumn("HighOrderValue", F.when(df_sales.Amount > 1000, "Yes").otherwise("No")).show()

+-------+--------+--------------------+------+------+--------------+
|OrderID|Customer|               Items|Region|Amount|HighOrderValue|
+-------+--------+--------------------+------+------+--------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|           Yes|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|            No|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|            No|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|            No|
+-------+--------+--------------------+------+------+--------------+



In [0]:
# 5. Add a column ShippingZone :
# Asia → "Zone A", Europe → "Zone B", US → "Zone C"
df_sales.withColumn("ShippingZone", F.when(df_sales.Region == "Asia", "Zone A").when(df_sales.Region == "Europe", "Zone B").otherwise("Zone C")).show()

+-------+--------+--------------------+------+------+------------+
|OrderID|Customer|               Items|Region|Amount|ShippingZone|
+-------+--------+--------------------+------+------+------------+
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|      Zone A|
|    102|    Zara|[{Product -> Tabl...|Europe| 650.0|      Zone B|
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|      Zone A|
|    104|    Sara|[{Product -> Desk...|    US| 450.0|      Zone C|
+-------+--------+--------------------+------+------+------------+



#**Temporary & Permanent views**

In [0]:
spark.sql("CREATE DATABASE IF NOT EXISTS sales")
spark.sql("USE sales")

DataFrame[]

In [0]:
# 6. Register df_sales as a temporary view named sales_view .
df_sales.createOrReplaceTempView("sales_view")

In [0]:
# 7. Write a SQL query to:
# Count orders by Region
# Find average amount per region
spark.sql("""
          SELECT Region, COUNT(OrderID) AS OrderCount, AVG(Amount) AS AverageAmount FROM sales_view
          GROUP BY Region
          """).show()

+------+----------+-------------+
|Region|OrderCount|AverageAmount|
+------+----------+-------------+
|  Asia|         2|       1045.0|
|Europe|         1|        650.0|
|    US|         1|        450.0|
+------+----------+-------------+



In [0]:
# 8. Create a permanent view using saveAsTable() .
df_sales.write.saveAsTable("sales.df_sales")

#**SQL Queries via Spark**

In [0]:
# 9. Use SQL to filter all orders with more than 1 item.
spark.sql("""
          SELECT * FROM df_sales
          WHERE SIZE(Items) > 1
          """).show()

+-------+--------+--------------------+------+------+
|OrderID|Customer|               Items|Region|Amount|
+-------+--------+--------------------+------+------+
|    103|   Mohan|[{Product -> Phon...|  Asia| 890.0|
|    101|     Ali|[{Product -> Lapt...|  Asia|1200.0|
+-------+--------+--------------------+------+------+



In [0]:
# 10. Use SQL to extract customer names where Amount > 800.
spark.sql("""
          SELECT Customer, Amount FROM df_sales
          WHERE Amount > 800
          """).show()

+--------+------+
|Customer|Amount|
+--------+------+
|   Mohan| 890.0|
|     Ali|1200.0|
+--------+------+



#**Saving as Parquet and Reading Again**

In [0]:
# 11. Save the exploded product-level DataFrame as a partitioned Parquet file by Region .
df_sales_exploded.write.mode("overwrite").parquet("/Workspace/Shared/sales/sales_exploded", partitionBy="Region")

In [0]:
# 12. Read the parquet back and perform a group-by on Product .
df_parquet_sales = spark.read.parquet("/Workspace/Shared/sales/sales_exploded")
df_parquet_sales.groupBy("Products.Product").agg(
    F.countDistinct("Products.Product").alias("Product"),
    F.sum("Products.Qty").alias("TotalQty")
).show()

+-------+-------+--------+
|Product|Product|TotalQty|
+-------+-------+--------+
|  Phone|      1|     2.0|
|Charger|      1|     1.0|
| Laptop|      1|     1.0|
|  Mouse|      1|     2.0|
| Tablet|      1|     1.0|
|   Desk|      1|     1.0|
+-------+-------+--------+

