In [22]:

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import explode, col, when

spark = SparkSession.builder.appName("SalesAnalysis").getOrCreate()

data = [
    Row(OrderID=101, Customer="Ali", Items=[{"Product":"Laptop", "Qty":1}, {"Product":"Mouse", "Qty":2}], Region="Asia", Amount=1200.0),
    Row(OrderID=102, Customer="Zara", Items=[{"Product":"Tablet", "Qty":1}], Region="Europe", Amount=650.0),
    Row(OrderID=103, Customer="Mohan", Items=[{"Product":"Phone", "Qty":2}, {"Product":"Charger", "Qty":1}], Region="Asia", Amount=890.0),
    Row(OrderID=104, Customer="Sara", Items=[{"Product":"Desk", "Qty":1}], Region="US", Amount=450.0)
]
df_sales = spark.createDataFrame(data)

In [23]:
# Q1: Flatten the Items array using explode()
# Step 1: Explode and extract nested fields
df_exploded = df_sales.withColumn("Item", explode("Items")) \
    .withColumn("Product", col("Item.Product")) \
    .withColumn("Qty", col("Item.Qty")) \
    .select("OrderID", "Customer", "Region", "Amount", "Product", "Qty")

# Step 2: Confirm structure
df_exploded.printSchema()
df_exploded.show(truncate=False)


root
 |-- OrderID: long (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Product: string (nullable = true)
 |-- Qty: string (nullable = true)

+-------+--------+------+------+-------+---+
|OrderID|Customer|Region|Amount|Product|Qty|
+-------+--------+------+------+-------+---+
|101    |Ali     |Asia  |1200.0|Laptop |1  |
|101    |Ali     |Asia  |1200.0|Mouse  |2  |
|102    |Zara    |Europe|650.0 |Tablet |1  |
|103    |Mohan   |Asia  |890.0 |Phone  |2  |
|103    |Mohan   |Asia  |890.0 |Charger|1  |
|104    |Sara    |US    |450.0 |Desk   |1  |
+-------+--------+------+------+-------+---+



In [24]:
# Q2: Total quantity per product
from pyspark.sql.types import IntegerType

# Cast Qty to Integer
df_exploded = df_exploded.withColumn("Qty", col("Qty").cast(IntegerType()))

# Now group and sum
df_exploded.groupBy("Product").sum("Qty").withColumnRenamed("sum(Qty)", "TotalQty").show()


+-------+--------+
|Product|TotalQty|
+-------+--------+
| Laptop|       1|
|  Mouse|       2|
| Tablet|       1|
|   Desk|       1|
|  Phone|       2|
|Charger|       1|
+-------+--------+



In [25]:
# Q3: Count number of orders per region
df_sales.groupBy("Region").count().withColumnRenamed("count", "OrderCount").show()

+------+----------+
|Region|OrderCount|
+------+----------+
|Europe|         1|
|  Asia|         2|
|    US|         1|
+------+----------+



In [26]:
# Q4: Add column HighValueOrder
df_sales.withColumn("HighValueOrder", when(col("Amount") > 1000, "Yes").otherwise("No")) \
        .select("OrderID", "Amount", "HighValueOrder").show()

+-------+------+--------------+
|OrderID|Amount|HighValueOrder|
+-------+------+--------------+
|    101|1200.0|           Yes|
|    102| 650.0|            No|
|    103| 890.0|            No|
|    104| 450.0|            No|
+-------+------+--------------+



In [27]:
# Q5: Add column ShippingZone
df_sales.withColumn("ShippingZone",
                    when(col("Region") == "Asia", "Zone A")
                    .when(col("Region") == "Europe", "Zone B")
                    .when(col("Region") == "US", "Zone C")) \
        .select("OrderID", "Region", "ShippingZone").show()

+-------+------+------------+
|OrderID|Region|ShippingZone|
+-------+------+------------+
|    101|  Asia|      Zone A|
|    102|Europe|      Zone B|
|    103|  Asia|      Zone A|
|    104|    US|      Zone C|
+-------+------+------------+



In [28]:
# Q6: Register as Temporary View
df_sales.createOrReplaceTempView("sales_view")

In [29]:
# Q7: SQL query - Count orders, Avg amount by region
spark.sql("""
    SELECT Region, COUNT(*) AS OrderCount, AVG(Amount) AS AvgAmount
    FROM sales_view
    GROUP BY Region
""").show()

+------+----------+---------+
|Region|OrderCount|AvgAmount|
+------+----------+---------+
|Europe|         1|    650.0|
|  Asia|         2|   1045.0|
|    US|         1|    450.0|
+------+----------+---------+



In [30]:
# Q8: Save as permanent view (optional if Hive is enabled)

df_sales.write.mode("overwrite").saveAsTable("permanent_sales")


In [31]:
# Q9: SQL to filter all orders with more than 1 item
spark.sql("""
    SELECT OrderID, Customer, size(Items) as NumItems
    FROM sales_view
    WHERE size(Items) > 1
""").show()

+-------+--------+--------+
|OrderID|Customer|NumItems|
+-------+--------+--------+
|    101|     Ali|       2|
|    103|   Mohan|       2|
+-------+--------+--------+



In [32]:
# Q10: SQL to extract customer names where Amount > 800
spark.sql("""
    SELECT Customer, Amount
    FROM sales_view
    WHERE Amount > 800
""").show()

+--------+------+
|Customer|Amount|
+--------+------+
|     Ali|1200.0|
|   Mohan| 890.0|
+--------+------+



In [33]:
# Q11: Save exploded DataFrame as partitioned Parquet file
df_exploded.write.mode("overwrite").partitionBy("Region").parquet("/tmp/sales_partitioned")

In [34]:
# Q12: Read back the Parquet and group by Product
df_parquet = spark.read.parquet("/tmp/sales_partitioned")
df_parquet.groupBy("Product").sum("Qty").withColumnRenamed("sum(Qty)", "TotalQty").show()

+-------+--------+
|Product|TotalQty|
+-------+--------+
|  Phone|       2|
| Laptop|       1|
|Charger|       1|
|  Mouse|       2|
|   Desk|       1|
| Tablet|       1|
+-------+--------+

