# Spark SQL Exercise Set – Product Orders Analytics
Dataset Theme: E-Commerce Orders

Preparation Instructions

In [1]:
# 1. Create a PySpark DataFrame with the following schema:
from pyspark.sql import SparkSession, Row

# Start Spark session
spark = SparkSession.builder.appName("ProductOrdersAnalytics").getOrCreate()

# 2. Sample at least 12 rows across multiple categories:
# "Electronics" , "Clothing" , "Furniture" , "Books"
data = [
    Row(OrderID=1, CustomerName="Alice", Product="Laptop", Category="Electronics", Quantity=2, UnitPrice=50000, OrderDate="2023-01-05"),
    Row(OrderID=2, CustomerName="Bob", Product="Shirt", Category="Clothing", Quantity=3, UnitPrice=1200, OrderDate="2023-01-12"),
    Row(OrderID=3, CustomerName="Charlie", Product="Bookshelf", Category="Furniture", Quantity=1, UnitPrice=7000, OrderDate="2023-02-10"),
    Row(OrderID=4, CustomerName="Diana", Product="Headphones", Category="Electronics", Quantity=1, UnitPrice=3000, OrderDate="2023-03-15"),
    Row(OrderID=5, CustomerName="Eva", Product="Notebook", Category="Books", Quantity=5, UnitPrice=200, OrderDate="2023-01-20"),
    Row(OrderID=6, CustomerName="Frank", Product="Sofa", Category="Furniture", Quantity=2, UnitPrice=25000, OrderDate="2023-02-25"),
    Row(OrderID=7, CustomerName="George", Product="Camera", Category="Electronics", Quantity=2, UnitPrice=40000, OrderDate="2023-04-01"),
    Row(OrderID=8, CustomerName="Hannah", Product="Dress", Category="Clothing", Quantity=1, UnitPrice=3000, OrderDate="2023-03-09"),
    Row(OrderID=9, CustomerName="Ivy", Product="Textbook", Category="Books", Quantity=2, UnitPrice=800, OrderDate="2023-01-11"),
    Row(OrderID=10, CustomerName="Jack", Product="Dining Table", Category="Furniture", Quantity=1, UnitPrice=15000, OrderDate="2023-03-22"),
    Row(OrderID=11, CustomerName="Karan", Product="Jeans", Category="Clothing", Quantity=4, UnitPrice=1500, OrderDate="2023-04-13"),
    Row(OrderID=12, CustomerName="Liam", Product="Smartphone", Category="Electronics", Quantity=1, UnitPrice=60000, OrderDate="2023-02-05")
]

# Create DataFrame
df = spark.createDataFrame(data)

# 3. Create:
# A local temporary view: "orders_local"
# A global temporary view: "orders_global"
df.createOrReplaceTempView("orders_local")
df.createOrReplaceGlobalTempView("orders_global")


# Part A: Local View – orders_local

1. List all orders placed for "Electronics" with a Quantity of 2 or more.
2. Calculate TotalAmount (Quantity × UnitPrice) for each order.
3. Show the total number of orders per Category .
4. List orders placed in "January 2023" only.
5. Show the average UnitPrice per category.
6. Find the order with the highest total amount.
7. Drop the local view and try querying it again.

In [2]:
# 1. List all orders placed for "Electronics" with a Quantity of 2 or more.
spark.sql("""
SELECT * FROM orders_local
WHERE Category = 'Electronics' AND Quantity >= 2
""").show()

+-------+------------+-------+-----------+--------+---------+----------+
|OrderID|CustomerName|Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-------+-----------+--------+---------+----------+
|      1|       Alice| Laptop|Electronics|       2|    50000|2023-01-05|
|      7|      George| Camera|Electronics|       2|    40000|2023-04-01|
+-------+------------+-------+-----------+--------+---------+----------+



In [3]:
# 2. Calculate TotalAmount (Quantity × UnitPrice) for each order.
spark.sql("""
SELECT *, (Quantity * UnitPrice) AS TotalAmount
FROM orders_local
""").show()

+-------+------------+------------+-----------+--------+---------+----------+-----------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+------------+-----------+--------+---------+----------+-----------+
|      1|       Alice|      Laptop|Electronics|       2|    50000|2023-01-05|     100000|
|      2|         Bob|       Shirt|   Clothing|       3|     1200|2023-01-12|       3600|
|      3|     Charlie|   Bookshelf|  Furniture|       1|     7000|2023-02-10|       7000|
|      4|       Diana|  Headphones|Electronics|       1|     3000|2023-03-15|       3000|
|      5|         Eva|    Notebook|      Books|       5|      200|2023-01-20|       1000|
|      6|       Frank|        Sofa|  Furniture|       2|    25000|2023-02-25|      50000|
|      7|      George|      Camera|Electronics|       2|    40000|2023-04-01|      80000|
|      8|      Hannah|       Dress|   Clothing|       1|     3000|2023-03-09|       3000|
|      9| 

In [4]:
# 3. Show the total number of orders per Category .
spark.sql("""
SELECT Category, COUNT(*) AS TotalOrders
FROM orders_local
GROUP BY Category
""").show()
#

+-----------+-----------+
|   Category|TotalOrders|
+-----------+-----------+
|Electronics|          4|
|   Clothing|          3|
|      Books|          2|
|  Furniture|          3|
+-----------+-----------+



In [5]:
# 4. List orders placed in "January 2023" only.
spark.sql("""
SELECT * FROM orders_local
WHERE OrderDate LIKE '2023-01%'
""").show()

+-------+------------+--------+-----------+--------+---------+----------+
|OrderID|CustomerName| Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+--------+-----------+--------+---------+----------+
|      1|       Alice|  Laptop|Electronics|       2|    50000|2023-01-05|
|      2|         Bob|   Shirt|   Clothing|       3|     1200|2023-01-12|
|      5|         Eva|Notebook|      Books|       5|      200|2023-01-20|
|      9|         Ivy|Textbook|      Books|       2|      800|2023-01-11|
+-------+------------+--------+-----------+--------+---------+----------+



In [6]:
# 5. Show the average UnitPrice per category.
spark.sql("""
SELECT Category, AVG(UnitPrice) AS AvgUnitPrice
FROM orders_local
GROUP BY Category
""").show()

+-----------+------------------+
|   Category|      AvgUnitPrice|
+-----------+------------------+
|Electronics|           38250.0|
|   Clothing|            1900.0|
|      Books|             500.0|
|  Furniture|15666.666666666666|
+-----------+------------------+



In [7]:
# 6. Find the order with the highest total amount.
spark.sql("""
SELECT *, (Quantity * UnitPrice) AS TotalAmount
FROM orders_local
ORDER BY TotalAmount DESC
LIMIT 1
""").show()

+-------+------------+-------+-----------+--------+---------+----------+-----------+
|OrderID|CustomerName|Product|   Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+-------+-----------+--------+---------+----------+-----------+
|      1|       Alice| Laptop|Electronics|       2|    50000|2023-01-05|     100000|
+-------+------------+-------+-----------+--------+---------+----------+-----------+



In [8]:
# 7. Drop the local view and try querying it again.
spark.catalog.dropTempView("orders_local")

# Try - querying it again, This will raise AnalysisException
spark.sql("SELECT * FROM orders_local").show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `orders_local` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [orders_local], [], false


# Part B: Global View – orders_global

1. Display all "Furniture" orders with TotalAmount above 10,000.
2. Create a column called DiscountFlag :

    Mark "Yes" if Quantity > 3
    
    Otherwise "No"
3. List customers who ordered more than 1 product type (Hint: use GROUP BY and
HAVING).
4. Count number of orders per month across the dataset.
5. Rank all products by total quantity sold across all orders using a window
function.
6. Run a query using a new SparkSession and the global view.

In [15]:
# 1. Display all "Furniture" orders with TotalAmount above 10,000.
spark.sql("""
SELECT *, (Quantity * UnitPrice) AS TotalAmount
FROM global_temp.orders_global
WHERE Category = 'Furniture' AND (Quantity * UnitPrice) > 10000
""").show()

+-------+------------+------------+---------+--------+---------+----------+-----------+
|OrderID|CustomerName|     Product| Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+------------+---------+--------+---------+----------+-----------+
|      6|       Frank|        Sofa|Furniture|       2|    25000|2023-02-25|      50000|
|     10|        Jack|Dining Table|Furniture|       1|    15000|2023-03-22|      15000|
+-------+------------+------------+---------+--------+---------+----------+-----------+



In [16]:
# 2. Create a column called DiscountFlag :
    # Mark "Yes" if Quantity > 3
    # Otherwise "No"
spark.sql("""
SELECT *,
    CASE
        WHEN Quantity > 3 THEN 'Yes'
        ELSE 'No'
    END AS DiscountFlag
FROM global_temp.orders_global
""").show()


+-------+------------+------------+-----------+--------+---------+----------+------------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|DiscountFlag|
+-------+------------+------------+-----------+--------+---------+----------+------------+
|      1|       Alice|      Laptop|Electronics|       2|    50000|2023-01-05|          No|
|      2|         Bob|       Shirt|   Clothing|       3|     1200|2023-01-12|          No|
|      3|     Charlie|   Bookshelf|  Furniture|       1|     7000|2023-02-10|          No|
|      4|       Diana|  Headphones|Electronics|       1|     3000|2023-03-15|          No|
|      5|         Eva|    Notebook|      Books|       5|      200|2023-01-20|         Yes|
|      6|       Frank|        Sofa|  Furniture|       2|    25000|2023-02-25|          No|
|      7|      George|      Camera|Electronics|       2|    40000|2023-04-01|          No|
|      8|      Hannah|       Dress|   Clothing|       1|     3000|2023-03-09|          No|

In [23]:
# 3. List customers who ordered more than 1 product type (Hint: use GROUP BY and HAVING).
spark.sql("""
CREATE OR REPLACE TEMP VIEW filtered_orders AS
SELECT * FROM global_temp.orders_global
WHERE Category != 'Clothing'
""")
spark.sql("SELECT * FROM filtered_orders").show()

+-------+------------+------------+-----------+--------+---------+----------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+------------+-----------+--------+---------+----------+
|      1|       Alice|      Laptop|Electronics|       2|    50000|2023-01-05|
|      3|     Charlie|   Bookshelf|  Furniture|       1|     7000|2023-02-10|
|      4|       Diana|  Headphones|Electronics|       1|     3000|2023-03-15|
|      5|         Eva|    Notebook|      Books|       5|      200|2023-01-20|
|      6|       Frank|        Sofa|  Furniture|       2|    25000|2023-02-25|
|      7|      George|      Camera|Electronics|       2|    40000|2023-04-01|
|      9|         Ivy|    Textbook|      Books|       2|      800|2023-01-11|
|     10|        Jack|Dining Table|  Furniture|       1|    15000|2023-03-22|
|     12|        Liam|  Smartphone|Electronics|       1|    60000|2023-02-05|
+-------+------------+------------+-----------+--------+--------

In [22]:
# 4. Count number of orders per month across the dataset.
spark.sql("""
SELECT MONTH(OrderDate) AS Month, COUNT(*) AS OrderCount
FROM global_temp.orders_global
GROUP BY MONTH(OrderDate)
ORDER BY MONTH(OrderDate)
""").show()


+-----+----------+
|Month|OrderCount|
+-----+----------+
|    1|         4|
|    2|         3|
|    3|         3|
|    4|         2|
+-----+----------+



In [24]:
# 5. Rank all products by total quantity sold across all orders using a window function.
spark.sql("""
SELECT Product, TotalQuantity,
       RANK() OVER (ORDER BY TotalQuantity DESC) AS Rank
FROM (
    SELECT Product, SUM(Quantity) AS TotalQuantity
    FROM global_temp.orders_global
    GROUP BY Product
)
""").show()


+------------+-------------+----+
|     Product|TotalQuantity|Rank|
+------------+-------------+----+
|    Notebook|            5|   1|
|       Jeans|            4|   2|
|       Shirt|            3|   3|
|      Laptop|            2|   4|
|        Sofa|            2|   4|
|    Textbook|            2|   4|
|      Camera|            2|   4|
|   Bookshelf|            1|   8|
|  Headphones|            1|   8|
|       Dress|            1|   8|
|Dining Table|            1|   8|
|  Smartphone|            1|   8|
+------------+-------------+----+



In [25]:
# 6. Run a query using a new SparkSession and the global view.
from pyspark.sql import SparkSession

# Create new SparkSession
new_spark = SparkSession.builder.appName("NewSession").getOrCreate()

# Access global temp view from new session
new_spark.sql("""
SELECT DISTINCT Category
FROM global_temp.orders_global
""").show()


+-----------+
|   Category|
+-----------+
|Electronics|
|   Clothing|
|      Books|
|  Furniture|
+-----------+



# Bonus Challenges

1. Save a filtered subset (only "Books" category) as a new global temp view.
2. Find the most purchased product per category.
3. Create a view that excludes all "Clothing" orders and call it
"filtered_orders" .

In [26]:
# 1. Save a filtered subset (only "Books" category) as a new global temp view.
spark.sql("""
CREATE OR REPLACE GLOBAL TEMP VIEW books_only AS
SELECT *
FROM global_temp.orders_global
WHERE Category = 'Books'
""")
spark.sql("SELECT * FROM global_temp.books_only").show()

+-------+------------+--------+--------+--------+---------+----------+
|OrderID|CustomerName| Product|Category|Quantity|UnitPrice| OrderDate|
+-------+------------+--------+--------+--------+---------+----------+
|      5|         Eva|Notebook|   Books|       5|      200|2023-01-20|
|      9|         Ivy|Textbook|   Books|       2|      800|2023-01-11|
+-------+------------+--------+--------+--------+---------+----------+



In [30]:
# 2. Find the most purchased product per category.
spark.sql("""
SELECT Category, Product, TotalQty FROM (
    SELECT Category, Product, SUM(Quantity) AS TotalQty,
           RANK() OVER (PARTITION BY Category ORDER BY SUM(Quantity) DESC) AS rk
    FROM global_temp.orders_global
    GROUP BY Category, Product
)
WHERE rk = 1
""").show()

+-----------+--------+--------+
|   Category| Product|TotalQty|
+-----------+--------+--------+
|      Books|Notebook|       5|
|   Clothing|   Jeans|       4|
|Electronics|  Laptop|       2|
|Electronics|  Camera|       2|
|  Furniture|    Sofa|       2|
+-----------+--------+--------+



In [31]:
# 3. Create a view that excludes all "Clothing" orders and call it "filtered_orders" .
spark.sql("""
CREATE OR REPLACE TEMP VIEW filtered_orders AS
SELECT *
FROM global_temp.orders_global
WHERE Category != 'Clothing'
""")

spark.sql("SELECT * FROM filtered_orders").show()

+-------+------------+------------+-----------+--------+---------+----------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+------------+-----------+--------+---------+----------+
|      1|       Alice|      Laptop|Electronics|       2|    50000|2023-01-05|
|      3|     Charlie|   Bookshelf|  Furniture|       1|     7000|2023-02-10|
|      4|       Diana|  Headphones|Electronics|       1|     3000|2023-03-15|
|      5|         Eva|    Notebook|      Books|       5|      200|2023-01-20|
|      6|       Frank|        Sofa|  Furniture|       2|    25000|2023-02-25|
|      7|      George|      Camera|Electronics|       2|    40000|2023-04-01|
|      9|         Ivy|    Textbook|      Books|       2|      800|2023-01-11|
|     10|        Jack|Dining Table|  Furniture|       1|    15000|2023-03-22|
|     12|        Liam|  Smartphone|Electronics|       1|    60000|2023-02-05|
+-------+------------+------------+-----------+--------+--------