#Spark SQL Exercise Set – Product Orders Analytics

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("ProductordersAnalytics").getOrCreate()

In [23]:
from pyspark.sql import Row
data=[
    Row(OrderID=1, CustomerName="Rohit", Product="Laptop", Category="Electronics", Quantity=2, UnitPrice=50000, OrderDate="2023-01-15"),
    Row(OrderID=2, CustomerName="Sharmin", Product="T-Shirt", Category="Clothing", Quantity=3, UnitPrice=800, OrderDate="2023-02-10"),
    Row(OrderID=3, CustomerName="Aditi", Product="Chair", Category="Furniture", Quantity=1, UnitPrice=3000, OrderDate="2023-03-05"),
    Row(OrderID=4, CustomerName="zara", Product="Bookshelf", Category="Furniture", Quantity=2, UnitPrice=7000, OrderDate="2023-01-20"),
    Row(OrderID=5, CustomerName="Amuya", Product="Smartphone", Category="Electronics", Quantity=1, UnitPrice=25000, OrderDate="2023-01-02"),
    Row(OrderID=6, CustomerName="Alia", Product="Jeans", Category="Clothing", Quantity=2, UnitPrice=1500, OrderDate="2023-03-11"),
    Row(OrderID=7, CustomerName="Kareena", Product="Notebook", Category="Books", Quantity=5, UnitPrice=200, OrderDate="2023-02-22"),
    Row(OrderID=8, CustomerName="Nandhitha", Product="Table", Category="Furniture", Quantity=1, UnitPrice=5500, OrderDate="2023-01-08"),
    Row(OrderID=9, CustomerName="Inder", Product="Tablet", Category="Electronics", Quantity=2, UnitPrice=30000, OrderDate="2023-03-03"),
    Row(OrderID=10, CustomerName="Emily", Product="Dress", Category="Clothing", Quantity=4, UnitPrice=1200, OrderDate="2023-01-25"),
    Row(OrderID=11, CustomerName="Rohit", Product="Python Book", Category="Books", Quantity=1, UnitPrice=900, OrderDate="2023-02-05"),
    Row(OrderID=12, CustomerName="Aditi", Product="Monitor", Category="Electronics", Quantity=1, UnitPrice=18000, OrderDate="2023-02-18"),
]

df = spark.createDataFrame(data)
df.createOrReplaceTempView("orders_local")
df.createOrReplaceGlobalTempView("orders_global")


#Part A: Local View – orders_local

In [24]:
#1. List all orders placed for "Electronics" with a Quantity of 2 or more.
spark.sql("select* from orders_local where Category = 'Electronics' and Quantity >= 2").show()
#2. Calculate TotalAmount (Quantity × UnitPrice) for each order.
spark.sql("select OrderID, CustomerName, Product, Category, Quantity, UnitPrice, Quantity*UnitPrice as TotalAmount from orders_local").show()
#3. Show the total number of orders per Category .
spark.sql("select Category, count(*) as TotalOrders from orders_local group by Category").show()


+-------+------------+-------+-----------+--------+---------+----------+
|OrderID|CustomerName|Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-------+-----------+--------+---------+----------+
|      1|       Rohit| Laptop|Electronics|       2|    50000|2023-01-15|
|      9|       Inder| Tablet|Electronics|       2|    30000|2023-03-03|
+-------+------------+-------+-----------+--------+---------+----------+

+-------+------------+-----------+-----------+--------+---------+-----------+
|OrderID|CustomerName|    Product|   Category|Quantity|UnitPrice|TotalAmount|
+-------+------------+-----------+-----------+--------+---------+-----------+
|      1|       Rohit|     Laptop|Electronics|       2|    50000|     100000|
|      2|     Sharmin|    T-Shirt|   Clothing|       3|      800|       2400|
|      3|       Aditi|      Chair|  Furniture|       1|     3000|       3000|
|      4|        zara|  Bookshelf|  Furniture|       2|     7000|      14000|
|      5|      

In [25]:
#4. List orders placed in "January 2023" only.
spark.sql("select*from orders_local where OrderDate like '2023-01%'").show()
#5. Show the average UnitPrice per category.
spark.sql("select Category, avg(UnitPrice) as AvgUnitPrice from orders_local group by Category").show()
#6. Find the order with the highest total amount.
spark.sql("select OrderID, CustomerName, Product, Category, Quantity, UnitPrice, Quantity*UnitPrice as TotalAmount from orders_local order by TotalAmount desc limit 1").show()


+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|      1|       Rohit|    Laptop|Electronics|       2|    50000|2023-01-15|
|      4|        zara| Bookshelf|  Furniture|       2|     7000|2023-01-20|
|      5|       Amuya|Smartphone|Electronics|       1|    25000|2023-01-02|
|      8|   Nandhitha|     Table|  Furniture|       1|     5500|2023-01-08|
|     10|       Emily|     Dress|   Clothing|       4|     1200|2023-01-25|
+-------+------------+----------+-----------+--------+---------+----------+

+-----------+------------------+
|   Category|      AvgUnitPrice|
+-----------+------------------+
|Electronics|           30750.0|
|   Clothing|1166.6666666666667|
|  Furniture| 5166.666666666667|
|      Books|             550.0|
+-----------+------------------+

+-------+------------+-------+-----------+--------

In [None]:
#7. Drop the local view and try querying it again.
spark.sql("drop view orders_local")
spark.sql("select*from orders_local").show()


#Part B: Global View – orders_global

In [26]:
#1. Display all "Furniture" orders with TotalAmount above 10,000.
spark.sql("select OrderID, CustomerName, Product, Category, Quantity, UnitPrice, Quantity * UnitPrice AS TotalAmount from global_temp.orders_global where Category = 'Furniture' AND Quantity * UnitPrice > 10000").show()



+-------+------------+---------+---------+--------+---------+-----------+
|OrderID|CustomerName|  Product| Category|Quantity|UnitPrice|TotalAmount|
+-------+------------+---------+---------+--------+---------+-----------+
|      4|        zara|Bookshelf|Furniture|       2|     7000|      14000|
+-------+------------+---------+---------+--------+---------+-----------+



In [27]:
#2. Create a column called DiscountFlag :
# Mark "Yes" if Quantity > 3
#Otherwise "No"
spark.sql("select OrderID, CustomerName, Product, Category, Quantity, UnitPrice, Quantity*UnitPrice as TotalAmount, if(Quantity>3,'Yes','No') as DiscountFlag from global_temp.orders_global").show()

+-------+------------+-----------+-----------+--------+---------+-----------+------------+
|OrderID|CustomerName|    Product|   Category|Quantity|UnitPrice|TotalAmount|DiscountFlag|
+-------+------------+-----------+-----------+--------+---------+-----------+------------+
|      1|       Rohit|     Laptop|Electronics|       2|    50000|     100000|          No|
|      2|     Sharmin|    T-Shirt|   Clothing|       3|      800|       2400|          No|
|      3|       Aditi|      Chair|  Furniture|       1|     3000|       3000|          No|
|      4|        zara|  Bookshelf|  Furniture|       2|     7000|      14000|          No|
|      5|       Amuya| Smartphone|Electronics|       1|    25000|      25000|          No|
|      6|        Alia|      Jeans|   Clothing|       2|     1500|       3000|          No|
|      7|     Kareena|   Notebook|      Books|       5|      200|       1000|         Yes|
|      8|   Nandhitha|      Table|  Furniture|       1|     5500|       5500|          No|

In [28]:
#3. List customers who ordered more than 1 product type (Hint: use GROUP BY and HAVING).
spark.sql("select CustomerName, count(distinct Product) as ProductTypes from global_temp.orders_global group by CustomerName having ProductTypes>1").show()


+------------+------------+
|CustomerName|ProductTypes|
+------------+------------+
|       Aditi|           2|
|       Rohit|           2|
+------------+------------+



In [29]:
#4. Count number of orders per month across the dataset.
spark.sql("select month(OrderDate) as Month, count(*) as TotalOrders from global_temp.orders_global group by Month").show()
#5. Rank all products by total quantity sold across all orders using a window function.
spark.sql("select Product, sum(Quantity) as TotalQuantity, rank() over (order by sum(Quantity) desc) as Ranking from global_temp.orders_global group by Product").show()
#6. Run a query using a new SparkSession and the global view.
spark.sql("select*from global_temp.orders_global").show()

+-----+-----------+
|Month|TotalOrders|
+-----+-----------+
|    1|          5|
|    3|          3|
|    2|          4|
+-----+-----------+

+-----------+-------------+-------+
|    Product|TotalQuantity|Ranking|
+-----------+-------------+-------+
|   Notebook|            5|      1|
|      Dress|            4|      2|
|    T-Shirt|            3|      3|
|     Laptop|            2|      4|
|  Bookshelf|            2|      4|
|      Jeans|            2|      4|
|     Tablet|            2|      4|
|      Chair|            1|      8|
| Smartphone|            1|      8|
|Python Book|            1|      8|
|      Table|            1|      8|
|    Monitor|            1|      8|
+-----------+-------------+-------+

+-------+------------+-----------+-----------+--------+---------+----------+
|OrderID|CustomerName|    Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-----------+-----------+--------+---------+----------+
|      1|       Rohit|     Laptop|Electronics|     

#Bonus Challenges

In [30]:
#1. Save a filtered subset (only "Books" category) as a new global temp view.
spark.sql("create Or Replace global temp view filtered_orders as select * from global_temp.orders_global where Category='Books'")
spark.sql("select *from global_temp.filtered_orders").show()

+-------+------------+-----------+--------+--------+---------+----------+
|OrderID|CustomerName|    Product|Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-----------+--------+--------+---------+----------+
|      7|     Kareena|   Notebook|   Books|       5|      200|2023-02-22|
|     11|       Rohit|Python Book|   Books|       1|      900|2023-02-05|
+-------+------------+-----------+--------+--------+---------+----------+



In [31]:
#2. Find the most purchased product per category.
spark.sql("select Category, Product, count(*) as PurchaseCount from global_temp.orders_global group by Category, Product order by PurchaseCount desc").show()

+-----------+-----------+-------------+
|   Category|    Product|PurchaseCount|
+-----------+-----------+-------------+
|  Furniture|  Bookshelf|            1|
|  Furniture|      Chair|            1|
|Electronics|     Laptop|            1|
|Electronics| Smartphone|            1|
|   Clothing|      Jeans|            1|
|   Clothing|    T-Shirt|            1|
|   Clothing|      Dress|            1|
|Electronics|     Tablet|            1|
|Electronics|    Monitor|            1|
|  Furniture|      Table|            1|
|      Books|Python Book|            1|
|      Books|   Notebook|            1|
+-----------+-----------+-------------+



In [33]:
#3. Create a view that excludes all "Clothing" orders and call it "filtered_orders" .
spark.sql("create Or Replace global temp view filtered_orders as select * from global_temp.orders_global where Category!='Clothing'")
spark.sql("select *from global_temp.filtered_orders").show()

+-------+------------+-----------+-----------+--------+---------+----------+
|OrderID|CustomerName|    Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-----------+-----------+--------+---------+----------+
|      1|       Rohit|     Laptop|Electronics|       2|    50000|2023-01-15|
|      3|       Aditi|      Chair|  Furniture|       1|     3000|2023-03-05|
|      4|        zara|  Bookshelf|  Furniture|       2|     7000|2023-01-20|
|      5|       Amuya| Smartphone|Electronics|       1|    25000|2023-01-02|
|      7|     Kareena|   Notebook|      Books|       5|      200|2023-02-22|
|      8|   Nandhitha|      Table|  Furniture|       1|     5500|2023-01-08|
|      9|       Inder|     Tablet|Electronics|       2|    30000|2023-03-03|
|     11|       Rohit|Python Book|      Books|       1|      900|2023-02-05|
|     12|       Aditi|    Monitor|Electronics|       1|    18000|2023-02-18|
+-------+------------+-----------+-----------+--------+---------+----------+