In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.utils import AnalysisException

# Notebook-wide Spark entry point. getOrCreate() either starts a session with
# the application name below or returns one that is already running.
spark = (
    SparkSession.builder
    .appName("ProductOrdersAnalytics")
    .getOrCreate()
)

# Preparation Instructions
1. Create a PySpark DataFrame with the following schema:
OrderID (int)
CustomerName (string)
Product (string)
Category (string)
Quantity (int)
UnitPrice (int)
OrderDate (string in YYYY-MM-DD format)
2. Sample at least 12 rows across multiple categories:
"Electronics", "Clothing", "Furniture", "Books"
3. Create:
A local temporary view: "orders_local"
A global temporary view: "orders_global"

In [2]:
# Sample orders: 12 rows covering Electronics, Clothing, Furniture and Books.
# Tuple layout: (OrderID, CustomerName, Product, Category, Quantity, UnitPrice, OrderDate)
data = [
    (101, "Ravi", "Laptop", "Electronics", 2, 55000, "2023-01-15"),
    (102, "Sneha", "T-Shirt", "Clothing", 3, 800, "2023-01-20"),
    (103, "Kabir", "Bookshelf", "Furniture", 1, 12000, "2023-02-05"),
    (104, "Anita", "Novel", "Books", 4, 500, "2023-02-10"),
    (105, "Divya", "Smartphone", "Electronics", 1, 30000, "2023-03-01"),
    (106, "Ravi", "Dining Table", "Furniture", 2, 25000, "2023-03-15"),
    (107, "Sneha", "Jeans", "Clothing", 2, 1500, "2023-01-25"),
    (108, "Kabir", "Tablet", "Electronics", 3, 20000, "2023-02-20"),
    (109, "Anita", "Textbook", "Books", 2, 1500, "2023-03-10"),
    (110, "Divya", "Chair", "Furniture", 5, 4000, "2023-01-18"),
    (111, "Ravi", "Shirt", "Clothing", 4, 1200, "2023-03-25"),
    (112, "Sneha", "Camera", "Electronics", 2, 35000, "2023-02-28")
]

# (column name, Spark SQL type) pairs, in the same order as the tuples above.
schema_fields = [
    ("OrderID", "INT"),
    ("CustomerName", "STRING"),
    ("Product", "STRING"),
    ("Category", "STRING"),
    ("Quantity", "INT"),
    ("UnitPrice", "INT"),
    ("OrderDate", "STRING"),  # kept as text in YYYY-MM-DD format
]

# Plain list of column names, retained for any cell that refers to `columns`.
columns = [name for name, _ in schema_fields]

# Pass an explicit DDL schema rather than bare column names: with names only,
# Spark infers Python ints as bigint (LongType), whereas the required schema
# declares OrderID, Quantity and UnitPrice as int.
orders_schema = ", ".join(f"{name} {dtype}" for name, dtype in schema_fields)

df = spark.createDataFrame(data, schema=orders_schema)

In [3]:
# Register the orders DataFrame as the temp view "orders_local"; the Part A
# queries read it with a bare `FROM orders_local`.
df.createOrReplaceTempView("orders_local")
# Register the same DataFrame as the global temp view "orders_global"; later
# cells reference it through the `global_temp.` prefix.
df.createOrReplaceGlobalTempView("orders_global")

# Part A: Local View – orders_local
1. List all orders placed for "Electronics" with a Quantity of 2 or more.
2. Calculate TotalAmount (Quantity × UnitPrice) for each order.
3. Show the total number of orders per Category.
4. List orders placed in "January 2023" only.
5. Show the average UnitPrice per category.
6. Find the order with the highest total amount.
7. Drop the local view and try querying it again.

In [4]:

# 1. Electronics orders where at least two units were bought
electronics_bulk = spark.sql("""
SELECT * FROM orders_local
WHERE Category = 'Electronics' AND Quantity >= 2
""")
electronics_bulk.show()

# 2. Every order with its TotalAmount (Quantity x UnitPrice) appended
orders_with_total = spark.sql("""
SELECT *, (Quantity * UnitPrice) AS TotalAmount
FROM orders_local
""")
orders_with_total.show()

# 3. How many orders each Category received
orders_per_category = spark.sql("""
SELECT Category, COUNT(*) AS TotalOrders
FROM orders_local
GROUP BY Category
""")
orders_per_category.show()

# 4. Orders dated January 2023 (OrderDate is a YYYY-MM-DD string, so a prefix match works)
january_orders = spark.sql("""
SELECT * FROM orders_local
WHERE OrderDate LIKE '2023-01%'
""")
january_orders.show()

# 5. Mean UnitPrice per Category, rounded to two decimals
avg_price_per_category = spark.sql("""
SELECT Category, ROUND(AVG(UnitPrice), 2) AS AvgPrice
FROM orders_local
GROUP BY Category
""")
avg_price_per_category.show()

# 6. The single order with the largest TotalAmount
top_order = spark.sql("""
SELECT *, (Quantity * UnitPrice) AS TotalAmount
FROM orders_local
ORDER BY TotalAmount DESC
LIMIT 1
""")
top_order.show()

# 7. Drop the local view, then show that querying it now fails.
spark.catalog.dropTempView("orders_local")
try:
    spark.sql("SELECT * FROM orders_local").show()
except AnalysisException as e:
    # Catch only the analysis error Spark raises for an unresolved table/view,
    # so unrelated failures (e.g. a stopped session) are not silently reported
    # as the expected outcome of this demonstration.
    print("Error querying dropped local view:", e)

+-------+------------+-------+-----------+--------+---------+----------+
|OrderID|CustomerName|Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-------+-----------+--------+---------+----------+
|    101|        Ravi| Laptop|Electronics|       2|    55000|2023-01-15|
|    108|       Kabir| Tablet|Electronics|       3|    20000|2023-02-20|
|    112|       Sneha| Camera|Electronics|       2|    35000|2023-02-28|
+-------+------------+-------+-----------+--------+---------+----------+

+-------+------------+------------+-----------+--------+---------+----------+-----------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+------------+-----------+--------+---------+----------+-----------+
|    101|        Ravi|      Laptop|Electronics|       2|    55000|2023-01-15|     110000|
|    102|       Sneha|     T-Shirt|   Clothing|       3|      800|2023-01-20|       2400|
|    103|       Kabir|   Bookshelf|  F

# Part B: Global View – orders_global
1. Display all "Furniture" orders with TotalAmount above 10,000.

2. Create a column called DiscountFlag:
Mark "Yes" if Quantity > 3
Otherwise "No"
3. List customers who ordered more than 1 product type (Hint: use GROUP BY and
HAVING).
4. Count number of orders per month across the dataset.
5. Rank all products by total quantity sold across all orders using a window
function.
6. Run a query using a new SparkSession and the global view.

In [5]:

# 1. Furniture orders whose TotalAmount exceeds 10,000
pricey_furniture = spark.sql("""
SELECT *, (Quantity * UnitPrice) AS TotalAmount
FROM global_temp.orders_global
WHERE Category = 'Furniture' AND (Quantity * UnitPrice) > 10000
""")
pricey_furniture.show()

# 2. Flag each order: DiscountFlag is 'Yes' when Quantity > 3, otherwise 'No'
orders_flagged = spark.sql("""
SELECT *, CASE WHEN Quantity > 3 THEN 'Yes' ELSE 'No' END AS DiscountFlag
FROM global_temp.orders_global
""")
orders_flagged.show()

# 3. Customers who bought more than one distinct product
multi_product_customers = spark.sql("""
SELECT CustomerName, COUNT(DISTINCT Product) AS ProductTypes
FROM global_temp.orders_global
GROUP BY CustomerName
HAVING COUNT(DISTINCT Product) > 1
""")
multi_product_customers.show()

# 4. Order count per month; the first 7 characters of OrderDate are YYYY-MM
orders_per_month = spark.sql("""
SELECT SUBSTRING(OrderDate, 1, 7) AS Month, COUNT(*) AS OrdersCount
FROM global_temp.orders_global
GROUP BY SUBSTRING(OrderDate, 1, 7)
ORDER BY Month
""")
orders_per_month.show()

# 5. Products ranked by total quantity sold, using a window function over the aggregate
product_qty_ranking = spark.sql("""
SELECT Product, SUM(Quantity) AS TotalQty,
       RANK() OVER (ORDER BY SUM(Quantity) DESC) AS RankPos
FROM global_temp.orders_global
GROUP BY Product
""")
product_qty_ranking.show()

# 6. Query the global view from a genuinely new SparkSession.
# SparkSession.builder...getOrCreate() would simply return the session that is
# already running, so it cannot demonstrate cross-session access. newSession()
# creates a separate session (its own temp views and SQL configuration) on the
# same SparkContext; global temp views stay reachable through `global_temp`.
new_spark = spark.newSession()
new_spark.sql("""
SELECT * FROM global_temp.orders_global
""").show()


+-------+------------+------------+---------+--------+---------+----------+-----------+
|OrderID|CustomerName|     Product| Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+------------+---------+--------+---------+----------+-----------+
|    103|       Kabir|   Bookshelf|Furniture|       1|    12000|2023-02-05|      12000|
|    106|        Ravi|Dining Table|Furniture|       2|    25000|2023-03-15|      50000|
|    110|       Divya|       Chair|Furniture|       5|     4000|2023-01-18|      20000|
+-------+------------+------------+---------+--------+---------+----------+-----------+

+-------+------------+------------+-----------+--------+---------+----------+------------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|DiscountFlag|
+-------+------------+------------+-----------+--------+---------+----------+------------+
|    101|        Ravi|      Laptop|Electronics|       2|    55000|2023-01-15|          No|
|    102|       Sne

# Bonus Challenges
1. Save a filtered subset (only "Books" category) as a new global temp view.
2. Find the most purchased product per category.
3. Create a view that excludes all "Clothing" orders and call it
"filtered_orders".

In [7]:
# 1. Publish the Books-only subset as its own global temp view
books_orders = spark.sql("""
SELECT * FROM global_temp.orders_global
WHERE Category = 'Books'
""")
books_orders.createOrReplaceGlobalTempView("books_only")

# Read the new view back to confirm what it contains
books_view_rows = spark.sql("SELECT * FROM global_temp.books_only")
books_view_rows.show()

# 2. Most purchased product per category
# Quantities are summed per (Category, Product), then numbered within each
# category from highest total to lowest. Product is a secondary sort key so
# that a tie on total quantity always resolves to the same row instead of an
# arbitrary one, and the final ORDER BY keeps the displayed order stable.
spark.sql("""
WITH ranked AS (
    SELECT Category,
           Product,
           SUM(Quantity) AS TotalQty,
           ROW_NUMBER() OVER (
               PARTITION BY Category
               ORDER BY SUM(Quantity) DESC, Product
           ) AS rn
    FROM global_temp.orders_global
    GROUP BY Category, Product
)
SELECT Category, Product, TotalQty
FROM ranked
WHERE rn = 1
ORDER BY Category
""").show()


# 3. Global temp view "filtered_orders": every order except Clothing
non_clothing_orders = spark.sql("""
SELECT * FROM global_temp.orders_global
WHERE Category != 'Clothing'
""")
non_clothing_orders.createOrReplaceGlobalTempView("filtered_orders")

# Read the view back to confirm the Clothing rows are gone
filtered_view_rows = spark.sql("SELECT * FROM global_temp.filtered_orders")
filtered_view_rows.show()

+-------+------------+--------+--------+--------+---------+----------+
|OrderID|CustomerName| Product|Category|Quantity|UnitPrice| OrderDate|
+-------+------------+--------+--------+--------+---------+----------+
|    104|       Anita|   Novel|   Books|       4|      500|2023-02-10|
|    109|       Anita|Textbook|   Books|       2|     1500|2023-03-10|
+-------+------------+--------+--------+--------+---------+----------+

+-----------+-------+--------+
|   Category|Product|TotalQty|
+-----------+-------+--------+
|      Books|  Novel|       4|
|   Clothing|  Shirt|       4|
|Electronics| Tablet|       3|
|  Furniture|  Chair|       5|
+-----------+-------+--------+

+-------+------------+------------+-----------+--------+---------+----------+
|OrderID|CustomerName|     Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+------------+-----------+--------+---------+----------+
|    101|        Ravi|      Laptop|Electronics|       2|    55000|2023-01-15|
|    103|   