# Step 1: Data Preparation

In [26]:
from pyspark.sql import SparkSession, Row

spark = SparkSession.builder.appName("ProductOrdersSQL").getOrCreate()

data = [
    Row(OrderID=1, CustomerName="Ravi", Product="Laptop", Category="Electronics", Quantity=2, UnitPrice=50000, OrderDate="2023-01-05"),
    Row(OrderID=2, CustomerName="Sneha", Product="Jeans", Category="Clothing", Quantity=1, UnitPrice=2500, OrderDate="2023-01-12"),
    Row(OrderID=3, CustomerName="Kabir", Product="Desk", Category="Furniture", Quantity=1, UnitPrice=12000, OrderDate="2023-02-01"),
    Row(OrderID=4, CustomerName="Anita", Product="Chair", Category="Furniture", Quantity=3, UnitPrice=4500, OrderDate="2023-02-18"),
    Row(OrderID=5, CustomerName="Divya", Product="Smartphone", Category="Electronics", Quantity=2, UnitPrice=30000, OrderDate="2023-01-20"),
    Row(OrderID=6, CustomerName="Manav", Product="Shirt", Category="Clothing", Quantity=4, UnitPrice=1500, OrderDate="2023-03-12"),
    Row(OrderID=7, CustomerName="Amit", Product="Tablet", Category="Electronics", Quantity=1, UnitPrice=28000, OrderDate="2023-03-25"),
    Row(OrderID=8, CustomerName="Neha", Product="Lamp", Category="Furniture", Quantity=2, UnitPrice=3000, OrderDate="2023-01-28"),
    Row(OrderID=9, CustomerName="Farah", Product="Book - AI", Category="Books", Quantity=5, UnitPrice=900, OrderDate="2023-04-01"),
    Row(OrderID=10, CustomerName="Ravi", Product="Book - Python", Category="Books", Quantity=3, UnitPrice=1200, OrderDate="2023-04-10"),
    Row(OrderID=11, CustomerName="Sneha", Product="T-shirt", Category="Clothing", Quantity=2, UnitPrice=700, OrderDate="2023-05-05"),
    Row(OrderID=12, CustomerName="Kabir", Product="Headphones", Category="Electronics", Quantity=1, UnitPrice=4000, OrderDate="2023-02-22"),
    Row(OrderID=13, CustomerName="Divya", Product="Notebook", Category="Books", Quantity=2, UnitPrice=1500, OrderDate="2023-05-20"),
    Row(OrderID=14, CustomerName="Manav", Product="Table", Category="Furniture", Quantity=4, UnitPrice=3500, OrderDate="2023-06-01")
]

df = spark.createDataFrame(data)
df.createOrReplaceTempView("orders_local")
df.createOrReplaceGlobalTempView("orders_global")
df.show(truncate=False)



+-------+------------+-------------+-----------+--------+---------+----------+
|OrderID|CustomerName|Product      |Category   |Quantity|UnitPrice|OrderDate |
+-------+------------+-------------+-----------+--------+---------+----------+
|1      |Ravi        |Laptop       |Electronics|2       |50000    |2023-01-05|
|2      |Sneha       |Jeans        |Clothing   |1       |2500     |2023-01-12|
|3      |Kabir       |Desk         |Furniture  |1       |12000    |2023-02-01|
|4      |Anita       |Chair        |Furniture  |3       |4500     |2023-02-18|
|5      |Divya       |Smartphone   |Electronics|2       |30000    |2023-01-20|
|6      |Manav       |Shirt        |Clothing   |4       |1500     |2023-03-12|
|7      |Amit        |Tablet       |Electronics|1       |28000    |2023-03-25|
|8      |Neha        |Lamp         |Furniture  |2       |3000     |2023-01-28|
|9      |Farah       |Book - AI    |Books      |5       |900      |2023-04-01|
|10     |Ravi        |Book - Python|Books      |3   

# Part A: Local View – orders_local

1. List all orders placed for "Electronics" with a Quantity of 2 or more.

In [3]:
spark.sql("select * from orders_local where category = 'Electronics' AND Quantity >=2").show()

+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|      1|        Ravi|    Laptop|Electronics|       2|    50000|2023-01-05|
|      5|       Divya|Smartphone|Electronics|       2|    30000|2023-01-20|
+-------+------------+----------+-----------+--------+---------+----------+



2. Calculate TotalAmount (Quantity × UnitPrice) for each order.

In [12]:
spark.sql("select * ,(Quantity * UnitPrice) as TotalAmount from orders_local").show()

+-------+------------+-------------+-----------+--------+---------+----------+-----------+
|OrderID|CustomerName|      Product|   Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+-------------+-----------+--------+---------+----------+-----------+
|      1|        Ravi|       Laptop|Electronics|       2|    50000|2023-01-05|     100000|
|      2|       Sneha|        Jeans|   Clothing|       1|     2500|2023-01-12|       2500|
|      3|       Kabir|         Desk|  Furniture|       1|    12000|2023-02-01|      12000|
|      4|       Anita|        Chair|  Furniture|       3|     4500|2023-02-18|      13500|
|      5|       Divya|   Smartphone|Electronics|       2|    30000|2023-01-20|      60000|
|      6|       Manav|        Shirt|   Clothing|       4|     1500|2023-03-12|       6000|
|      7|        Amit|       Tablet|Electronics|       1|    28000|2023-03-25|      28000|
|      8|        Neha|         Lamp|  Furniture|       2|     3000|2023-01-28|       6000|

3. Show the total number of orders per Category .

In [27]:
spark.sql("select Category, Count(*) as TotalOrders from orders_local group by Category").show()

+-----------+-----------+
|   Category|TotalOrders|
+-----------+-----------+
|Electronics|          4|
|   Clothing|          3|
|  Furniture|          4|
|      Books|          3|
+-----------+-----------+



4. List orders placed in "January 2023" only.

In [37]:
spark.sql("select * from orders_local where OrderDate BETWEEN '2023-01-01' AND '2023-01-31'").show()

+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|      1|        Ravi|    Laptop|Electronics|       2|    50000|2023-01-05|
|      2|       Sneha|     Jeans|   Clothing|       1|     2500|2023-01-12|
|      5|       Divya|Smartphone|Electronics|       2|    30000|2023-01-20|
|      8|        Neha|      Lamp|  Furniture|       2|     3000|2023-01-28|
+-------+------------+----------+-----------+--------+---------+----------+



5. Show the average UnitPrice per category.

In [11]:
spark.sql("select Category, Avg(UnitPrice) as Avg_UnitPrice from orders_local group by Category").show()

+-----------+------------------+
|   Category|     Avg_UnitPrice|
+-----------+------------------+
|Electronics|           28000.0|
|   Clothing|1566.6666666666667|
|  Furniture|            5750.0|
|      Books|            1200.0|
+-----------+------------------+



6. Find the order with the highest total amount.

In [15]:
spark.sql("select *, (Quantity * UnitPrice) as TotalAmount from orders_local order by TotalAmount DESC LIMIT 1").show()

+-------+------------+-------+-----------+--------+---------+----------+-----------+
|OrderID|CustomerName|Product|   Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+-------+-----------+--------+---------+----------+-----------+
|      1|        Ravi| Laptop|Electronics|       2|    50000|2023-01-05|     100000|
+-------+------------+-------+-----------+--------+---------+----------+-----------+



7. Drop the local view and try querying it again.

In [39]:
spark.catalog.dropTempView("orders_local")
spark.sql("SELECT * FROM orders_local").show()  # This will now show error

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `orders_local` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [orders_local], [], false


# Part B: Global View – orders_global

1. Display all "Furniture" orders with TotalAmount above
10,000.

In [19]:
spark.sql("select *, (Quantity * UnitPrice) as TotalAmount from global_temp.orders_global where Category = 'Furniture' AND (Quantity * UnitPrice)  > 10000").show()

+-------+------------+-------+---------+--------+---------+----------+-----------+
|OrderID|CustomerName|Product| Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+-------+---------+--------+---------+----------+-----------+
|      3|       Kabir|   Desk|Furniture|       1|    12000|2023-02-01|      12000|
|      4|       Anita|  Chair|Furniture|       3|     4500|2023-02-18|      13500|
|     14|       Manav|  Table|Furniture|       4|     3500|2023-06-01|      14000|
+-------+------------+-------+---------+--------+---------+----------+-----------+



2. Create a column called DiscountFlag :
Mark "Yes" if Quantity > 3
Otherwise "No"

In [21]:
spark.sql("""select *, CASE when Quantity > 3 THEN 'Yes'
ELSE 'No' END AS DiscountFlag from global_temp.orders_global""").show()

+-------+------------+-------------+-----------+--------+---------+----------+------------+
|OrderID|CustomerName|      Product|   Category|Quantity|UnitPrice| OrderDate|DiscountFlag|
+-------+------------+-------------+-----------+--------+---------+----------+------------+
|      1|        Ravi|       Laptop|Electronics|       2|    50000|2023-01-05|          No|
|      2|       Sneha|        Jeans|   Clothing|       1|     2500|2023-01-12|          No|
|      3|       Kabir|         Desk|  Furniture|       1|    12000|2023-02-01|          No|
|      4|       Anita|        Chair|  Furniture|       3|     4500|2023-02-18|          No|
|      5|       Divya|   Smartphone|Electronics|       2|    30000|2023-01-20|          No|
|      6|       Manav|        Shirt|   Clothing|       4|     1500|2023-03-12|         Yes|
|      7|        Amit|       Tablet|Electronics|       1|    28000|2023-03-25|          No|
|      8|        Neha|         Lamp|  Furniture|       2|     3000|2023-01-28|  

3. List customers who ordered more than 1 product type (Hint: use GROUP BY and
HAVING).

In [29]:
spark.sql("""select CustomerName from global_temp.orders_global
GROUP BY CustomerName
HAVING COUNT(DISTINCT Category) > 1
""").show()

+------------+
|CustomerName|
+------------+
|       Divya|
|        Ravi|
|       Kabir|
|       Manav|
+------------+



4. Count number of orders per month across the dataset.

In [30]:
spark.sql("""select substring(OrderDate, 1, 7) AS Month, COUNT(*) as OrderCount
from global_temp.orders_global
GROUP BY Month
ORDER BY Month
""").show()

+-------+----------+
|  Month|OrderCount|
+-------+----------+
|2023-01|         4|
|2023-02|         3|
|2023-03|         2|
|2023-04|         2|
|2023-05|         2|
|2023-06|         1|
+-------+----------+



5. Rank all products by total quantity sold across all orders using a window
function.

In [31]:
from pyspark.sql.functions import col, sum as Fsum
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

prod_qty = df.groupBy("Product").agg(Fsum("Quantity").alias("TotalQuantity"))

windowSpec = Window.orderBy(col("TotalQuantity").desc())
ranked = prod_qty.withColumn("Rank", rank().over(windowSpec))
ranked.show()

+-------------+-------------+----+
|      Product|TotalQuantity|Rank|
+-------------+-------------+----+
|    Book - AI|            5|   1|
|        Shirt|            4|   2|
|        Table|            4|   2|
|        Chair|            3|   4|
|Book - Python|            3|   4|
|       Laptop|            2|   6|
|   Smartphone|            2|   6|
|      T-shirt|            2|   6|
|         Lamp|            2|   6|
|     Notebook|            2|   6|
|         Desk|            1|  11|
|       Tablet|            1|  11|
|        Jeans|            1|  11|
|   Headphones|            1|  11|
+-------------+-------------+----+



6. Run a query using a new SparkSession and the global view.

In [32]:
new_Spark = SparkSession.builder.appName("NewSession").getOrCreate()
new_Spark.sql("select * from global_temp.orders_global").show()

+-------+------------+-------------+-----------+--------+---------+----------+
|OrderID|CustomerName|      Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-------------+-----------+--------+---------+----------+
|      1|        Ravi|       Laptop|Electronics|       2|    50000|2023-01-05|
|      2|       Sneha|        Jeans|   Clothing|       1|     2500|2023-01-12|
|      3|       Kabir|         Desk|  Furniture|       1|    12000|2023-02-01|
|      4|       Anita|        Chair|  Furniture|       3|     4500|2023-02-18|
|      5|       Divya|   Smartphone|Electronics|       2|    30000|2023-01-20|
|      6|       Manav|        Shirt|   Clothing|       4|     1500|2023-03-12|
|      7|        Amit|       Tablet|Electronics|       1|    28000|2023-03-25|
|      8|        Neha|         Lamp|  Furniture|       2|     3000|2023-01-28|
|      9|       Farah|    Book - AI|      Books|       5|      900|2023-04-01|
|     10|        Ravi|Book - Python|      Books|    

# Bonus Challenges

1. Save a filtered subset (only "Books" category) as a new global temp view.

In [33]:
books_df = df.filter(col("Category") == "Books")
books_df.createOrReplaceGlobalTempView("books_only")


2. Find the most purchased product per category.

In [34]:
from pyspark.sql.functions import row_number

prod_sum = df.groupBy("Category", "Product").agg(Fsum("Quantity").alias("Qty"))
window = Window.partitionBy("Category").orderBy(col("Qty").desc())
top_per_cat = prod_sum.withColumn("rn", row_number().over(window)).filter("rn = 1")
top_per_cat.show()

+-----------+---------+---+---+
|   Category|  Product|Qty| rn|
+-----------+---------+---+---+
|      Books|Book - AI|  5|  1|
|   Clothing|    Shirt|  4|  1|
|Electronics|   Laptop|  2|  1|
|  Furniture|    Table|  4|  1|
+-----------+---------+---+---+



3. Create a view that excludes all "Clothing" orders and call it
"filtered_orders" .

In [35]:
df.filter(col("Category") != "Clothing").createOrReplaceTempView("filtered_orders")
spark.sql("SELECT * FROM filtered_orders").show()

+-------+------------+-------------+-----------+--------+---------+----------+
|OrderID|CustomerName|      Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-------------+-----------+--------+---------+----------+
|      1|        Ravi|       Laptop|Electronics|       2|    50000|2023-01-05|
|      3|       Kabir|         Desk|  Furniture|       1|    12000|2023-02-01|
|      4|       Anita|        Chair|  Furniture|       3|     4500|2023-02-18|
|      5|       Divya|   Smartphone|Electronics|       2|    30000|2023-01-20|
|      7|        Amit|       Tablet|Electronics|       1|    28000|2023-03-25|
|      8|        Neha|         Lamp|  Furniture|       2|     3000|2023-01-28|
|      9|       Farah|    Book - AI|      Books|       5|      900|2023-04-01|
|     10|        Ravi|Book - Python|      Books|       3|     1200|2023-04-10|
|     12|       Kabir|   Headphones|Electronics|       1|     4000|2023-02-22|
|     13|       Divya|     Notebook|      Books|    