In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import col, when, sum as _sum, avg, count, rank, month, year
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("ProductOrdersAnalytics").getOrCreate()


In [2]:
data = [
    Row(OrderID=1, CustomerName="Ravi", Product="Laptop", Category="Electronics", Quantity=2, UnitPrice=50000, OrderDate="2023-01-15"),
    Row(OrderID=2, CustomerName="Anita", Product="T-Shirt", Category="Clothing", Quantity=4, UnitPrice=700, OrderDate="2023-01-20"),
    Row(OrderID=3, CustomerName="Kabir", Product="Book A", Category="Books", Quantity=3, UnitPrice=400, OrderDate="2023-02-10"),
    Row(OrderID=4, CustomerName="Divya", Product="Sofa", Category="Furniture", Quantity=1, UnitPrice=12000, OrderDate="2023-03-05"),
    Row(OrderID=5, CustomerName="Amit", Product="Phone", Category="Electronics", Quantity=1, UnitPrice=25000, OrderDate="2023-01-25"),
    Row(OrderID=6, CustomerName="Sneha", Product="Desk", Category="Furniture", Quantity=2, UnitPrice=8000, OrderDate="2023-04-11"),
    Row(OrderID=7, CustomerName="Neha", Product="Shoes", Category="Clothing", Quantity=2, UnitPrice=1500, OrderDate="2023-05-02"),
    Row(OrderID=8, CustomerName="Rishitha", Product="Book B", Category="Books", Quantity=5, UnitPrice=350, OrderDate="2023-06-18"),
    Row(OrderID=9, CustomerName="Savitri", Product="TV", Category="Electronics", Quantity=3, UnitPrice=40000, OrderDate="2023-07-01"),
    Row(OrderID=10, CustomerName="Kiran", Product="Bookshelf", Category="Furniture", Quantity=1, UnitPrice=15000, OrderDate="2023-07-20"),
    Row(OrderID=11, CustomerName="Farah", Product="Book C", Category="Books", Quantity=2, UnitPrice=500, OrderDate="2023-08-05"),
    Row(OrderID=12, CustomerName="Manav", Product="Headphones", Category="Electronics", Quantity=4, UnitPrice=3000, OrderDate="2023-09-09")
]

df = spark.createDataFrame(data)
df.show(truncate=False)


+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|Product   |Category   |Quantity|UnitPrice|OrderDate |
+-------+------------+----------+-----------+--------+---------+----------+
|1      |Ravi        |Laptop    |Electronics|2       |50000    |2023-01-15|
|2      |Anita       |T-Shirt   |Clothing   |4       |700      |2023-01-20|
|3      |Kabir       |Book A    |Books      |3       |400      |2023-02-10|
|4      |Divya       |Sofa      |Furniture  |1       |12000    |2023-03-05|
|5      |Amit        |Phone     |Electronics|1       |25000    |2023-01-25|
|6      |Sneha       |Desk      |Furniture  |2       |8000     |2023-04-11|
|7      |Neha        |Shoes     |Clothing   |2       |1500     |2023-05-02|
|8      |Rishitha    |Book B    |Books      |5       |350      |2023-06-18|
|9      |Savitri     |TV        |Electronics|3       |40000    |2023-07-01|
|10     |Kiran       |Bookshelf |Furniture  |1       |15000    |2023-07-20|
|11     |Far

In [3]:
# Local view
df.createOrReplaceTempView("orders_local")

# Global view
df.createOrReplaceGlobalTempView("orders_global")


In [4]:
#PART-A
# 1. List all Electronics orders with Quantity >= 2
spark.sql("SELECT * FROM orders_local WHERE Category = 'Electronics' AND Quantity >= 2").show()


+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|      1|        Ravi|    Laptop|Electronics|       2|    50000|2023-01-15|
|      9|     Savitri|        TV|Electronics|       3|    40000|2023-07-01|
|     12|       Manav|Headphones|Electronics|       4|     3000|2023-09-09|
+-------+------------+----------+-----------+--------+---------+----------+



In [5]:
# 2. Calculate TotalAmount (Quantity * UnitPrice) for each order
spark.sql("SELECT *, (Quantity * UnitPrice) AS TotalAmount FROM orders_local").show()


+-------+------------+----------+-----------+--------+---------+----------+-----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+----------+-----------+--------+---------+----------+-----------+
|      1|        Ravi|    Laptop|Electronics|       2|    50000|2023-01-15|     100000|
|      2|       Anita|   T-Shirt|   Clothing|       4|      700|2023-01-20|       2800|
|      3|       Kabir|    Book A|      Books|       3|      400|2023-02-10|       1200|
|      4|       Divya|      Sofa|  Furniture|       1|    12000|2023-03-05|      12000|
|      5|        Amit|     Phone|Electronics|       1|    25000|2023-01-25|      25000|
|      6|       Sneha|      Desk|  Furniture|       2|     8000|2023-04-11|      16000|
|      7|        Neha|     Shoes|   Clothing|       2|     1500|2023-05-02|       3000|
|      8|    Rishitha|    Book B|      Books|       5|      350|2023-06-18|       1750|
|      9|     Savitri|        TV

In [6]:
# 3. Show total number of orders per Category
spark.sql("SELECT Category, COUNT(*) AS TotalOrders FROM orders_local GROUP BY Category").show()


+-----------+-----------+
|   Category|TotalOrders|
+-----------+-----------+
|Electronics|          4|
|   Clothing|          2|
|      Books|          3|
|  Furniture|          3|
+-----------+-----------+



In [7]:
# 4. List orders placed in January 2023 only
spark.sql("SELECT * FROM orders_local WHERE OrderDate LIKE '2023-01%'").show()


+-------+------------+-------+-----------+--------+---------+----------+
|OrderID|CustomerName|Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-------+-----------+--------+---------+----------+
|      1|        Ravi| Laptop|Electronics|       2|    50000|2023-01-15|
|      2|       Anita|T-Shirt|   Clothing|       4|      700|2023-01-20|
|      5|        Amit|  Phone|Electronics|       1|    25000|2023-01-25|
+-------+------------+-------+-----------+--------+---------+----------+



In [8]:
# 5. Show average UnitPrice per category
spark.sql("SELECT Category, AVG(UnitPrice) AS AvgPrice FROM orders_local GROUP BY Category").show()


+-----------+------------------+
|   Category|          AvgPrice|
+-----------+------------------+
|Electronics|           29500.0|
|   Clothing|            1100.0|
|      Books| 416.6666666666667|
|  Furniture|11666.666666666666|
+-----------+------------------+



In [9]:
# 6. Find the order with the highest total amount
spark.sql("SELECT *, (Quantity * UnitPrice) AS TotalAmount FROM orders_local ORDER BY TotalAmount DESC LIMIT 1").show()


+-------+------------+-------+-----------+--------+---------+----------+-----------+
|OrderID|CustomerName|Product|   Category|Quantity|UnitPrice| OrderDate|TotalAmount|
+-------+------------+-------+-----------+--------+---------+----------+-----------+
|      9|     Savitri|     TV|Electronics|       3|    40000|2023-07-01|     120000|
+-------+------------+-------+-----------+--------+---------+----------+-----------+



In [10]:
# 7. Drop the local view and try querying again
spark.catalog.dropTempView("orders_local")
spark.sql("SELECT * FROM orders_local").show()
#IT GETS ERROR


AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `orders_local` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [orders_local], [], false


In [11]:
# PART-B
#1. Display all Furniture orders with TotalAmount > 10000
spark.sql("SELECT * FROM global_temp.orders_global WHERE Category = 'Furniture' AND (Quantity * UnitPrice) > 10000").show()


+-------+------------+---------+---------+--------+---------+----------+
|OrderID|CustomerName|  Product| Category|Quantity|UnitPrice| OrderDate|
+-------+------------+---------+---------+--------+---------+----------+
|      4|       Divya|     Sofa|Furniture|       1|    12000|2023-03-05|
|      6|       Sneha|     Desk|Furniture|       2|     8000|2023-04-11|
|     10|       Kiran|Bookshelf|Furniture|       1|    15000|2023-07-20|
+-------+------------+---------+---------+--------+---------+----------+



In [12]:
# 2. Create a column called DiscountFlag: 'Yes' if Quantity > 3, else 'No'
df_discount = df.withColumn("DiscountFlag", when(col("Quantity") > 3, "Yes").otherwise("No"))
df_discount.createOrReplaceGlobalTempView("orders_discounted")
spark.sql("SELECT * FROM global_temp.orders_discounted").show()


+-------+------------+----------+-----------+--------+---------+----------+------------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|DiscountFlag|
+-------+------------+----------+-----------+--------+---------+----------+------------+
|      1|        Ravi|    Laptop|Electronics|       2|    50000|2023-01-15|          No|
|      2|       Anita|   T-Shirt|   Clothing|       4|      700|2023-01-20|         Yes|
|      3|       Kabir|    Book A|      Books|       3|      400|2023-02-10|          No|
|      4|       Divya|      Sofa|  Furniture|       1|    12000|2023-03-05|          No|
|      5|        Amit|     Phone|Electronics|       1|    25000|2023-01-25|          No|
|      6|       Sneha|      Desk|  Furniture|       2|     8000|2023-04-11|          No|
|      7|        Neha|     Shoes|   Clothing|       2|     1500|2023-05-02|          No|
|      8|    Rishitha|    Book B|      Books|       5|      350|2023-06-18|         Yes|
|      9|     Savitri

In [13]:
# 3. List customers who ordered more than one product type
spark.sql("""SELECT CustomerName
  FROM global_temp.orders_global
  GROUP BY CustomerName
  HAVING COUNT(DISTINCT Category) > 1
""").show()


+------------+
|CustomerName|
+------------+
+------------+



In [14]:
# 4. Count number of orders per month
df_month = df.withColumn("Month", month(col("OrderDate")))
df_month.createOrReplaceGlobalTempView("orders_by_month")
spark.sql("SELECT Month, COUNT(*) AS OrdersCount FROM global_temp.orders_by_month GROUP BY Month ORDER BY Month").show()


+-----+-----------+
|Month|OrdersCount|
+-----+-----------+
|    1|          3|
|    2|          1|
|    3|          1|
|    4|          1|
|    5|          1|
|    6|          1|
|    7|          2|
|    8|          1|
|    9|          1|
+-----+-----------+



In [15]:
# 5. Rank all products by total quantity sold across all orders
windowSpec = Window.orderBy(col("TotalQtySold").desc())
product_totals = df.groupBy("Product").agg(_sum("Quantity").alias("TotalQtySold"))
ranked = product_totals.withColumn("Rank", rank().over(windowSpec))
ranked.show()


+----------+------------+----+
|   Product|TotalQtySold|Rank|
+----------+------------+----+
|    Book B|           5|   1|
|   T-Shirt|           4|   2|
|Headphones|           4|   2|
|    Book A|           3|   4|
|        TV|           3|   4|
|      Desk|           2|   6|
|    Laptop|           2|   6|
|     Shoes|           2|   6|
|    Book C|           2|   6|
|     Phone|           1|  10|
|      Sofa|           1|  10|
| Bookshelf|           1|  10|
+----------+------------+----+



In [16]:
# 6. Run a query using a new Spark session and the global view
new_spark = SparkSession.builder.appName("NewSession").getOrCreate()
new_spark.sql("SELECT * FROM global_temp.orders_global").show()


+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|      1|        Ravi|    Laptop|Electronics|       2|    50000|2023-01-15|
|      2|       Anita|   T-Shirt|   Clothing|       4|      700|2023-01-20|
|      3|       Kabir|    Book A|      Books|       3|      400|2023-02-10|
|      4|       Divya|      Sofa|  Furniture|       1|    12000|2023-03-05|
|      5|        Amit|     Phone|Electronics|       1|    25000|2023-01-25|
|      6|       Sneha|      Desk|  Furniture|       2|     8000|2023-04-11|
|      7|        Neha|     Shoes|   Clothing|       2|     1500|2023-05-02|
|      8|    Rishitha|    Book B|      Books|       5|      350|2023-06-18|
|      9|     Savitri|        TV|Electronics|       3|    40000|2023-07-01|
|     10|       Kiran| Bookshelf|  Furniture|       1|    15000|2023-07-20|
|     11|   

In [17]:
#BONUS
# 1. Save a filtered subset (only "Books" category) as a new global temp view
spark.sql("""CREATE OR REPLACE GLOBAL TEMP VIEW books_only AS
  SELECT * FROM global_temp.orders_global WHERE Category = 'Books'
""")


DataFrame[]

In [18]:
# View the "Books" only dataset
spark.sql("SELECT * FROM global_temp.books_only").show()


+-------+------------+-------+--------+--------+---------+----------+
|OrderID|CustomerName|Product|Category|Quantity|UnitPrice| OrderDate|
+-------+------------+-------+--------+--------+---------+----------+
|      3|       Kabir| Book A|   Books|       3|      400|2023-02-10|
|      8|    Rishitha| Book B|   Books|       5|      350|2023-06-18|
|     11|       Farah| Book C|   Books|       2|      500|2023-08-05|
+-------+------------+-------+--------+--------+---------+----------+



In [20]:
from pyspark.sql.functions import row_number
windowSpec = Window.partitionBy("Category").orderBy(col("TotalQty").desc())

most_purchased = df.groupBy("Category", "Product").agg(_sum("Quantity").alias("TotalQty"))
most_purchased = most_purchased.withColumn("rn", row_number().over(windowSpec)).filter("rn = 1")
most_purchased.show()


+-----------+----------+--------+---+
|   Category|   Product|TotalQty| rn|
+-----------+----------+--------+---+
|      Books|    Book B|       5|  1|
|   Clothing|   T-Shirt|       4|  1|
|Electronics|Headphones|       4|  1|
|  Furniture|      Desk|       2|  1|
+-----------+----------+--------+---+



In [21]:
# 3. Create a view that excludes all "Clothing" orders and call it "filtered_orders"
spark.sql("""
  CREATE OR REPLACE GLOBAL TEMP VIEW filtered_orders AS
  SELECT * FROM global_temp.orders_global WHERE Category != 'Clothing'
""")


DataFrame[]

In [22]:
# View the filtered orders
spark.sql("SELECT * FROM global_temp.filtered_orders").show()


+-------+------------+----------+-----------+--------+---------+----------+
|OrderID|CustomerName|   Product|   Category|Quantity|UnitPrice| OrderDate|
+-------+------------+----------+-----------+--------+---------+----------+
|      1|        Ravi|    Laptop|Electronics|       2|    50000|2023-01-15|
|      3|       Kabir|    Book A|      Books|       3|      400|2023-02-10|
|      4|       Divya|      Sofa|  Furniture|       1|    12000|2023-03-05|
|      5|        Amit|     Phone|Electronics|       1|    25000|2023-01-25|
|      6|       Sneha|      Desk|  Furniture|       2|     8000|2023-04-11|
|      8|    Rishitha|    Book B|      Books|       5|      350|2023-06-18|
|      9|     Savitri|        TV|Electronics|       3|    40000|2023-07-01|
|     10|       Kiran| Bookshelf|  Furniture|       1|    15000|2023-07-20|
|     11|       Farah|    Book C|      Books|       2|      500|2023-08-05|
|     12|       Manav|Headphones|Electronics|       4|     3000|2023-09-09|
+-------+---