In [35]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists; otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Make sure the working database exists, then switch to it so that the
# unqualified table names used in later cells are looked up in sales_db.
spark.sql("CREATE DATABASE IF NOT EXISTS sales_db")
spark.sql("USE sales_db")

DataFrame[]

In [36]:
# DDL for the Parquet-backed product_sales table. IF NOT EXISTS makes the
# statement a no-op when the table is already present.
create_product_sales_ddl = """
CREATE TABLE IF NOT EXISTS product_sales (
    ProductID INT,
    ProductName STRING,
    Category STRING,
    Price DOUBLE,
    Quantity INT,
    SaleDate DATE
)
USING PARQUET
"""
spark.sql(create_product_sales_ddl)


DataFrame[]

In [37]:
# Load the five sample sales rows into product_sales.
# INSERT OVERWRITE (instead of INSERT INTO) replaces the table contents, so
# re-running this cell leaves exactly these five rows rather than appending
# duplicates to a table that survived from an earlier run.
spark.sql("""
INSERT OVERWRITE TABLE product_sales VALUES
(101, 'Smartwatch', 'Electronics', 25000.00, 5, DATE('2023-01-10')),
(102, 'Laptop', 'Electronics', 75000.00, 10, DATE('2024-05-14')),
(103, 'Chair', 'Furniture', 450.00, 20, DATE('2024-07-22')),
(104, 'Shoes', 'Fashion', 2000.00, 7, DATE('2024-09-18')),
(105, 'Microwave', 'Appliances', 8500.00, 2, DATE('2025-01-09'))
""")


DataFrame[]

In [38]:
# 5. Select all records from product_sales.
spark.sql("select * from product_sales").show()

# 6. Retrieve products where price is above 500.
spark.sql("select * from product_sales where Price > 500").show()

# 7. Calculate total sale amount (Price * Quantity) for each product.
spark.sql("select *,(Price * Quantity) as Total_Sale from product_sales").show()

# 8. Find the number of products sold in each Category.
# NOTE(review): COUNT(*) counts product rows per category, not units sold;
# SUM(Quantity) would give units. Confirm which one the task intends.
spark.sql("select Category,COUNT(*) AS Total_Product_Sold from product_sales group by Category").show()

# 9. Sort products by total sales in descending order.
# ORDER BY uses the alias with the same casing as its definition (Total_Sale)
# so the query also resolves when spark.sql.caseSensitive is enabled.
spark.sql("select *,(Price * Quantity) as Total_Sale from product_sales order by Total_Sale desc").show()

+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      101| Smartwatch|Electronics|25000.0|       5|2023-01-10|
|      102|     Laptop|Electronics|75000.0|      10|2024-05-14|
|      103|      Chair|  Furniture|  450.0|      20|2024-07-22|
|      104|      Shoes|    Fashion| 2000.0|       7|2024-09-18|
|      105|  Microwave| Appliances| 8500.0|       2|2025-01-09|
+---------+-----------+-----------+-------+--------+----------+

+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      101| Smartwatch|Electronics|25000.0|       5|2023-01-10|
|      102|     Laptop|Electronics|75000.0|      10|2024-05-14|
|      104|      Shoes|    Fashion| 2000.0|       7|2024-09-18|
|      105|  Microwave| Appliances| 850

In [39]:
# Temporary View Tasks
# 10. Build a small PySpark DataFrame of made-up product orders.

from pyspark.sql import Row

# Raw sample values, one tuple per order:
# (ProductID, ProductName, Category, Price, Quantity)
product_records = [
    (201, 'Smart Phone', 'Electronics', 15000.0, 3),
    (202, 'Camera', 'Electronics', 22000.0, 1),
    (203, 'T-shirt', 'Fashion', 500.0, 5),
    (204, 'Refrigerator', 'Appliances', 25000.0, 2),
    (205, 'Blendor', 'Appliances', 3000.0, 10),
]

# Wrap each tuple in a Row with named fields; createDataFrame takes the
# column names from those fields.
data = [
    Row(ProductID=pid, ProductName=name, Category=category, Price=price, Quantity=qty)
    for pid, name, category, price, qty in product_records
]

df = spark.createDataFrame(data)


In [40]:
# 11. Expose the DataFrame to SQL under the temporary view name temp_orders
#     (any existing view with that name is replaced).
df.createOrReplaceTempView("temp_orders")

# 12. Use SQL to keep only the temp_orders rows with Quantity > 1 and print them.
multi_unit_orders = spark.sql("select * from temp_orders where Quantity > 1")
multi_unit_orders.show()

+---------+------------+-----------+-------+--------+
|ProductID| ProductName|   Category|  Price|Quantity|
+---------+------------+-----------+-------+--------+
|      201| Smart Phone|Electronics|15000.0|       3|
|      203|     T-shirt|    Fashion|  500.0|       5|
|      204|Refrigerator| Appliances|25000.0|       2|
|      205|     Blendor| Appliances| 3000.0|      10|
+---------+------------+-----------+-------+--------+



In [41]:
# 13. Create a global temp view from the PySpark DataFrame, named global_orders.
df.createOrReplaceGlobalTempView("global_orders")

# 14. Run a SQL query on the global view from another session.
# newSession() returns a session that shares the SparkContext but has its own
# temporary views, so a successful query here shows the global view really is
# visible across sessions. Global temp views must be qualified with the
# global_temp database.
other_session = spark.newSession()
other_session.sql("select * from global_temp.global_orders where Quantity > 1").show()

+---------+------------+-----------+-------+--------+
|ProductID| ProductName|   Category|  Price|Quantity|
+---------+------------+-----------+-------+--------+
|      201| Smart Phone|Electronics|15000.0|       3|
|      203|     T-shirt|    Fashion|  500.0|       5|
|      204|Refrigerator| Appliances|25000.0|       2|
|      205|     Blendor| Appliances| 3000.0|      10|
+---------+------------+-----------+-------+--------+



In [42]:
# Join Tasks
# 15. Create a second table customer_details with:
# CustomerID, Name, Gender, City, SignupDate
spark.sql("""
CREATE TABLE IF NOT EXISTS customer_details (
    CustomerID INT,Name STRING,Gender STRING,
    City STRING,SignupDate DATE
)
USING PARQUET
""")

# 16. Insert at least 3 records into customer_details.
# INSERT OVERWRITE (instead of INSERT INTO) replaces the table contents, so
# re-running this cell does not append duplicate customers, which would
# multiply the rows produced by the joins on CustomerID further down.
spark.sql("""
INSERT OVERWRITE TABLE customer_details VALUES
(101, 'Rahul', 'M', 'Chennai', DATE('2022-01-15')),
(102, 'Priya', 'F', 'Delhi', DATE('2023-07-01')),
(105, 'Isha', 'F', 'Mumbai', DATE('2024-10-22'))
""")

DataFrame[]

In [43]:
# 17. Inner-join product_sales to customer_details, pairing rows where
#     ProductID equals CustomerID (a simulated relationship for the exercise).
sales_with_customers_sql = """
SELECT p.ProductID, p.ProductName, c.Name as Customer_Name, c.City,p.price, p.Quantity, p.SaleDate
FROM product_sales p
JOIN customer_details c ON p.ProductID = c.CustomerID
"""
spark.sql(sales_with_customers_sql).show()

# 18. Customers whose summed Quantity across matched sales is greater than 2.
#     HAVING filters on the aggregate after grouping by customer.
frequent_buyers_sql = """
SELECT c.CustomerID, c.Name, SUM(p.Quantity) AS Products_Bought
FROM product_sales p
JOIN customer_details c ON p.ProductID = c.CustomerID
GROUP BY c.CustomerID, c.Name
HAVING SUM(p.Quantity) > 2
"""
spark.sql(frequent_buyers_sql).show()


+---------+-----------+-------------+-------+-------+--------+----------+
|ProductID|ProductName|Customer_Name|   City|  price|Quantity|  SaleDate|
+---------+-----------+-------------+-------+-------+--------+----------+
|      101| Smartwatch|        Rahul|Chennai|25000.0|       5|2023-01-10|
|      102|     Laptop|        Priya|  Delhi|75000.0|      10|2024-05-14|
|      105|  Microwave|         Isha| Mumbai| 8500.0|       2|2025-01-09|
+---------+-----------+-------------+-------+-------+--------+----------+

+----------+-----+---------------+
|CustomerID| Name|Products_Bought|
+----------+-----+---------------+
|       102|Priya|             10|
|       101|Rahul|              5|
+----------+-----+---------------+



In [44]:
# View & Summary Tasks
# 19. Define the SQL view sales_summary exposing ProductName, Price, Quantity
#     and a computed Total (Price * Quantity) for every product_sales row.
sales_summary_ddl = """
CREATE OR REPLACE VIEW sales_summary AS
select
    ProductName,Price,Quantity,
    (Price * Quantity) AS Total
from product_sales
"""
spark.sql(sales_summary_ddl)

# 20. Show the view rows whose Total exceeds 1000.
high_value_sales = spark.sql("select * from sales_summary WHERE Total > 1000")
high_value_sales.show()



+-----------+-------+--------+--------+
|ProductName|  Price|Quantity|   Total|
+-----------+-------+--------+--------+
| Smartwatch|25000.0|       5|125000.0|
|     Laptop|75000.0|      10|750000.0|
|      Chair|  450.0|      20|  9000.0|
|      Shoes| 2000.0|       7| 14000.0|
|  Microwave| 8500.0|       2| 17000.0|
+-----------+-------+--------+--------+



In [45]:
# 21. Drop the view sales_summary.
spark.sql("DROP VIEW IF EXISTS sales_summary")

# 22. Drop the tables product_sales and customer_details.
spark.sql("DROP TABLE IF EXISTS product_sales")
spark.sql("DROP TABLE IF EXISTS customer_details")

# 23. Drop the database sales_db.
# Switch back to the built-in default database first: an earlier cell made
# sales_db the current database, and dropping it while it is current would
# leave the session pointing at a database that no longer exists.
spark.sql("USE default")
spark.sql("DROP DATABASE IF EXISTS sales_db CASCADE")



DataFrame[]