In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Sales DB") \
    .getOrCreate()


#Database & Table Tasks


In [3]:
# Q1. Create a new database named sales_db
spark.sql("CREATE DATABASE IF NOT EXISTS sales_db")

# Q2. Set the current database to sales_db
spark.sql("USE sales_db")

# Q3. Create a table product_sales
spark.sql("""
CREATE TABLE IF NOT EXISTS product_sales (
    ProductID INT,
    ProductName STRING,
    Category STRING,
    Price DOUBLE,
    Quantity INT,
    SaleDate DATE
)
USING PARQUET
""")

# Q4. Insert at least 5 rows into product_sales
spark.sql("""
INSERT INTO product_sales VALUES
(101, 'Laptop', 'Electronics', 75000, 1, DATE('2024-05-01')),
(102, 'Headphones', 'Electronics', 2500, 2, DATE('2024-05-02')),
(103, 'Shirt', 'Fashion', 1200, 3, DATE('2024-05-03')),
(104, 'Coffee Maker', 'Home Appliances', 3500, 1, DATE('2024-05-04')),
(105, 'Smartphone', 'Electronics', 30000, 2, DATE('2024-05-05'))
""")


DataFrame[]

#Query Tasks


In [4]:
# Q5. Select all records from product_sales
spark.sql("SELECT * FROM product_sales").show()

# Q6. Retrieve products where price is above 500
spark.sql("SELECT * FROM product_sales WHERE Price > 500").show()

# Q7. Calculate total sale amount (Price * Quantity) for each product
spark.sql("SELECT ProductName, Price, Quantity, (Price * Quantity) AS TotalSale FROM product_sales").show()

# Q8. Find the number of products sold in each Category
spark.sql("SELECT Category, COUNT(*) AS ProductCount FROM product_sales GROUP BY Category").show()

# Q9. Sort products by total sales in descending order
spark.sql("SELECT ProductName, (Price * Quantity) AS TotalSale FROM product_sales ORDER BY TotalSale DESC").show()


+---------+------------+---------------+-------+--------+----------+
|ProductID| ProductName|       Category|  Price|Quantity|  SaleDate|
+---------+------------+---------------+-------+--------+----------+
|      103|       Shirt|        Fashion| 1200.0|       3|2024-05-03|
|      104|Coffee Maker|Home Appliances| 3500.0|       1|2024-05-04|
|      105|  Smartphone|    Electronics|30000.0|       2|2024-05-05|
|      101|      Laptop|    Electronics|75000.0|       1|2024-05-01|
|      102|  Headphones|    Electronics| 2500.0|       2|2024-05-02|
+---------+------------+---------------+-------+--------+----------+

+---------+------------+---------------+-------+--------+----------+
|ProductID| ProductName|       Category|  Price|Quantity|  SaleDate|
+---------+------------+---------------+-------+--------+----------+
|      103|       Shirt|        Fashion| 1200.0|       3|2024-05-03|
|      104|Coffee Maker|Home Appliances| 3500.0|       1|2024-05-04|
|      105|  Smartphone|    Elect

#Temporary View Tasks

In [7]:
# Q10. Create a PySpark DataFrame with dummy product data
from pyspark.sql import Row

dummy_data = [
    Row(ProductID=201, ProductName='Tablet', Category='Electronics', Price=15000.0, Quantity=2),
    Row(ProductID=202, ProductName='Jacket', Category='Fashion', Price=3500.0, Quantity=1)
]

temp_df = spark.createDataFrame(dummy_data)

# Q11. Register it as a temporary view called temp_orders
temp_df.createOrReplaceTempView("temp_orders")

# Q12. Run a SQL query to filter temp_orders where quantity > 1
spark.sql("SELECT * FROM temp_orders WHERE Quantity > 1").show()


+---------+-----------+-----------+-------+--------+
|ProductID|ProductName|   Category|  Price|Quantity|
+---------+-----------+-----------+-------+--------+
|      201|     Tablet|Electronics|15000.0|       2|
+---------+-----------+-----------+-------+--------+



#Global View Tasks



In [8]:
# Q13. Create a global temp view from a PySpark DataFrame named global_orders
temp_df.createOrReplaceGlobalTempView("global_orders")

# Q14. Run a SQL query on the global view from another session or cell
spark.sql("SELECT * FROM global_temp.global_orders").show()


+---------+-----------+-----------+-------+--------+
|ProductID|ProductName|   Category|  Price|Quantity|
+---------+-----------+-----------+-------+--------+
|      201|     Tablet|Electronics|15000.0|       2|
|      202|     Jacket|    Fashion| 3500.0|       1|
+---------+-----------+-----------+-------+--------+



#Join Tasks

In [10]:
# Q15. Create a second table customer_details
spark.sql("""
CREATE TABLE IF NOT EXISTS customer_details (
    CustomerID INT,
    Name STRING,
    Gender STRING,
    City STRING,
    SignupDate DATE
)
USING PARQUET
""")

# Q16. Insert at least 3 records into customer_details
spark.sql("""
INSERT INTO customer_details VALUES
(101, 'Ali', 'Male', 'Delhi', DATE('2023-06-01')),
(104, 'Neha', 'Female', 'Mumbai', DATE('2023-06-10')),
(105, 'Rahul', 'Male', 'Chennai', DATE('2023-06-15'))
""")

# Q17. Write a SQL join between product_sales and customer_details based on ProductID = CustomerID
spark.sql("""
SELECT ps.ProductName, cd.Name, cd.City
FROM product_sales ps
JOIN customer_details cd
ON ps.ProductID = cd.CustomerID
""").show()

# Q18. List customers who bought more than 2 products

# Optional: Add a new customer who matches ProductID = 103 (Shirt, Quantity=3)
spark.sql("""
INSERT INTO customer_details VALUES
(103, 'Divya', 'Female', 'Bangalore', DATE('2023-07-01'))
""")

# Now re-run Q18
spark.sql("""
SELECT cd.Name, ps.Quantity
FROM product_sales ps
JOIN customer_details cd
ON ps.ProductID = cd.CustomerID
WHERE ps.Quantity > 2
""").show()


+------------+-----+-------+
| ProductName| Name|   City|
+------------+-----+-------+
|Coffee Maker| Neha| Mumbai|
|  Smartphone|Rahul|Chennai|
|Coffee Maker| Neha| Mumbai|
|  Smartphone|Rahul|Chennai|
|      Laptop|  Ali|  Delhi|
|      Laptop|  Ali|  Delhi|
+------------+-----+-------+

+-----+--------+
| Name|Quantity|
+-----+--------+
|Divya|       3|
+-----+--------+



#View & Summary Tasks


In [11]:
# Q19. Create a SQL view sales_summary
spark.sql("""
CREATE OR REPLACE VIEW sales_summary AS
SELECT ProductName, Price, Quantity, (Price * Quantity) AS Total
FROM product_sales
""")

# Q20. Query the view for records with Total > 1000
spark.sql("SELECT * FROM sales_summary WHERE Total > 1000").show()


+------------+-------+--------+-------+
| ProductName|  Price|Quantity|  Total|
+------------+-------+--------+-------+
|       Shirt| 1200.0|       3| 3600.0|
|Coffee Maker| 3500.0|       1| 3500.0|
|  Smartphone|30000.0|       2|60000.0|
|      Laptop|75000.0|       1|75000.0|
|  Headphones| 2500.0|       2| 5000.0|
+------------+-------+--------+-------+



#Cleanup Tasks

In [12]:
# Q21. Drop the view sales_summary
spark.sql("DROP VIEW IF EXISTS sales_summary")

# Q22. Drop the tables product_sales and customer_details
spark.sql("DROP TABLE IF EXISTS product_sales")
spark.sql("DROP TABLE IF EXISTS customer_details")

# Q23. Drop the database sales_db
spark.sql("DROP DATABASE IF EXISTS sales_db CASCADE")


DataFrame[]