In [10]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SparkSQLExercises")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark


In [3]:
# 1. Create database
spark.sql("CREATE DATABASE IF NOT EXISTS sales_db")

# 2. Set current database
spark.catalog.setCurrentDatabase("sales_db")

# 3. Create table
spark.sql("""
CREATE TABLE IF NOT EXISTS product_sales (
    ProductID INT,
    ProductName STRING,
    Category STRING,
    Price DOUBLE,
    Quantity INT,
    SaleDate DATE
) USING CSV
OPTIONS ('path' '/tmp/product_sales.csv', 'header' 'true')
""")

# 4. Load the sample rows
# For environments without file persistence, we can create using a DataFrame:
data = [
    (1, "Laptop", "Electronics", 900.0, 2, "2024-12-01"),
    (2, "Phone", "Electronics", 600.0, 1, "2024-11-15"),
    (3, "Chair", "Furniture", 150.0, 4, "2024-11-10"),
    (4, "Desk", "Furniture", 300.0, 2, "2024-11-11"),
    (5, "Pen", "Stationery", 5.0, 10, "2024-10-10")
]
columns = ["ProductID", "ProductName", "Category", "Price", "Quantity", "SaleDate"]

# Python ints and strings are inferred as BIGINT and STRING. Cast every column
# to the type declared in step 3 so that the overwrite below (which writes the
# DataFrame's own schema) keeps ProductID/Quantity as INT and SaleDate as DATE.
df = spark.createDataFrame(data, columns).selectExpr(
    "CAST(ProductID AS INT) AS ProductID",
    "ProductName",
    "Category",
    "CAST(Price AS DOUBLE) AS Price",
    "CAST(Quantity AS INT) AS Quantity",
    "CAST(SaleDate AS DATE) AS SaleDate",
)
df.write.saveAsTable("product_sales", mode="overwrite")


In [4]:
# Exercises 5-9: each statement is run against product_sales and displayed,
# in the order listed.
product_queries = [
    # 5. Select all
    "SELECT * FROM product_sales",
    # 6. Price > 500
    "SELECT * FROM product_sales WHERE Price > 500",
    # 7. Total Sale Amount
    "SELECT ProductName, Price, Quantity, (Price * Quantity) AS TotalAmount FROM product_sales",
    # 8. Products per Category
    "SELECT Category, COUNT(*) AS ProductCount FROM product_sales GROUP BY Category",
    # 9. Sort by Total Sales
    "SELECT ProductName, (Price * Quantity) AS Total FROM product_sales ORDER BY Total DESC",
]

for query in product_queries:
    spark.sql(query).show()


+---------+-----------+-----------+-----+--------+----------+
|ProductID|ProductName|   Category|Price|Quantity|  SaleDate|
+---------+-----------+-----------+-----+--------+----------+
|        3|      Chair|  Furniture|150.0|       4|2024-11-10|
|        4|       Desk|  Furniture|300.0|       2|2024-11-11|
|        5|        Pen| Stationery|  5.0|      10|2024-10-10|
|        1|     Laptop|Electronics|900.0|       2|2024-12-01|
|        2|      Phone|Electronics|600.0|       1|2024-11-15|
+---------+-----------+-----------+-----+--------+----------+

+---------+-----------+-----------+-----+--------+----------+
|ProductID|ProductName|   Category|Price|Quantity|  SaleDate|
+---------+-----------+-----------+-----+--------+----------+
|        1|     Laptop|Electronics|900.0|       2|2024-12-01|
|        2|      Phone|Electronics|600.0|       1|2024-11-15|
+---------+-----------+-----------+-----+--------+----------+

+-----------+-----+--------+-----------+
|ProductName|Price|Quantity

In [5]:
# Sample order rows; the tuple positions follow the column names in temp_cols.
temp_cols = ["ProductID", "ProductName", "Category", "Price", "Quantity"]
temp_data = [
    (101, "Keyboard", "Electronics", 50.0, 2),
    (102, "Mouse", "Electronics", 25.0, 1),
    (103, "Notebook", "Stationery", 3.0, 5)
]
temp_df = spark.createDataFrame(temp_data, schema=temp_cols)

# 11. Register temporary view
temp_df.createOrReplaceTempView("temp_orders")

# 12. Query temp view: keep only the rows with a quantity above one
multi_unit_orders = spark.sql("SELECT * FROM temp_orders WHERE Quantity > 1")
multi_unit_orders.show()


+---------+-----------+-----------+-----+--------+
|ProductID|ProductName|   Category|Price|Quantity|
+---------+-----------+-----------+-----+--------+
|      101|   Keyboard|Electronics| 50.0|       2|
|      103|   Notebook| Stationery|  3.0|       5|
+---------+-----------+-----------+-----+--------+



In [6]:
# 13. Global temp view, built from the same DataFrame as temp_orders
temp_df.createOrReplaceGlobalTempView("global_orders")

# 14. Query in this or another session; the view is addressed through the
# global_temp qualifier
orders_over_30 = spark.sql("SELECT * FROM global_temp.global_orders WHERE Price > 30")
orders_over_30.show()


+---------+-----------+-----------+-----+--------+
|ProductID|ProductName|   Category|Price|Quantity|
+---------+-----------+-----------+-----+--------+
|      101|   Keyboard|Electronics| 50.0|       2|
+---------+-----------+-----------+-----+--------+



In [7]:
# 15–16. Create & insert customer table
customer_data = [
    (1, "Alice", "F", "New York", "2023-01-01"),
    (2, "Bob", "M", "Chicago", "2023-02-01"),
    (3, "Charlie", "M", "Boston", "2023-03-01")
]
customer_cols = ["CustomerID", "Name", "Gender", "City", "SignupDate"]
cust_df = spark.createDataFrame(customer_data, customer_cols)
cust_df.write.saveAsTable("customer_details", mode="overwrite")

# 17. Simulated Join (ProductID == CustomerID)
# There is no real purchase table, so the product id doubles as the customer
# id purely to have something to join on.
spark.sql("""
SELECT p.ProductName, c.Name, c.City
FROM product_sales p
JOIN customer_details c
ON p.ProductID = c.CustomerID
""").show()

# 18. Customers who bought more than 2 products
# Group by CustomerID as well as Name: grouping by Name alone would add up the
# quantities of two different customers who happen to share a name.
spark.sql("""
SELECT c.Name, SUM(p.Quantity) AS TotalBought
FROM product_sales p
JOIN customer_details c ON p.ProductID = c.CustomerID
GROUP BY c.CustomerID, c.Name
HAVING SUM(p.Quantity) > 2
""").show()


+-----------+-------+--------+
|ProductName|   Name|    City|
+-----------+-------+--------+
|      Phone|    Bob| Chicago|
|      Chair|Charlie|  Boston|
|     Laptop|  Alice|New York|
+-----------+-------+--------+

+-------+-----------+
|   Name|TotalBought|
+-------+-----------+
|Charlie|          4|
+-------+-----------+



In [8]:
# 19. Create view exposing each product's price, quantity and their product
create_summary_view = """
CREATE OR REPLACE VIEW sales_summary AS
SELECT ProductName, Price, Quantity, (Price * Quantity) AS Total
FROM product_sales
"""
spark.sql(create_summary_view)

# 20. Query view: rows whose Total is above 1000
high_value_sales = spark.sql("SELECT * FROM sales_summary WHERE Total > 1000")
high_value_sales.show()


+-----------+-----+--------+------+
|ProductName|Price|Quantity| Total|
+-----------+-----+--------+------+
|     Laptop|900.0|       2|1800.0|
+-----------+-----+--------+------+



In [9]:
# 21. Drop view
spark.sql("DROP VIEW IF EXISTS sales_summary")

# 22. Drop tables
# The unqualified names resolve against the current database (sales_db), so
# these must run before the session is switched away from it.
spark.sql("DROP TABLE IF EXISTS product_sales")
spark.sql("DROP TABLE IF EXISTS customer_details")

# 23. Drop database
# Switch to the built-in default database first, so the session is not left
# with a current database that no longer exists.
spark.catalog.setCurrentDatabase("default")
spark.sql("DROP DATABASE IF EXISTS sales_db CASCADE")


DataFrame[]