<a href="https://colab.research.google.com/github/Subramaniya-pillai/data_engineering/blob/main/sales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark




In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


In [3]:
# Build (or reuse) the SparkSession used by every cell below.
# The catalog implementation is set to "in-memory" via the session config.
spark = (
    SparkSession.builder
    .appName("Spark SQL Exercises")
    .config("spark.sql.catalogImplementation", "in-memory")
    .getOrCreate()
)


In [5]:
# 1. Create a new database (IF NOT EXISTS keeps the cell safe to re-run)
spark.sql("CREATE DATABASE IF NOT EXISTS sales_db")

# 2. Set current database so unqualified table names below resolve to sales_db
spark.catalog.setCurrentDatabase("sales_db")

# 3. Create product_sales table (Parquet-backed; skipped if it already exists)
spark.sql("""
CREATE TABLE IF NOT EXISTS product_sales (
  ProductID INT,
  ProductName STRING,
  Category STRING,
  Price DOUBLE,
  Quantity INT,
  SaleDate DATE
)
USING PARQUET
""")

# 4. Load the 5 sample rows.
# INSERT OVERWRITE replaces the table contents instead of appending, so
# re-running this cell leaves exactly these 5 rows rather than duplicating them.
spark.sql("""
INSERT OVERWRITE TABLE product_sales VALUES
(101, 'Laptop', 'Electronics', 75000, 1, DATE '2025-06-01'),
(102, 'Smartphone', 'Electronics', 30000, 2, DATE '2025-06-02'),
(103, 'Tablet', 'Electronics', 20000, 3, DATE '2025-06-03'),
(104, 'Printer', 'Office', 8000, 1, DATE '2025-06-04'),
(105, 'Desk', 'Furniture', 12000, 2, DATE '2025-06-05')
""")

# Show inserted data
spark.sql("SELECT * FROM product_sales").show()


+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      101|     Laptop|Electronics|75000.0|       1|2025-06-01|
|      102| Smartphone|Electronics|30000.0|       2|2025-06-02|
|      103|     Tablet|Electronics|20000.0|       3|2025-06-03|
|      104|    Printer|     Office| 8000.0|       1|2025-06-04|
|      105|       Desk|  Furniture|12000.0|       2|2025-06-05|
+---------+-----------+-----------+-------+--------+----------+



# **5–9. Query Tasks**



In [6]:
# 5. Fetch every row and column of product_sales and print the result
all_sales = spark.sql("SELECT * FROM product_sales")
all_sales.show()

+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      101|     Laptop|Electronics|75000.0|       1|2025-06-01|
|      102| Smartphone|Electronics|30000.0|       2|2025-06-02|
|      103|     Tablet|Electronics|20000.0|       3|2025-06-03|
|      104|    Printer|     Office| 8000.0|       1|2025-06-04|
|      105|       Desk|  Furniture|12000.0|       2|2025-06-05|
+---------+-----------+-----------+-------+--------+----------+



In [7]:
# 6. Keep only the products whose unit price is above 500
price_filter_query = "SELECT * FROM product_sales WHERE Price > 500"
spark.sql(price_filter_query).show()


+---------+-----------+-----------+-------+--------+----------+
|ProductID|ProductName|   Category|  Price|Quantity|  SaleDate|
+---------+-----------+-----------+-------+--------+----------+
|      101|     Laptop|Electronics|75000.0|       1|2025-06-01|
|      102| Smartphone|Electronics|30000.0|       2|2025-06-02|
|      103|     Tablet|Electronics|20000.0|       3|2025-06-03|
|      104|    Printer|     Office| 8000.0|       1|2025-06-04|
|      105|       Desk|  Furniture|12000.0|       2|2025-06-05|
+---------+-----------+-----------+-------+--------+----------+



In [8]:
# 7. Per-row sale amount: unit price multiplied by quantity, exposed as TotalSale
total_sale_query = "SELECT ProductName, Price, Quantity, (Price * Quantity) AS TotalSale FROM product_sales"
spark.sql(total_sale_query).show()


+-----------+-------+--------+---------+
|ProductName|  Price|Quantity|TotalSale|
+-----------+-------+--------+---------+
|     Laptop|75000.0|       1|  75000.0|
| Smartphone|30000.0|       2|  60000.0|
|     Tablet|20000.0|       3|  60000.0|
|    Printer| 8000.0|       1|   8000.0|
|       Desk|12000.0|       2|  24000.0|
+-----------+-------+--------+---------+



In [9]:
# 8. Units sold per category: sum Quantity within each Category group
category_totals = spark.sql("SELECT Category, SUM(Quantity) AS TotalQuantity FROM product_sales GROUP BY Category")
category_totals.show()


+-----------+-------------+
|   Category|TotalQuantity|
+-----------+-------------+
|Electronics|            6|
|     Office|            1|
|  Furniture|            2|
+-----------+-------------+



In [10]:
# 9. Rank products by sale amount, highest first.
# The query has no tie-breaker column, so rows with equal TotalSale
# may appear in either order.
ranked_sales_query = "SELECT ProductName, (Price * Quantity) AS TotalSale FROM product_sales ORDER BY TotalSale DESC"
spark.sql(ranked_sales_query).show()

+-----------+---------+
|ProductName|TotalSale|
+-----------+---------+
|     Laptop|  75000.0|
|     Tablet|  60000.0|
| Smartphone|  60000.0|
|       Desk|  24000.0|
|    Printer|   8000.0|
+-----------+---------+



# **10–12. Temporary View Tasks**

In [11]:
# 10. Create PySpark DataFrame with dummy data
# Each tuple follows the order of `columns` below.
temp_data = [
    (201, 'Keyboard', 'Electronics', 1500.0, 2, '2025-06-01'),
    (202, 'Mouse', 'Electronics', 800.0, 1, '2025-06-02'),
    (203, 'Chair', 'Furniture', 5000.0, 3, '2025-06-03')
]
columns = ["ProductID", "ProductName", "Category", "Price", "Quantity", "SaleDate"]
# SaleDate is supplied as 'YYYY-MM-DD' text. Cast it to DATE so the column type
# matches product_sales.SaleDate instead of staying a plain string; withColumn
# on an existing name replaces the column in place, so column order is unchanged.
temp_df = (
    spark.createDataFrame(temp_data, columns)
    .withColumn("SaleDate", col("SaleDate").cast("date"))
)


In [12]:
# 11. Make temp_df queryable from Spark SQL under the name "temp_orders"
# (createOrReplace: an existing view with the same name is replaced)
orders_view_name = "temp_orders"
temp_df.createOrReplaceTempView(orders_view_name)

In [13]:

# 12. From the temp view, keep only the rows with a quantity above 1
multi_unit_orders = spark.sql("SELECT * FROM temp_orders WHERE Quantity > 1")
multi_unit_orders.show()

+---------+-----------+-----------+------+--------+----------+
|ProductID|ProductName|   Category| Price|Quantity|  SaleDate|
+---------+-----------+-----------+------+--------+----------+
|      201|   Keyboard|Electronics|1500.0|       2|2025-06-01|
|      203|      Chair|  Furniture|5000.0|       3|2025-06-03|
+---------+-----------+-----------+------+--------+----------+



# **13–14. Global Temp View Tasks**

In [14]:
# 13. Publish temp_df as the global temporary view "global_orders"
# (createOrReplace: an existing global view with the same name is replaced)
global_view_name = "global_orders"
temp_df.createOrReplaceGlobalTempView(global_view_name)

In [15]:
# 14. Read the global temp view back.
# The view is addressed through the global_temp qualifier rather than by its
# bare name.
global_orders = spark.sql("SELECT * FROM global_temp.global_orders")
global_orders.show()

+---------+-----------+-----------+------+--------+----------+
|ProductID|ProductName|   Category| Price|Quantity|  SaleDate|
+---------+-----------+-----------+------+--------+----------+
|      201|   Keyboard|Electronics|1500.0|       2|2025-06-01|
|      202|      Mouse|Electronics| 800.0|       1|2025-06-02|
|      203|      Chair|  Furniture|5000.0|       3|2025-06-03|
+---------+-----------+-----------+------+--------+----------+



# **15–18. Join Tasks**

In [16]:
# 15. Define the Parquet-backed customer_details table; IF NOT EXISTS makes
# the statement a no-op when the table is already present.
customer_details_ddl = """
CREATE TABLE IF NOT EXISTS customer_details (
  CustomerID INT,
  Name STRING,
  Gender STRING,
  City STRING,
  SignupDate DATE
)
USING PARQUET
"""
spark.sql(customer_details_ddl)


DataFrame[]

In [18]:
# 16. Insert data into customer_details
# INSERT OVERWRITE replaces the table contents instead of appending, so
# re-running this cell leaves exactly these 3 customers rather than duplicates.
spark.sql("""
INSERT OVERWRITE TABLE customer_details VALUES
(101, 'nithya', 'F', 'Chennai', DATE '2024-01-01'),
(104, 'mani', 'M', 'Delhi', DATE '2024-02-01'),
(105, 'saravana', 'M', 'Mumbai', DATE '2024-03-01')
""")

# Show inserted customers
spark.sql("SELECT * FROM customer_details").show()

+----------+--------+------+-------+----------+
|CustomerID|    Name|Gender|   City|SignupDate|
+----------+--------+------+-------+----------+
|       104|    mani|     M|  Delhi|2024-02-01|
|       105|saravana|     M| Mumbai|2024-03-01|
|       101|  nithya|     F|Chennai|2024-01-01|
+----------+--------+------+-------+----------+



In [19]:
# 17. Inner-join products to customers where ProductID equals CustomerID.
# NOTE(review): the two keys are different kinds of identifier; rows match only
# because the sample values (101, 104, 105) occur in both tables.
product_customer_query = """
SELECT ps.ProductID, ps.ProductName, cd.Name, cd.City
FROM product_sales ps
JOIN customer_details cd
ON ps.ProductID = cd.CustomerID
"""
spark.sql(product_customer_query).show()


+---------+-----------+--------+-------+
|ProductID|ProductName|    Name|   City|
+---------+-----------+--------+-------+
|      104|    Printer|    mani|  Delhi|
|      105|       Desk|saravana| Mumbai|
|      101|     Laptop|  nithya|Chennai|
+---------+-----------+--------+-------+



In [20]:
# 18. Customers who bought more than 2 products
# Group on CustomerID as well as Name: names are not guaranteed to be unique,
# so grouping on Name alone would add together the quantities of different
# customers who share a name. The selected columns are unchanged.
# With the sample rows the matched quantities are 1, 1 and 2, so no customer
# exceeds 2 and the result is empty.
spark.sql("""
SELECT cd.Name, SUM(ps.Quantity) AS TotalBought
FROM product_sales ps
JOIN customer_details cd
ON ps.ProductID = cd.CustomerID
GROUP BY cd.CustomerID, cd.Name
HAVING SUM(ps.Quantity) > 2
""").show()

+----+-----------+
|Name|TotalBought|
+----+-----------+
+----+-----------+



# **19–20. View & Summary Tasks**

In [21]:
# 19. Define (or redefine) the sales_summary view over product_sales, adding a
# Total column computed as Price * Quantity.
sales_summary_ddl = """
CREATE OR REPLACE VIEW sales_summary AS
SELECT ProductName, Price, Quantity, (Price * Quantity) AS Total
FROM product_sales
"""
spark.sql(sales_summary_ddl)


DataFrame[]

In [22]:
# 20. From the summary view, keep only the rows whose Total is above 1000
large_sales = spark.sql("SELECT * FROM sales_summary WHERE Total > 1000")
large_sales.show()

+-----------+-------+--------+-------+
|ProductName|  Price|Quantity|  Total|
+-----------+-------+--------+-------+
|     Laptop|75000.0|       1|75000.0|
| Smartphone|30000.0|       2|60000.0|
|     Tablet|20000.0|       3|60000.0|
|    Printer| 8000.0|       1| 8000.0|
|       Desk|12000.0|       2|24000.0|
+-----------+-------+--------+-------+



# **21–23. Cleanup Tasks**

In [None]:
# 21. Remove the sales_summary view; IF EXISTS makes this a no-op when it is absent
drop_view_statement = "DROP VIEW IF EXISTS sales_summary"
spark.sql(drop_view_statement)

In [None]:
# 22. Remove both tables; IF EXISTS keeps each statement safe when the table is absent
drop_product_sales = "DROP TABLE IF EXISTS product_sales"
drop_customer_details = "DROP TABLE IF EXISTS customer_details"
spark.sql(drop_product_sales)
spark.sql(drop_customer_details)


In [None]:
# 23. Drop database
# Switch back to the built-in "default" database first so the session's current
# database is not the one being dropped.
spark.catalog.setCurrentDatabase("default")
# CASCADE also removes any tables or views still left inside sales_db;
# IF EXISTS makes the statement a no-op when the database is already gone.
spark.sql("DROP DATABASE IF EXISTS sales_db CASCADE")