In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
import pandas as pd
from io import StringIO
from pyspark.sql.types import StringType

In [2]:
# Obtain the SparkSession for this notebook: getOrCreate() reuses an active
# session when one exists, otherwise it builds one with app name "deo".
spark = (
    SparkSession.builder
    .appName("deo")
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

# **Database and Table Tasks**

In [4]:
# 1. Create a new database named sales_db.
# IF NOT EXISTS keeps the statement from failing when the database is already there.
create_db_sql = "CREATE DATABASE IF NOT EXISTS sales_db"
spark.sql(create_db_sql)

DataFrame[]

In [5]:
# 2. Set the current database to sales_db.
# Unqualified table names in the following cells resolve against this database.
use_db_sql = "USE sales_db"
spark.sql(use_db_sql)

DataFrame[]

In [7]:
# 3. Create the Parquet-backed table product_sales (skipped if it already exists).
create_product_sales_sql = """
  CREATE TABLE IF NOT EXISTS product_sales(
    ProductID INT,
    ProductName VARCHAR(50),
    Category VARCHAR(50),
    Price DOUBLE,
    Quantity INT,
    SaleDate DATE
  )
  USING PARQUET
"""
spark.sql(create_product_sales_sql)

DataFrame[]

In [21]:
# 4. Insert at least 5 rows into product_sales.
# INSERT OVERWRITE (instead of INSERT INTO) replaces the table contents, so
# re-running this cell does not append a second copy of the same rows.
# NOTE(review): 'Cloting' looks like a typo for 'Clothing'; the value is kept
# unchanged here so it still matches the outputs recorded in this notebook.
spark.sql("""
  INSERT OVERWRITE TABLE product_sales (ProductID, ProductName, Category, Price, Quantity, SaleDate)
  VALUES
    (1, 'Macbook Air', 'Electronics', 100000, 1, DATE('2025-06-01')),
    (2, 'Bleach Manga', 'Books', 850, 2, DATE('2025-06-02')),
    (3, 'Oversized Tees', 'Cloting', 600, 4, DATE('2025-06-03')),
    (4, 'Samsung Tab s10 Ultra', 'Electronics', 120000, 1, DATE('2025-06-04')),
    (5, 'Nothing Ear buds', 'Electronics', 14000, 1, DATE('2025-06-05'))
""")

DataFrame[]

# **Query Tasks**

In [22]:
# 5. Select all records from product_sales.
all_sales = spark.sql("SELECT * FROM product_sales")
all_sales.show()

+---------+--------------------+-----------+--------+--------+----------+
|ProductID|         ProductName|   Category|   Price|Quantity|  SaleDate|
+---------+--------------------+-----------+--------+--------+----------+
|        3|      Oversized Tees|    Cloting|   600.0|       4|2025-06-03|
|        4|Samsung Tab s10 U...|Electronics|120000.0|       1|2025-06-04|
|        5|    Nothing Ear buds|Electronics| 14000.0|       1|2025-06-05|
|        1|         Macbook Air|Electronics|100000.0|       1|2025-06-01|
|        2|        Bleach Manga|      Books|   850.0|       2|2025-06-02|
+---------+--------------------+-----------+--------+--------+----------+



In [23]:
# 6. Retrieve products where price is above 500.
# "Above 500" is a strict comparison, so a product priced at exactly 500
# must be excluded: use > rather than >=.
spark.sql("""
  SELECT * FROM product_sales
  WHERE Price > 500
""").show()

+---------+--------------------+-----------+--------+--------+----------+
|ProductID|         ProductName|   Category|   Price|Quantity|  SaleDate|
+---------+--------------------+-----------+--------+--------+----------+
|        3|      Oversized Tees|    Cloting|   600.0|       4|2025-06-03|
|        4|Samsung Tab s10 U...|Electronics|120000.0|       1|2025-06-04|
|        5|    Nothing Ear buds|Electronics| 14000.0|       1|2025-06-05|
|        1|         Macbook Air|Electronics|100000.0|       1|2025-06-01|
|        2|        Bleach Manga|      Books|   850.0|       2|2025-06-02|
+---------+--------------------+-----------+--------+--------+----------+



In [25]:
# 7. Calculate total sale amount (Price * Quantity) for each product.
# The computed column is appended to the existing columns as TotalSaleAmt.
total_sale_sql = """
  SELECT *, (Price * Quantity) AS TotalSaleAmt FROM product_sales
"""
total_sale_df = spark.sql(total_sale_sql)
total_sale_df.show()

+---------+--------------------+-----------+--------+--------+----------+------------+
|ProductID|         ProductName|   Category|   Price|Quantity|  SaleDate|TotalSaleAmt|
+---------+--------------------+-----------+--------+--------+----------+------------+
|        3|      Oversized Tees|    Cloting|   600.0|       4|2025-06-03|      2400.0|
|        4|Samsung Tab s10 U...|Electronics|120000.0|       1|2025-06-04|    120000.0|
|        5|    Nothing Ear buds|Electronics| 14000.0|       1|2025-06-05|     14000.0|
|        1|         Macbook Air|Electronics|100000.0|       1|2025-06-01|    100000.0|
|        2|        Bleach Manga|      Books|   850.0|       2|2025-06-02|      1700.0|
+---------+--------------------+-----------+--------+--------+----------+------------+



In [26]:
# 8. Find the number of products sold in each Category.
# Counts units sold: Quantity is summed per Category.
category_totals_sql = """
  SELECT Category, SUM(Quantity) FROM product_sales
  GROUP BY Category
"""
category_totals_df = spark.sql(category_totals_sql)
category_totals_df.show()

+-----------+-------------+
|   Category|sum(Quantity)|
+-----------+-------------+
|    Cloting|            4|
|Electronics|            3|
|      Books|            2|
+-----------+-------------+



In [27]:
# 9. Sort products by total sales in descending order.
# ORDER BY refers to the TotalSaleAmt alias defined in the SELECT list.
sorted_sales_sql = """
  SELECT *, (Price * Quantity) AS TotalSaleAmt FROM product_sales
  ORDER BY TotalSaleAmt DESC
"""
sorted_sales_df = spark.sql(sorted_sales_sql)
sorted_sales_df.show()

+---------+--------------------+-----------+--------+--------+----------+------------+
|ProductID|         ProductName|   Category|   Price|Quantity|  SaleDate|TotalSaleAmt|
+---------+--------------------+-----------+--------+--------+----------+------------+
|        4|Samsung Tab s10 U...|Electronics|120000.0|       1|2025-06-04|    120000.0|
|        1|         Macbook Air|Electronics|100000.0|       1|2025-06-01|    100000.0|
|        5|    Nothing Ear buds|Electronics| 14000.0|       1|2025-06-05|     14000.0|
|        3|      Oversized Tees|    Cloting|   600.0|       4|2025-06-03|      2400.0|
|        2|        Bleach Manga|      Books|   850.0|       2|2025-06-02|      1700.0|
+---------+--------------------+-----------+--------+--------+----------+------------+



# **Temporary View Tasks**

In [30]:
# 10. Create a PySpark DataFrame with dummy product data.
# The DataFrame is read from the product_sales table populated earlier.
source_table = "product_sales"
df = spark.table(source_table)
df.show()

+---------+--------------------+-----------+--------+--------+----------+
|ProductID|         ProductName|   Category|   Price|Quantity|  SaleDate|
+---------+--------------------+-----------+--------+--------+----------+
|        3|      Oversized Tees|    Cloting|   600.0|       4|2025-06-03|
|        4|Samsung Tab s10 U...|Electronics|120000.0|       1|2025-06-04|
|        5|    Nothing Ear buds|Electronics| 14000.0|       1|2025-06-05|
|        1|         Macbook Air|Electronics|100000.0|       1|2025-06-01|
|        2|        Bleach Manga|      Books|   850.0|       2|2025-06-02|
+---------+--------------------+-----------+--------+--------+----------+



In [31]:
# 11. Register it as a temporary view called temp_orders.
# createOrReplaceTempView overwrites an existing view of the same name.
df.createOrReplaceTempView("temp_orders")

# Read the view back through SQL to confirm it is queryable.
temp_orders_df = spark.sql("SELECT * FROM temp_orders")
temp_orders_df.show()

+---------+--------------------+-----------+--------+--------+----------+
|ProductID|         ProductName|   Category|   Price|Quantity|  SaleDate|
+---------+--------------------+-----------+--------+--------+----------+
|        3|      Oversized Tees|    Cloting|   600.0|       4|2025-06-03|
|        4|Samsung Tab s10 U...|Electronics|120000.0|       1|2025-06-04|
|        5|    Nothing Ear buds|Electronics| 14000.0|       1|2025-06-05|
|        1|         Macbook Air|Electronics|100000.0|       1|2025-06-01|
|        2|        Bleach Manga|      Books|   850.0|       2|2025-06-02|
+---------+--------------------+-----------+--------+--------+----------+



In [33]:
# 12. Run a SQL query to filter temp_orders where quantity > 1.
multi_unit_sql = """
  SELECT * FROM temp_orders
  WHERE Quantity > 1
"""
multi_unit_df = spark.sql(multi_unit_sql)
multi_unit_df.show()

+---------+--------------+--------+-----+--------+----------+
|ProductID|   ProductName|Category|Price|Quantity|  SaleDate|
+---------+--------------+--------+-----+--------+----------+
|        3|Oversized Tees| Cloting|600.0|       4|2025-06-03|
|        2|  Bleach Manga|   Books|850.0|       2|2025-06-02|
+---------+--------------+--------+-----+--------+----------+



# **Global View Tasks**

In [35]:
# 13. Create a global temp view from a PySpark DataFrame named global_orders.
df_2 = spark.table("product_sales")
df_2.createOrReplaceGlobalTempView("global_orders")

# The query below addresses the view through the global_temp database qualifier.
global_orders_df = spark.sql("SELECT * FROM global_temp.global_orders")
global_orders_df.show()

+---------+--------------------+-----------+--------+--------+----------+
|ProductID|         ProductName|   Category|   Price|Quantity|  SaleDate|
+---------+--------------------+-----------+--------+--------+----------+
|        3|      Oversized Tees|    Cloting|   600.0|       4|2025-06-03|
|        4|Samsung Tab s10 U...|Electronics|120000.0|       1|2025-06-04|
|        5|    Nothing Ear buds|Electronics| 14000.0|       1|2025-06-05|
|        1|         Macbook Air|Electronics|100000.0|       1|2025-06-01|
|        2|        Bleach Manga|      Books|   850.0|       2|2025-06-02|
+---------+--------------------+-----------+--------+--------+----------+



In [38]:
# 14. Run a SQL query on the global view from another notebook cell/session.
# SparkSession.builder...getOrCreate() would hand back the session that is
# already active, so it would not demonstrate cross-session access.
# spark.newSession() creates a separate session on the same SparkContext;
# the global temp view is reached through the global_temp database.
spark_2 = spark.newSession()
spark_2.sql("SELECT * FROM global_temp.global_orders").show()

+---------+--------------------+-----------+--------+--------+----------+
|ProductID|         ProductName|   Category|   Price|Quantity|  SaleDate|
+---------+--------------------+-----------+--------+--------+----------+
|        3|      Oversized Tees|    Cloting|   600.0|       4|2025-06-03|
|        4|Samsung Tab s10 U...|Electronics|120000.0|       1|2025-06-04|
|        5|    Nothing Ear buds|Electronics| 14000.0|       1|2025-06-05|
|        1|         Macbook Air|Electronics|100000.0|       1|2025-06-01|
|        2|        Bleach Manga|      Books|   850.0|       2|2025-06-02|
+---------+--------------------+-----------+--------+--------+----------+



# **Join Tasks**

In [41]:
# 15. Create a second table customer_details with:
#     CustomerID, Name, Gender, City, SignupDate.
# Parquet-backed; creation is skipped when the table already exists.
create_customer_details_sql = """
  CREATE TABLE IF NOT EXISTS customer_details(
    CustomerID INT,
    Name VARCHAR(50),
    Gender VARCHAR(20),
    City VARCHAR(40),
    SignupDate DATE
  )
  USING PARQUET
"""
spark.sql(create_customer_details_sql)

DataFrame[]

In [42]:
# 16. Insert at least 3 records into customer_details.
# INSERT OVERWRITE (instead of INSERT INTO) replaces the table contents, so
# re-running this cell does not append a second copy of the same rows.
# NOTE(review): 'Tokoyo' looks like a typo for 'Tokyo'; the value is kept
# unchanged here so it still matches the outputs recorded in this notebook.
spark.sql("""
  INSERT OVERWRITE TABLE customer_details (CustomerID, Name, Gender, City, SignupDate)
  VALUES
    (1, 'Tharun', 'Male', 'Chennai', DATE('2025-05-01')),
    (2, 'Eren', 'Male', 'Tokoyo', DATE('2025-05-03')),
    (3, 'Walter White', 'Male', 'Texas', DATE('2025-05-05'))
""")


DataFrame[]

In [43]:
# 17. Write a SQL join between product_sales and customer_details based on
#     ProductID = CustomerID (simulate a match).
# Inner join: only products whose ProductID equals some CustomerID are returned.
sales_customer_join_sql = """
  SELECT * FROM product_sales p
  INNER JOIN customer_details c
  ON p.ProductID = c.CustomerID
"""
sales_customer_df = spark.sql(sales_customer_join_sql)
sales_customer_df.show()

+---------+--------------+-----------+--------+--------+----------+----------+------------+------+-------+----------+
|ProductID|   ProductName|   Category|   Price|Quantity|  SaleDate|CustomerID|        Name|Gender|   City|SignupDate|
+---------+--------------+-----------+--------+--------+----------+----------+------------+------+-------+----------+
|        3|Oversized Tees|    Cloting|   600.0|       4|2025-06-03|         3|Walter White|  Male|  Texas|2025-05-05|
|        1|   Macbook Air|Electronics|100000.0|       1|2025-06-01|         1|      Tharun|  Male|Chennai|2025-05-01|
|        2|  Bleach Manga|      Books|   850.0|       2|2025-06-02|         2|        Eren|  Male| Tokoyo|2025-05-03|
+---------+--------------+-----------+--------+--------+----------+----------+------------+------+-------+----------+



In [44]:
# 18. List customers who bought more than 2 products.
# Uses the same simulated ProductID = CustomerID match as the join above.
# Quantities are summed per customer, so a customer whose purchases are spread
# over several rows is still counted; a row-level WHERE Quantity > 2 would
# miss that case. Grouping by CustomerID as well as Name keeps two customers
# with the same name apart.
spark.sql("""
  SELECT c.Name FROM product_sales p
  INNER JOIN customer_details c
  ON p.ProductID = c.CustomerID
  GROUP BY c.CustomerID, c.Name
  HAVING SUM(p.Quantity) > 2
""").show()

+------------+
|        Name|
+------------+
|Walter White|
+------------+



# **View and Summary Tasks**

In [47]:
# 19. Create a SQL view sales_summary that includes:
#     ProductName, Price, Quantity, Total = Price * Quantity.
df_3 = spark.sql("""
  SELECT ProductName, Price, Quantity, (Price * Quantity) AS Total FROM product_sales
""")
# createOrReplaceTempView (not createTempView) so that re-running this cell
# does not fail because the view already exists; this also matches how the
# temp_orders view is registered earlier in the notebook.
df_3.createOrReplaceTempView("sales_summary")

spark.sql("SELECT * FROM sales_summary").show()

+--------------------+--------+--------+--------+
|         ProductName|   Price|Quantity|   Total|
+--------------------+--------+--------+--------+
|      Oversized Tees|   600.0|       4|  2400.0|
|Samsung Tab s10 U...|120000.0|       1|120000.0|
|    Nothing Ear buds| 14000.0|       1| 14000.0|
|         Macbook Air|100000.0|       1|100000.0|
|        Bleach Manga|   850.0|       2|  1700.0|
+--------------------+--------+--------+--------+



In [48]:
# 20. Query the view for records with Total > 1000.
high_total_sql = """
  SELECT * FROM sales_summary
  WHERE Total > 1000
  """
high_total_df = spark.sql(high_total_sql)
high_total_df.show()

+--------------------+--------+--------+--------+
|         ProductName|   Price|Quantity|   Total|
+--------------------+--------+--------+--------+
|      Oversized Tees|   600.0|       4|  2400.0|
|Samsung Tab s10 U...|120000.0|       1|120000.0|
|    Nothing Ear buds| 14000.0|       1| 14000.0|
|         Macbook Air|100000.0|       1|100000.0|
|        Bleach Manga|   850.0|       2|  1700.0|
+--------------------+--------+--------+--------+



# **Cleanup Tasks**

In [52]:
# 21. Drop the view sales_summary.
# The call's return value is shown as the cell output.
view_to_drop = "sales_summary"
spark.catalog.dropTempView(view_to_drop)

True

In [54]:
# 22. Drop the tables product_sales and customer_details.
# IF EXISTS makes each statement a no-op when the table is already gone.
drop_product_sales_sql = "DROP TABLE IF EXISTS product_sales"
drop_customer_details_sql = "DROP TABLE IF EXISTS customer_details"
spark.sql(drop_product_sales_sql)
spark.sql(drop_customer_details_sql)

DataFrame[]

In [55]:
# 23. Drop the database sales_db.
# sales_db was made the current database earlier in the notebook. Switch back
# to the built-in default database first so the session is not left pointing
# at a database that no longer exists.
spark.sql("USE default")
spark.sql("DROP DATABASE IF EXISTS sales_db")

DataFrame[]