In [0]:
# ---------- TASK 1 ----------
# 1. Put the CSV into DBFS (only if you don't already have it)
csv_data = """transaction_id,customer_name,region,product,category,quantity,unit_price,date
1,Rajesh,North,Laptop,Electronics,1,55000,2024-01-12
2,Sneha,West,Refrigerator,Electronics,1,32000,2024-02-05
3,Anil,South,Shampoo,Personal Care,5,150,2024-01-17
4,Divya,North,Mobile,Electronics,2,20000,2024-03-22
5,Vikram,East,Washing Machine,Electronics,1,28000,2024-02-28
6,Preeti,West,Sneakers,Fashion,2,4000,2024-01-31
7,Aman,South,TV,Electronics,1,45000,2024-02-15
8,Isha,North,Notebook,Stationery,10,60,2024-01-10
9,Kunal,East,Pencil,Stationery,20,10,2024-03-05
10,Tanvi,West,Face Cream,Personal Care,3,200,2024-03-19
"""
dbutils.fs.put("/tmp/sales_transactions.csv", csv_data, overwrite=True)

# 2. Read CSV into a Spark DataFrame (infer types)
df = spark.read.option("header", True).option("inferSchema", True).csv("dbfs:/tmp/sales_transactions.csv")
df.show()

# 3. Save as Parquet (separate folder)
df.write.mode("overwrite").parquet("/tmp/parquet/sales_transactions")

# 4. Save as Delta (separate folder)
dbutils.fs.rm("/tmp/delta/sales_transactions", recurse=True)
df.write.format("delta").mode("overwrite").save("/tmp/delta/sales_transactions")




Wrote 603 bytes.
+--------------+-------------+------+---------------+-------------+--------+----------+----------+
|transaction_id|customer_name|region|        product|     category|quantity|unit_price|      date|
+--------------+-------------+------+---------------+-------------+--------+----------+----------+
|             1|       Rajesh| North|         Laptop|  Electronics|       1|     55000|2024-01-12|
|             2|        Sneha|  West|   Refrigerator|  Electronics|       1|     32000|2024-02-05|
|             3|         Anil| South|        Shampoo|Personal Care|       5|       150|2024-01-17|
|             4|        Divya| North|         Mobile|  Electronics|       2|     20000|2024-03-22|
|             5|       Vikram|  East|Washing Machine|  Electronics|       1|     28000|2024-02-28|
|             6|       Preeti|  West|       Sneakers|      Fashion|       2|      4000|2024-01-31|
|             7|         Aman| South|             TV|  Electronics|       1|     45000|2024-

In [0]:
%sql
-- Add columns (if you must alter schema in SQL)
ALTER TABLE delta.`/tmp/delta/sales_transactions` ADD COLUMNS (
  total_amount DOUBLE,
  month INT,
  formatted_date STRING,
  is_high_value BOOLEAN
);

UPDATE delta.`/tmp/delta/sales_transactions`
SET total_amount = quantity * unit_price,
    month = MONTH(date),
    formatted_date = DATE_FORMAT(date, 'dd-MMM-yyyy'),
    is_high_value = (quantity * unit_price) > 30000;


num_affected_rows
10


In [0]:
#TASK-3
%sql
-- Transactions per region
SELECT region, COUNT(*) 
FROM delta.`/tmp/delta/sales_transactions` 
GROUP BY region;

region,count(1)
South,2
East,2
West,3
North,3


In [0]:
%sql
-- Top 3 categories by total sales
SELECT category, SUM(quantity * unit_price) AS total_sales
FROM delta.`/tmp/delta/sales_transactions`
GROUP BY category
ORDER BY total_sales DESC
LIMIT 3;

category,total_sales
Electronics,200000
Fashion,8000
Personal Care,1350


In [0]:
%sql
-- Month-wise revenue
SELECT MONTH(date) AS month, SUM(quantity * unit_price) AS revenue
FROM delta.`/tmp/delta/sales_transactions`
GROUP BY MONTH(date)
ORDER BY month;

month,revenue
1,64350
2,105000
3,40800


In [0]:
%sql
-- Highest single transaction(s)
WITH t AS (
  SELECT customer_name, transaction_id, quantity * unit_price AS txn_amount 
  FROM delta.`/tmp/delta/sales_transactions`
)
SELECT * 
FROM t 
WHERE txn_amount = (SELECT MAX(txn_amount) FROM t);

customer_name,transaction_id,txn_amount
Rajesh,1,55000


In [0]:
%sql
-- Q1 total
SELECT SUM(quantity * unit_price) AS Q1_sales
FROM delta.`/tmp/delta/sales_transactions`
WHERE MONTH(date) BETWEEN 1 AND 3;

Q1_sales
210150


In [0]:
#TASK-4
#Update price of all Stationery items by +10%
spark.sql("""
    UPDATE delta.`/tmp/delta/sales_transactions`
    SET unit_price = unit_price * 1.10
    WHERE category = 'Stationery'
""")


DataFrame[num_affected_rows: bigint]

In [0]:
# Delete records where quantity < 3
spark.sql("""
DELETE FROM delta.`/tmp/delta/sales_transactions`
WHERE quantity < 3
""")


DataFrame[num_affected_rows: bigint]

In [0]:
# Add a new row with today’s transaction
spark.sql("""
INSERT INTO delta.`/tmp/delta/sales_transactions`
(transaction_id, customer_name, region, product, category, quantity, unit_price, date)
VALUES
(11, 'Rishitha Varma', 'North', 'Gadget', 'Electronics', 1, 9999, current_date())
""")

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
%sql
SELECT * FROM delta.`/tmp/delta/sales_transactions`;


transaction_id,customer_name,region,product,category,quantity,unit_price,date,total_amount,month,formatted_date,is_high_value
8,Isha,North,Notebook,Stationery,10,66,2024-01-10,600.0,1.0,10-Jan-2024,False
9,Kunal,East,Pencil,Stationery,20,11,2024-03-05,200.0,3.0,05-Mar-2024,False
3,Anil,South,Shampoo,Personal Care,5,150,2024-01-17,750.0,1.0,17-Jan-2024,False
10,Tanvi,West,Face Cream,Personal Care,3,200,2024-03-19,600.0,3.0,19-Mar-2024,False
11,Rishitha Varma,North,Gadget,Electronics,1,9999,2025-08-08,,,,


In [0]:
%sql
-- TASK-5
-- Total sales amount per region
SELECT 
    region,
    SUM(quantity * unit_price) AS total_sales_amount
FROM delta.`/tmp/delta/sales_transactions`
GROUP BY region;

region,total_sales_amount
South,750
East,220
West,600
North,10659


In [0]:
%sql
-- Total quantity sold per category
SELECT 
    category,
    SUM(quantity) AS total_quantity
FROM delta.`/tmp/delta/sales_transactions`
GROUP BY category;

category,total_quantity
Stationery,30
Personal Care,8
Electronics,1


In [0]:
%sql
-- Average unit price per category
SELECT 
    category,
    AVG(unit_price) AS avg_unit_price
FROM delta.`/tmp/delta/sales_transactions`
GROUP BY category;

category,avg_unit_price
Stationery,38.5
Personal Care,175.0
Electronics,9999.0


In [0]:
# Highest selling product
query = """
SELECT 
    product,
    SUM(quantity) AS total_sold
FROM delta.`/tmp/delta/sales_transactions`
GROUP BY product
ORDER BY total_sold DESC
LIMIT 1
"""

result = spark.sql(query)
display(result)

product,total_sold
Pencil,20
