In [None]:
pip install pyspark



In [None]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
# Create the session for this notebook, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("Day 2")
    .getOrCreate()
)

Problem 1: PySpark – Find Top N Products per Category

You have a PySpark DataFrame containing product sales. Each product belongs to a category, and you need to find the top 2 products by sales amount within each category.

In [None]:
# Sample sales figures, grouped by category as (product, sales) pairs.
sales_by_category = {
    "Electronics": [("Laptop", 1200), ("Phone", 900), ("Tablet", 700)],
    "Clothing": [("Shirt", 400), ("Jeans", 600), ("Jacket", 800)],
}

# Flatten to (category, product, sales) rows; dict order keeps the row order stable.
columns = ["category", "product", "sales"]
data = [
    (category, product, sales)
    for category, items in sales_by_category.items()
    for product, sales in items
]

df = spark.createDataFrame(data, schema=columns)

In [None]:
# How many products to keep per category.
TOP_N = 2

# Total sales per (category, product). `sum` and `col` are the
# pyspark.sql.functions versions brought in by the wildcard import,
# not the Python builtins.
product_sales = df.groupBy("category", "product").agg(
    sum(col("sales")).alias("Total_sales")
)

# Rank products inside each category from highest to lowest total. The product
# name is a tie-breaker so equal totals always receive the same row numbers.
window = Window.partitionBy("category").orderBy(
    col("Total_sales").desc(), col("product")
)

# Keep the DataFrame in `result`: .show() only prints and returns None, so it
# must not be the final call of the assignment chain.
result = (
    product_sales
    .withColumn("rank", row_number().over(window))
    .filter(col("rank") <= TOP_N)
    .select("category", "product", "Total_sales")
)
result.show()

+-----------+-------+-----------+
|   category|product|Total_sales|
+-----------+-------+-----------+
|   Clothing| Jacket|        800|
|   Clothing|  Jeans|        600|
|Electronics| Laptop|       1200|
|Electronics|  Phone|        900|
+-----------+-------+-----------+



Problem 2: SQL – Calculate Running Balance

You are given a SQL table transactions(user_id, txn_date, amount) where amount can be positive (credit) or negative (debit). Write a SQL query to calculate the running balance for each user ordered by txn_date.



In [14]:
# Sample ledger rows: (user_id, txn_date as text, amount).
columns = ["user_id", "txn_date", "amount"]
data = [
    (1, "2025-01-01", 500),
    (1, "2025-01-03", -200),
    (1, "2025-01-05", 300),
    (2, "2025-01-02", 1000),
    (2, "2025-01-04", -400),
]

# Build the frame and parse txn_date from its yyyy-MM-dd text form in one chain.
transactions = (
    spark.createDataFrame(data, schema=columns)
    .withColumn("txn_date", to_date(col("txn_date"), "yyyy-MM-dd"))
)

# Register the view that the SQL query reads from, then preview the rows.
transactions.createOrReplaceTempView("transactions")
transactions.show()

+-------+----------+------+
|user_id|  txn_date|amount|
+-------+----------+------+
|      1|2025-01-01|   500|
|      1|2025-01-03|  -200|
|      1|2025-01-05|   300|
|      2|2025-01-02|  1000|
|      2|2025-01-04|  -400|
+-------+----------+------+



In [18]:
# Running balance per user: cumulative sum of `amount` in txn_date order.
# The explicit ROWS frame adds one transaction per row; the default frame for
# an ordered window is RANGE-based and would lump same-date rows together.
# No trailing semicolon: spark.sql() takes a single statement and does not
# need a terminator.
running_balance_query = """
    SELECT
        user_id,
        txn_date,
        amount,
        SUM(amount) OVER (
            PARTITION BY user_id
            ORDER BY txn_date
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS running_balance
    FROM transactions
    ORDER BY user_id, txn_date
"""

result = spark.sql(running_balance_query)
result.show()

+-------+----------+------+---------------+
|user_id|  txn_date|amount|running_balance|
+-------+----------+------+---------------+
|      1|2025-01-01|   500|            500|
|      1|2025-01-03|  -200|            300|
|      1|2025-01-05|   300|            600|
|      2|2025-01-02|  1000|           1000|
|      2|2025-01-04|  -400|            600|
+-------+----------+------+---------------+

