# Tech Mahindra (Middle Level) - PySpark Interview Question

You are given a sales transaction dataset containing sales details. Your task is to write a PySpark transformation to:
* Calculate the total sales per product.
* Keep only the products whose total sales are greater than $1000.

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

In [0]:
# Column names for the sample sales data, in the same order as the fields of
# each tuple in `data`.
columns = [
    "transaction_id",
    "product_id",
    "product_name",
    "quantity",
    "price_per_unit",
]

# Sample sales transactions, one tuple per transaction.
data = [
    (1, "P001", "Laptop", 2, 600),
    (2, "P002", "Phone", 3, 300),
    (3, "P001", "Laptop", 1, 600),
    (4, "P003", "Tablet", 5, 150),
    (5, "P002", "Phone", 1, 300),
]

In [0]:
# Build the transactions DataFrame from the sample rows; `columns` supplies
# the column names and Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)

# Render the DataFrame in the notebook output.
df.display()

transaction_id,product_id,product_name,quantity,price_per_unit
1,P001,Laptop,2,600
2,P002,Phone,3,300
3,P001,Laptop,1,600
4,P003,Tablet,5,150
5,P002,Phone,1,300


In [0]:
# Total sales per product, keeping only products whose total exceeds 1000.
# Spark functions are referenced through the explicit `F` alias so that `sum`
# is unambiguously the Spark aggregate rather than Python's builtin `sum`,
# which the wildcard import of pyspark.sql.functions shadows.
(
    df.groupBy('product_id', 'product_name')
    .agg(
        # Per-transaction sales = quantity * price_per_unit, summed within
        # each (product_id, product_name) group.
        F.sum(F.col('quantity') * F.col('price_per_unit')).alias(
            'total_sales_per_product'
        )
    )
    # Strictly greater than 1000: a product totalling exactly 1000 is dropped.
    .filter(F.col('total_sales_per_product') > 1000)
    .display()
)

product_id,product_name,total_sales_per_product
P001,Laptop,1800
P002,Phone,1200
