#  LTIMindtree - PySpark Interview Question — Solution

Given a dataset of monthly sales records that includes salesperson names and their regions, find the month with the highest sales in each region using a window function together with the max() function. The result must include the region name, the month, and the sales value. Sales should fluctuate from month to month, and the dataset should contain multiple records per region so that the windowing logic is actually exercised.

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.types import *

In [0]:
# Monthly sales fixture: one (salesperson, region, month, sales) tuple per
# record, with three months of data for each of the four regions.
data = [
    # North
    ("Amit", "North", "Jan", 12000),
    ("Rajesh", "North", "Feb", 15000),
    ("Sunita", "North", "Mar", 11000),
    # South
    ("Meena", "South", "Jan", 17000),
    ("Ravi", "South", "Feb", 20000),
    ("Priya", "South", "Mar", 18000),
    # East
    ("Suresh", "East", "Jan", 10000),
    ("Vishal", "East", "Feb", 22000),
    ("Akash", "East", "Mar", 21000),
    # West
    ("Anjali", "West", "Jan", 15000),
    ("Deepak", "West", "Feb", 13000),
    ("Nidhi", "West", "Mar", 17000),
]

# Column names, listed in the same order as the fields of each tuple above.
columns = ["Salesperson", "Region", "Month", "Sales"]

In [0]:
# Build the DataFrame from the in-memory rows. Only column names are given
# as the schema, so the column types are left for Spark to infer from the data.
# NOTE(review): `spark` is presumably the notebook-provided SparkSession.
df = spark.createDataFrame(data, schema=columns)

# Render the full input table in the notebook output.
df.display()

Salesperson,Region,Month,Sales
Amit,North,Jan,12000
Rajesh,North,Feb,15000
Sunita,North,Mar,11000
Meena,South,Jan,17000
Ravi,South,Feb,20000
Priya,South,Mar,18000
Suresh,East,Jan,10000
Vishal,East,Feb,22000
Akash,East,Mar,21000
Anjali,West,Jan,15000
Deepak,West,Feb,13000
Nidhi,West,Mar,17000


In [0]:
# Per-region window with no ordering: max() is evaluated over every row of
# the region, so sorting inside the partition is unnecessary.
window_criteria = Window.partitionBy('Region')

(
    df
    # `max` here is pyspark.sql.functions.max (star-imported in the first
    # cell), not the Python builtin. It attaches the region's highest sales
    # value to every row of that region.
    .withColumn('max_sales', max(col('Sales')).over(window_criteria))
    # Keep only the month(s) whose sales equal the regional maximum; months
    # tied for the top value are all retained.
    .filter(col('Sales') == col('max_sales'))
    .select('Region', 'Month', 'Sales')
    # Order by region so the displayed result does not depend on how the
    # partitions happen to be returned.
    .orderBy('Region')
    .display()
)

Region,Month,Sales
East,Feb,22000
North,Feb,15000
South,Feb,20000
West,Mar,17000
