# Tech Mahindra - PySpark Interview Question

Imagine you're analyzing the monthly sales performance of a company across different regions. You want to calculate:
* The cumulative sales for each region over months.
* The rank of each month based on sales within the same region.

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

In [0]:
data = [ ("East", "Jan", 200), ("East", "Feb", 300), 
("East", "Mar", 250), ("West", "Jan", 400), 
("West", "Feb", 350), ("West", "Mar", 450) ]

 # Define schema and create DataFrame 

columns = ["Region", "Month", "Sales"]

In [0]:
df_sales = spark.createDataFrame(data, columns)
df_sales.display()

Region,Month,Sales
East,Jan,200
East,Feb,300
East,Mar,250
West,Jan,400
West,Feb,350
West,Mar,450


In [0]:
partition_rnk = Window.partitionBy(col('Region')).orderBy(col('Sales').asc())
partition_cum_sum = Window.partitionBy('Region').orderBy(col('Sales')).rowsBetween(Window.unboundedPreceding, 0)
(
    df_sales.withColumns({
                'rank': rank().over(partition_rnk)
                , 'cum': sum(col('Sales')).over(partition_cum_sum)
            })
            .display()
)

Region,Month,Sales,rank,cum
East,Jan,200,1,200
East,Mar,250,2,450
East,Feb,300,3,750
West,Feb,350,1,350
West,Jan,400,2,750
West,Mar,450,3,1200
