In [24]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("WindowExamples").getOrCreate()

emp = spark.read.csv("employee.csv", header=True, inferSchema=True)
orders = spark.read.csv("orders.csv", header=True, inferSchema=True)

emp.show(5)
orders.show(5)


+-----------+-----+----------+------+------------+-----------------+
|employee_id| name|department|salary|joining_date|performance_score|
+-----------+-----+----------+------+------------+-----------------+
|          1|Alice|     Sales| 72000|  2018-01-15|              3.8|
|          2|  Bob|     Sales| 68000|  2019-03-12|              4.1|
|          3|Carol|     Sales| 72000|  2020-06-05|              4.1|
|          4|David|        IT| 98000|  2017-11-01|              4.9|
|          5|  Eva|        IT|102000|  2018-12-24|              4.7|
+-----------+-----+----------+------+------------+-----------------+
only showing top 5 rows
+--------+--------+------+----------+------+
|order_id|customer|region|order_date|amount|
+--------+--------+------+----------+------+
|    1001|   Alice| North|2021-01-02| 250.0|
|    1002|     Bob| North|2021-01-05| 120.0|
|    1003|   Carol| South|2021-01-03| 300.0|
|    1004|   David|  East|2021-01-07| 450.0|
|    1005|     Eva|  East|2021-01-08| 20

In [3]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, rank, dense_rank, row_number, lag, lead, sum as _sum, avg as _avg
from pyspark.sql.functions import min as _min, max as _max
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import TimestampType


In [None]:
dept_salary_range = Window.partitionBy("department").orderBy(col("salary").desc(), col("joining_date").asc())

In [7]:
emp_rownum = emp.withColumn("row_num", row_number().over(dept_salary_range))
emp_rownum.select("employee_id","name","department","salary","row_num").orderBy("department","row_num").show(10)

+-----------+------+----------+------+-------+
|employee_id|  name|department|salary|row_num|
+-----------+------+----------+------+-------+
|         13|  Maya|   Finance|112000|      1|
|         15|Olivia|   Finance|112000|      2|
|         14|  Nate|   Finance|108000|      3|
|          8|  Hank|        HR| 66000|      1|
|         18|  Rita|        HR| 66000|      2|
|          7| Grace|        HR| 64000|      3|
|          9|   Ivy|        HR| 64000|      4|
|          5|   Eva|        IT|102000|      1|
|         17| Quinn|        IT|102000|      2|
|          4| David|        IT| 98000|      3|
+-----------+------+----------+------+-------+
only showing top 10 rows


In [10]:
emp_rank = emp.withColumn("rnk", rank().over(dept_salary_range)) \
              .withColumn("drnk", dense_rank().over(dept_salary_range))

emp_rank.select("employee_id","name","department","salary","rnk","drnk").orderBy("department","rnk").show(12)

+-----------+------+----------+------+---+----+
|employee_id|  name|department|salary|rnk|drnk|
+-----------+------+----------+------+---+----+
|         13|  Maya|   Finance|112000|  1|   1|
|         15|Olivia|   Finance|112000|  1|   1|
|         14|  Nate|   Finance|108000|  3|   2|
|          8|  Hank|        HR| 66000|  1|   1|
|         18|  Rita|        HR| 66000|  1|   1|
|          7| Grace|        HR| 64000|  3|   2|
|          9|   Ivy|        HR| 64000|  3|   2|
|          5|   Eva|        IT|102000|  1|   1|
|         17| Quinn|        IT|102000|  1|   1|
|          4| David|        IT| 98000|  3|   2|
|          6| Frank|        IT| 98000|  3|   2|
|         19|   Sam| Marketing| 92000|  1|   1|
+-----------+------+----------+------+---+----+
only showing top 12 rows


In [13]:
dept_dates = Window.partitionBy("department").orderBy(col("joining_date").asc())

emp_laglead = emp.withColumn("prev_salary", lag("salary", 1).over(dept_dates))\
                 .withColumn("next_salary", lead("salary", 1).over(dept_dates))

emp_laglead.select("name", "department", "joining_date", "salary", "prev_salary", "next_salary")\
           .orderBy("department", "joining_date").show(12)


+------+----------+------------+------+-----------+-----------+
|  name|department|joining_date|salary|prev_salary|next_salary|
+------+----------+------------+------+-----------+-----------+
|  Maya|   Finance|  2016-01-10|112000|       NULL|     108000|
|  Nate|   Finance|  2018-10-12|108000|     112000|     112000|
|Olivia|   Finance|  2019-11-19|112000|     108000|       NULL|
| Grace|        HR|  2016-08-30| 64000|       NULL|      66000|
|  Hank|        HR|  2018-05-20| 66000|      64000|      64000|
|   Ivy|        HR|  2020-02-14| 64000|      66000|      66000|
|  Rita|        HR|  2021-09-01| 66000|      64000|       NULL|
| David|        IT|  2017-11-01| 98000|       NULL|     102000|
|   Eva|        IT|  2018-12-24|102000|      98000|      98000|
| Frank|        IT|  2019-04-10| 98000|     102000|     102000|
| Quinn|        IT|  2021-06-18|102000|      98000|       NULL|
|   Sam| Marketing|  2016-05-05| 92000|       NULL|      89000|
+------+----------+------------+------+-

In [33]:
# cumulative salary sum per department ordered by joining date.

dept_wise_joining = Window.partitionBy("department").orderBy(col("joining_date").asc()).rowsBetween(Window.unboundedPreceding, Window.currentRow)

cumulative_salary = emp.withColumn("cumulative_salary", _sum("salary").over(dept_wise_joining))

cumulative_salary.select("employee_id","name","department","joining_date","salary","cumulative_salary").orderBy("department","joining_date").show()

+-----------+------+----------+------------+------+-----------------+
|employee_id|  name|department|joining_date|salary|cumulative_salary|
+-----------+------+----------+------------+------+-----------------+
|         13|  Maya|   Finance|  2016-01-10|112000|           112000|
|         14|  Nate|   Finance|  2018-10-12|108000|           220000|
|         15|Olivia|   Finance|  2019-11-19|112000|           332000|
|          7| Grace|        HR|  2016-08-30| 64000|            64000|
|          8|  Hank|        HR|  2018-05-20| 66000|           130000|
|          9|   Ivy|        HR|  2020-02-14| 64000|           194000|
|         18|  Rita|        HR|  2021-09-01| 66000|           260000|
|          4| David|        IT|  2017-11-01| 98000|            98000|
|          5|   Eva|        IT|  2018-12-24|102000|           200000|
|          6| Frank|        IT|  2019-04-10| 98000|           298000|
|         17| Quinn|        IT|  2021-06-18|102000|           400000|
|         19|   Sam|

In [71]:
from pyspark.sql.functions import round
mov_avg = Window.partitionBy("department").orderBy("joining_date").rowsBetween(-1,1)

emp_mov_avg = emp.withColumn("moving_average_performance", round((_avg("performance_score")).over(mov_avg), 2))

emp_mov_avg.select("name","department","joining_date","performance_score","moving_average_performance")\
        .orderBy("department","joining_date").show(12)


+------+----------+------------+-----------------+--------------------------+
|  name|department|joining_date|performance_score|moving_average_performance|
+------+----------+------------+-----------------+--------------------------+
|  Maya|   Finance|  2016-01-10|              4.8|                       4.5|
|  Nate|   Finance|  2018-10-12|              4.2|                       4.6|
|Olivia|   Finance|  2019-11-19|              4.8|                       4.5|
| Grace|        HR|  2016-08-30|              3.2|                       3.4|
|  Hank|        HR|  2018-05-20|              3.6|                      3.57|
|   Ivy|        HR|  2020-02-14|              3.9|                      3.67|
|  Rita|        HR|  2021-09-01|              3.5|                       3.7|
| David|        IT|  2017-11-01|              4.9|                       4.8|
|   Eva|        IT|  2018-12-24|              4.7|                       4.7|
| Frank|        IT|  2019-04-10|              4.5|              

In [83]:

orders_ts = orders.withColumn("order_ts", unix_timestamp(col("order_date").cast("timestamp")))

region_time = Window.partitionBy("region").orderBy("order_ts").rangeBetween(-7*24*3600, 0)
orders_7d = orders_ts.withColumn("sum_7d", _sum("amount").over(region_time))
orders_7d.select("order_id","region","order_date","amount","sum_7d").orderBy("region","order_date").show(12)


+--------+------+----------+------+------+
|order_id|region|order_date|amount|sum_7d|
+--------+------+----------+------+------+
|    1004|  East|2021-01-07| 450.0| 450.0|
|    1005|  East|2021-01-08| 200.0| 650.0|
|    1010|  East|2021-01-11| 700.0|1350.0|
|    1014|  East|2021-01-15| 250.0|1150.0|
|    1018|  East|2021-01-19| 130.0| 380.0|
|    1001| North|2021-01-02| 250.0| 250.0|
|    1002| North|2021-01-05| 120.0| 370.0|
|    1007| North|2021-01-09| 320.0| 690.0|
|    1012| North|2021-01-13|  90.0| 410.0|
|    1016| North|2021-01-17|  75.0| 165.0|
|    1020| North|2021-01-21| 310.0| 385.0|
|    1003| South|2021-01-03| 300.0| 300.0|
+--------+------+----------+------+------+
only showing top 12 rows


In [73]:
orders.show(4)

+--------+--------+------+----------+------+
|order_id|customer|region|order_date|amount|
+--------+--------+------+----------+------+
|    1001|   Alice| North|2021-01-02| 250.0|
|    1002|     Bob| North|2021-01-05| 120.0|
|    1003|   Carol| South|2021-01-03| 300.0|
|    1004|   David|  East|2021-01-07| 450.0|
+--------+--------+------+----------+------+
only showing top 4 rows


In [80]:
# Top 2 highest-paid employees per department

dept_wise = Window.partitionBy("department").orderBy(col("salary").desc())

highest_paid = emp.withColumn("highest_paid", row_number().over(dept_wise))

highest_paid.select("name","department","salary","highest_paid").show()

+------+----------+------+------------+
|  name|department|salary|highest_paid|
+------+----------+------+------------+
|  Maya|   Finance|112000|           1|
|Olivia|   Finance|112000|           2|
|  Nate|   Finance|108000|           3|
|  Hank|        HR| 66000|           1|
|  Rita|        HR| 66000|           2|
| Grace|        HR| 64000|           3|
|   Ivy|        HR| 64000|           4|
|   Eva|        IT|102000|           1|
| Quinn|        IT|102000|           2|
| David|        IT| 98000|           3|
| Frank|        IT| 98000|           4|
|   Sam| Marketing| 92000|           1|
| Karen| Marketing| 89000|           2|
|  Jack| Marketing| 87000|           3|
|   Leo| Marketing| 87000|           4|
|  Tara|     Sales| 75000|           1|
| Alice|     Sales| 72000|           2|
| Carol|     Sales| 72000|           3|
|   Bob|     Sales| 68000|           4|
|  Pete|     Sales| 68000|           5|
+------+----------+------+------------+

