# Spark Window Functions
Spark window functions operate on a group of rows (such as a frame or a partition) and return a single value for every input row. Spark SQL supports three kinds of window functions:

- ranking functions
- analytic functions
- aggregate functions

In [6]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the notebook's SparkSession under a fixed application name;
# getOrCreate() hands back an already-active session when one exists.
session_builder = SparkSession.builder.appName("example-window-functios")
spark = session_builder.getOrCreate()

In [6]:

# Sample employee rows as (employee_name, department, salary).
# The ("James", "Sales", 3000) row is present twice and several salaries tie,
# which is what makes the ranking functions below differ from one another.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
)

columns = ["employee_name", "department", "salary"]

# Column types are inferred from the Python values; only the names are supplied.
df = spark.createDataFrame(simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



## PySpark Window Ranking functions
### row_number Window Function

In [9]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Window shared by the ranking and analytic cells that follow:
# one partition per department, rows ordered by ascending salary.
windowSpec = (
    Window
    .partitionBy("department")
    .orderBy("salary")
)

# row_number() numbers the rows 1..n inside each partition; rows with equal
# salary still receive distinct numbers.
(
    df
    .withColumn("row_number", row_number().over(windowSpec))
    .show(truncate=False)
)


+-------------+----------+------+----------+
|employee_name|department|salary|row_number|
+-------------+----------+------+----------+
|James        |Sales     |3000  |1         |
|James        |Sales     |3000  |2         |
|Robert       |Sales     |4100  |3         |
|Saif         |Sales     |4100  |4         |
|Michael      |Sales     |4600  |5         |
|Maria        |Finance   |3000  |1         |
|Scott        |Finance   |3300  |2         |
|Jen          |Finance   |3900  |3         |
|Kumar        |Marketing |2000  |1         |
|Jeff         |Marketing |3000  |2         |
+-------------+----------+------+----------+



### rank Window Function

In [10]:
from pyspark.sql.functions import rank

# rank(): rows with equal salary share a rank and the next rank is skipped
# (1, 1, 3, ...).
(
    df
    .withColumn("rank", rank().over(windowSpec))
    .show()
)


+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        James|     Sales|  3000|   1|
|        James|     Sales|  3000|   1|
|       Robert|     Sales|  4100|   3|
|         Saif|     Sales|  4100|   3|
|      Michael|     Sales|  4600|   5|
|        Maria|   Finance|  3000|   1|
|        Scott|   Finance|  3300|   2|
|          Jen|   Finance|  3900|   3|
|        Kumar| Marketing|  2000|   1|
|         Jeff| Marketing|  3000|   2|
+-------------+----------+------+----+



### dense_rank Window Function

In [11]:
from pyspark.sql.functions import dense_rank

# dense_rank(): rows with equal salary share a rank and no rank is skipped
# (1, 1, 2, ...).
(
    df
    .withColumn("dense_rank", dense_rank().over(windowSpec))
    .show()
)


+-------------+----------+------+----------+
|employee_name|department|salary|dense_rank|
+-------------+----------+------+----------+
|        James|     Sales|  3000|         1|
|        James|     Sales|  3000|         1|
|       Robert|     Sales|  4100|         2|
|         Saif|     Sales|  4100|         2|
|      Michael|     Sales|  4600|         3|
|        Maria|   Finance|  3000|         1|
|        Scott|   Finance|  3300|         2|
|          Jen|   Finance|  3900|         3|
|        Kumar| Marketing|  2000|         1|
|         Jeff| Marketing|  3000|         2|
+-------------+----------+------+----------+



### percent_rank Window Function

In [12]:
from pyspark.sql.functions import percent_rank

# percent_rank(): (rank - 1) / (rows in partition - 1), so the lowest salary in
# each department gets 0.0 and the highest gets 1.0.
(
    df
    .withColumn("percent_rank", percent_rank().over(windowSpec))
    .show()
)


+-------------+----------+------+------------+
|employee_name|department|salary|percent_rank|
+-------------+----------+------+------------+
|        James|     Sales|  3000|         0.0|
|        James|     Sales|  3000|         0.0|
|       Robert|     Sales|  4100|         0.5|
|         Saif|     Sales|  4100|         0.5|
|      Michael|     Sales|  4600|         1.0|
|        Maria|   Finance|  3000|         0.0|
|        Scott|   Finance|  3300|         0.5|
|          Jen|   Finance|  3900|         1.0|
|        Kumar| Marketing|  2000|         0.0|
|         Jeff| Marketing|  3000|         1.0|
+-------------+----------+------+------------+



### ntile Window Function

In [13]:
from pyspark.sql.functions import ntile

# ntile(2): split each salary-ordered partition into two buckets; when the row
# count is odd the first bucket receives the extra row.
(
    df
    .withColumn("ntile", ntile(2).over(windowSpec))
    .show()
)


+-------------+----------+------+-----+
|employee_name|department|salary|ntile|
+-------------+----------+------+-----+
|        James|     Sales|  3000|    1|
|        James|     Sales|  3000|    1|
|       Robert|     Sales|  4100|    1|
|         Saif|     Sales|  4100|    2|
|      Michael|     Sales|  4600|    2|
|        Maria|   Finance|  3000|    1|
|        Scott|   Finance|  3300|    1|
|          Jen|   Finance|  3900|    2|
|        Kumar| Marketing|  2000|    1|
|         Jeff| Marketing|  3000|    2|
+-------------+----------+------+-----+



## PySpark Window Analytic functions
### cume_dist Window Function

In [14]:
""" cume_dist """
from pyspark.sql.functions import cume_dist

# cume_dist(): fraction of the partition's rows whose salary is less than or
# equal to the current row's salary.
(
    df
    .withColumn("cume_dist", cume_dist().over(windowSpec))
    .show()
)


+-------------+----------+------+------------------+
|employee_name|department|salary|         cume_dist|
+-------------+----------+------+------------------+
|        James|     Sales|  3000|               0.4|
|        James|     Sales|  3000|               0.4|
|       Robert|     Sales|  4100|               0.8|
|         Saif|     Sales|  4100|               0.8|
|      Michael|     Sales|  4600|               1.0|
|        Maria|   Finance|  3000|0.3333333333333333|
|        Scott|   Finance|  3300|0.6666666666666666|
|          Jen|   Finance|  3900|               1.0|
|        Kumar| Marketing|  2000|               0.5|
|         Jeff| Marketing|  3000|               1.0|
+-------------+----------+------+------------------+



### lag Window Function

In [15]:

"""lag"""
from pyspark.sql.functions import lag

# lag("salary", 2): salary of the row two positions before the current one in
# the salary-ordered department window; null when fewer than two rows precede it.
(
    df
    .withColumn("lag", lag("salary", 2).over(windowSpec))
    .show()
)


+-------------+----------+------+----+
|employee_name|department|salary| lag|
+-------------+----------+------+----+
|        James|     Sales|  3000|null|
|        James|     Sales|  3000|null|
|       Robert|     Sales|  4100|3000|
|         Saif|     Sales|  4100|3000|
|      Michael|     Sales|  4600|4100|
|        Maria|   Finance|  3000|null|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|3000|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
+-------------+----------+------+----+



### lead Window Function

In [16]:

 """lead"""
from pyspark.sql.functions import lead    
df.withColumn("lead",lead("salary",2).over(windowSpec)) \
    .show()


+-------------+----------+------+----+
|employee_name|department|salary|lead|
+-------------+----------+------+----+
|        James|     Sales|  3000|4100|
|        James|     Sales|  3000|4100|
|       Robert|     Sales|  4100|4600|
|         Saif|     Sales|  4100|null|
|      Michael|     Sales|  4600|null|
|        Maria|   Finance|  3000|3900|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|null|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
+-------------+----------+------+----+



## PySpark Window Aggregate Functions

In [17]:
from pyspark.sql.functions import col,avg,sum,min,max,row_number 

# Unordered per-department window: each aggregate below is computed over the
# whole partition.
windowSpecAgg = Window.partitionBy("department")

# avg/sum/min/max here are the pyspark.sql.functions imported in this cell, not
# the Python builtins. Every row receives its department-wide aggregates; the
# row number (taken over the salary-ordered `windowSpec` defined in the ranking
# section) is then used to keep exactly one row per department.
(
    df
    .withColumn("row", row_number().over(windowSpec))
    .withColumn("avg", avg("salary").over(windowSpecAgg))
    .withColumn("sum", sum("salary").over(windowSpecAgg))
    .withColumn("min", min("salary").over(windowSpecAgg))
    .withColumn("max", max("salary").over(windowSpecAgg))
    .filter(col("row") == 1)
    .select("department", "avg", "sum", "min", "max")
    .show()
)


+----------+------+-----+----+----+
|department|   avg|  sum| min| max|
+----------+------+-----+----+----+
|     Sales|3760.0|18800|3000|4600|
|   Finance|3400.0|10200|3000|3900|
| Marketing|2500.0| 5000|2000|3000|
+----------+------+-----+----+----+



In [19]:
# Purchases as (Country, CustomerID, PurchaseDate, Amount); every field,
# including Amount, is a string.
data_list = [
    ("US", "536365", "2021-05-15", "600"),
    ("US", "536365", "2021-05-17", "500"),
    ("US", "536366", "2021-05-14", "200"),
    ("IN", "536367", "2021-05-16", "600"),
    ("IN", "536367", "2021-05-20", "800"),
]

# NOTE: this cell rebinds `df` and `windowSpec`, replacing the employee
# DataFrame and the department window used by the cells above.
df = spark.createDataFrame(data_list).toDF("Country", "CustomerID", "PurchaseDate", "Amount")

# Running frame per country: every row from the start of the partition up to and
# including the current row, in PurchaseDate order (ISO yyyy-MM-dd strings sort
# chronologically).
windowSpec = (
    Window
    .partitionBy("Country")
    .orderBy("PurchaseDate")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Amount is stored as a string, so cast it explicitly before summing instead of
# relying on Spark's implicit string-to-double coercion inside sum().
df.withColumn(
    "CumulativePurchase",
    sum(col("Amount").cast("double")).over(windowSpec),
).show()

+-------+----------+------------+------+------------------+
|Country|CustomerID|PurchaseDate|Amount|CumulativePurchase|
+-------+----------+------------+------+------------------+
|     US|    536366|  2021-05-14|   200|             200.0|
|     US|    536365|  2021-05-15|   600|             800.0|
|     US|    536365|  2021-05-17|   500|            1300.0|
|     IN|    536367|  2021-05-16|   600|             600.0|
|     IN|    536367|  2021-05-20|   800|            1400.0|
+-------+----------+------------+------+------------------+



In [24]:
# Weekly quantities as (Country, Week, Quantity); all three fields are strings.
data_list = [
    ("Germany", "48", "10"),
    ("Germany", "49", "5"),
    ("Germany", "50", "3"),
    ("Germany", "51", "2"),
    ("United Kingdom", "48", "2"),
    ("United Kingdom", "49", "2"),
]

# createDataFrame infers the column types; toDF supplies the column names.
unnamed_rows = spark.createDataFrame(data_list)
df = unnamed_rows.toDF("Country", "Week", "Quantity")



In [25]:
# Running total per country: the frame spans every row from the start of the
# partition up to and including the current row, so despite the "3WeekTotal"
# label this accumulates ALL earlier weeks (compare the 3-row frame used in the
# next cell).
# Week is stored as a string; order by its integer value so that week numbers
# of different widths (e.g. "9" vs "10") would still sort numerically.
running_total_window = (
    Window
    .partitionBy("Country")
    .orderBy(col("Week").cast("int"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Quantity is a string column: cast explicitly rather than relying on implicit
# string-to-double coercion inside sum().
df.withColumn(
    "3WeekTotal",
    sum(col("Quantity").cast("double")).over(running_total_window),
).show()

+--------------+----+--------+----------+
|       Country|Week|Quantity|3WeekTotal|
+--------------+----+--------+----------+
|       Germany|  48|      10|      10.0|
|       Germany|  49|       5|      15.0|
|       Germany|  50|       3|      18.0|
|       Germany|  51|       2|      20.0|
|United Kingdom|  48|       2|       2.0|
|United Kingdom|  49|       2|       4.0|
+--------------+----+--------+----------+



In [31]:
# Sliding 3-row frame per country: the current row plus the two rows before it.
# rowsBetween counts rows, not week numbers, so this equals a true 3-week total
# only while each country has one row per consecutive week, as in this data.
# Week is stored as a string; order by its integer value so that week numbers
# of different widths (e.g. "9" vs "10") would still sort numerically.
running_total_window = (
    Window
    .partitionBy("Country")
    .orderBy(col("Week").cast("int"))
    .rowsBetween(-2, Window.currentRow)
)

# Quantity is a string column: cast explicitly rather than relying on implicit
# string-to-double coercion inside sum().
df.withColumn(
    "3WeekTotal",
    sum(col("Quantity").cast("double")).over(running_total_window),
).show()

+--------------+----+--------+----------+
|       Country|Week|Quantity|3WeekTotal|
+--------------+----+--------+----------+
|       Germany|  48|      10|      10.0|
|       Germany|  49|       5|      15.0|
|       Germany|  50|       3|      18.0|
|       Germany|  51|       2|      10.0|
|United Kingdom|  48|       2|       2.0|
|United Kingdom|  49|       2|       4.0|
+--------------+----+--------+----------+



In [3]:
# One row per person as (Name, Department, Score), where Score is a list of
# integers.
data_list = [
    ("Alma", "D0", [100]),
    ("Galma", "D1", [300, 250, 100]),
    ("Salma", "D1", [350, 100]),
    ("Dalma", "D1", [400, 100]),
    ("Jalma", "D2", [250]),
    ("Nalma", "D2", [500, 300, 100]),
    ("Lalma", "D3", [300, 100]),
]

# createDataFrame infers the column types; toDF supplies the column names.
unnamed_rows = spark.createDataFrame(data_list)
df = unnamed_rows.toDF("Name", "Department", "Score")
df.show()


+-----+----------+---------------+
| Name|Department|          Score|
+-----+----------+---------------+
| Alma|        D0|          [100]|
|Galma|        D1|[300, 250, 100]|
|Salma|        D1|     [350, 100]|
|Dalma|        D1|     [400, 100]|
|Jalma|        D2|          [250]|
|Nalma|        D2|[500, 300, 100]|
|Lalma|        D3|     [300, 100]|
+-----+----------+---------------+



In [10]:
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank


# Per-department window ordered by descending score, with an explicit frame
# running from the start of the partition to the current row.
windowSpec = (
    Window
    .partitionBy("Department")
    .orderBy(col("Score").desc())
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Explode the Score arrays to one row per individual score, rank the scores
# inside each department, and keep the top-ranked row(s); dense_rank keeps every
# row that ties for the highest score.
# The rank filter is applied BEFORE the projection: the original filtered on
# `rank` after select() had already dropped that column, which only works
# because Spark's analyzer re-resolves the missing column from the child plan.
(
    df
    .withColumn("Score", explode(col("Score")))
    .withColumn("rank", dense_rank().over(windowSpec))
    .where(col("rank") == 1)
    .select("Department", "Name", "Score")
    .orderBy("Department")
    .show()
)

+----------+-----+-----+
|Department| Name|Score|
+----------+-----+-----+
|        D0| Alma|  100|
|        D1|Dalma|  400|
|        D2|Nalma|  500|
|        D3|Lalma|  300|
+----------+-----+-----+

