Useful Links:

    https://medium.com/@sergey.ivanchuk/practical-pyspark-window-function-examples-cb5c7e1a3c41
    https://www.educba.com/pyspark-window-functions/
    https://knockdata.github.io/spark-window-function/
    https://sparkbyexamples.com/pyspark/pyspark-window-functions/
    https://www.geeksforgeeks.org/pyspark-window-functions/
    https://towardsdatascience.com/spark-sql-102-aggregations-and-window-functions-9f829eaa7549
    https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html
    https://medium.com/analytics-vidhya/solving-complex-big-data-problems-using-combinations-of-window-functions-deep-dive-in-pyspark-b1830eb00b7d  

In [1]:
# Runs before the pyspark imports, which come in the next cell.
# presumably this makes a local Spark installation importable - confirm against the findspark docs
import findspark
findspark.init()

In [2]:
from pyspark.sql.functions import unix_timestamp, from_unixtime,col, lit, udf, datediff, lead, explode,to_date
from pyspark.sql import SparkSession,Window,DataFrame
import datetime
from pyspark.sql.types import StringType,BooleanType,DateType,LongType,ArrayType
from typing import List
import pyspark.sql.functions as F
# Single session object shared by every cell below.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [3]:
# Sample purchase history, one record per line: (name, ISO date string, product, price).
shopping_data = [
    ('Alex', '2018-10-10', 'Paint', 80),
    ('Alex', '2018-04-02', 'Ladder', 20),
    ('Alex', '2018-06-22', 'Stool', 20),
    ('Alex', '2018-12-09', 'Vacuum', 40),
    ('Alex', '2018-07-12', 'Bucket', 5),
    ('Alex', '2018-02-18', 'Gloves', 5),
    ('Alex', '2018-03-03', 'Brushes', 30),
    ('Alex', '2018-09-26', 'Sandpaper', 10),
]

# The dates are entered as strings; cast them so later cells work on a real date column.
df = (
    spark.createDataFrame(shopping_data, ['name', 'date', 'product', 'price'])
    .withColumn('date', F.col('date').cast(DateType()))
)

In [9]:
# Only the Paint purchases, ordered by date. No action is called, so the cell
# just echoes the resulting DataFrame's schema.
df.where(F.col("product") == "Paint").orderBy("date")

DataFrame[name: string, date: date, product: string, price: bigint]

In [7]:
# Check the column types: after the cast, `date` is a date rather than a string.
df.printSchema()


root
 |-- name: string (nullable = true)
 |-- date: date (nullable = true)
 |-- product: string (nullable = true)
 |-- price: long (nullable = true)



In [8]:
# Display the sample rows; no ordering has been applied to the frame at this point.
df.show()

+----+----------+---------+-----+
|name|      date|  product|price|
+----+----------+---------+-----+
|Alex|2018-10-10|    Paint|   80|
|Alex|2018-04-02|   Ladder|   20|
|Alex|2018-06-22|    Stool|   20|
|Alex|2018-12-09|   Vacuum|   40|
|Alex|2018-07-12|   Bucket|    5|
|Alex|2018-02-18|   Gloves|    5|
|Alex|2018-03-03|  Brushes|   30|
|Alex|2018-09-26|Sandpaper|   10|
+----+----------+---------+-----+



In [29]:
# Base window: all purchases of one customer form a single partition.
w0 = Window.partitionBy('name')

# dense_rank: the most expensive purchase gets rank 1; equal prices share a
# rank and the next distinct price takes the very next rank (no gaps).
price_desc = F.col('price').desc()
df.withColumn("price_rank", F.dense_rank().over(w0.orderBy(price_desc))).show()

+----+----------+---------+-----+----------+
|name|      date|  product|price|price_rank|
+----+----------+---------+-----+----------+
|Alex|2018-10-10|    Paint|   80|         1|
|Alex|2018-12-09|   Vacuum|   40|         2|
|Alex|2018-03-03|  Brushes|   30|         3|
|Alex|2018-04-02|   Ladder|   20|         4|
|Alex|2018-06-22|    Stool|   20|         4|
|Alex|2018-09-26|Sandpaper|   10|         5|
|Alex|2018-07-12|   Bucket|    5|         6|
|Alex|2018-02-18|   Gloves|    5|         6|
+----+----------+---------+-----+----------+



In [30]:
# Same dense ranking, this time with the ordering built into the window definition.
w1 = (
    Window
    .partitionBy('name')
    .orderBy(F.desc('price'))
)

df.select('*', F.dense_rank().over(w1).alias("price_rank")).show()

+----+----------+---------+-----+----------+
|name|      date|  product|price|price_rank|
+----+----------+---------+-----+----------+
|Alex|2018-10-10|    Paint|   80|         1|
|Alex|2018-12-09|   Vacuum|   40|         2|
|Alex|2018-03-03|  Brushes|   30|         3|
|Alex|2018-04-02|   Ladder|   20|         4|
|Alex|2018-06-22|    Stool|   20|         4|
|Alex|2018-09-26|Sandpaper|   10|         5|
|Alex|2018-07-12|   Bucket|    5|         6|
|Alex|2018-02-18|   Gloves|    5|         6|
+----+----------+---------+-----+----------+



In [25]:
# rank: cheapest purchase first; equal prices share a rank, and the ranks that
# the tie occupied are skipped afterwards (1, 1, 3, ...).
cheapest_first = w0.orderBy(F.asc('price'))
df.withColumn("price_rank", F.rank().over(cheapest_first)).show()

+----+----------+---------+-----+----------+
|name|      date|  product|price|price_rank|
+----+----------+---------+-----+----------+
|Alex|2018-07-12|   Bucket|    5|         1|
|Alex|2018-02-18|   Gloves|    5|         1|
|Alex|2018-09-26|Sandpaper|   10|         3|
|Alex|2018-04-02|   Ladder|   20|         4|
|Alex|2018-06-22|    Stool|   20|         4|
|Alex|2018-03-03|  Brushes|   30|         6|
|Alex|2018-12-09|   Vacuum|   40|         7|
|Alex|2018-10-10|    Paint|   80|         8|
+----+----------+---------+-----+----------+



In [12]:
# ntile(4): order purchases from most to least expensive and split them into
# 4 buckets (bucket 1 holds the most expensive purchases, bucket 4 the cheapest).
price_bucket = F.ntile(4).over(w0.orderBy(F.col('price').desc()))
df.withColumn("price_rank", price_bucket).show()

+----+----------+---------+-----+----------+
|name|      date|  product|price|price_rank|
+----+----------+---------+-----+----------+
|Alex|2018-10-10|    Paint|   80|         1|
|Alex|2018-12-09|   Vacuum|   40|         1|
|Alex|2018-03-03|  Brushes|   30|         2|
|Alex|2018-04-02|   Ladder|   20|         2|
|Alex|2018-06-22|    Stool|   20|         3|
|Alex|2018-09-26|Sandpaper|   10|         3|
|Alex|2018-07-12|   Bucket|    5|         4|
|Alex|2018-02-18|   Gloves|    5|         4|
+----+----------+---------+-----+----------+



In [13]:
# percent_rank: relative position of each purchase, starting at 0.0 for the
# most expensive one and growing toward the cheapest; tied prices share a value.
relative_rank = F.percent_rank().over(w0.orderBy(F.desc('price')))
df.withColumn('price_rel_rank', relative_rank).show()


+----+----------+---------+-----+-------------------+
|name|      date|  product|price|     price_rel_rank|
+----+----------+---------+-----+-------------------+
|Alex|2018-10-10|    Paint|   80|                0.0|
|Alex|2018-12-09|   Vacuum|   40|0.14285714285714285|
|Alex|2018-03-03|  Brushes|   30| 0.2857142857142857|
|Alex|2018-04-02|   Ladder|   20|0.42857142857142855|
|Alex|2018-06-22|    Stool|   20|0.42857142857142855|
|Alex|2018-09-26|Sandpaper|   10| 0.7142857142857143|
|Alex|2018-07-12|   Bucket|    5| 0.8571428571428571|
|Alex|2018-02-18|   Gloves|    5| 0.8571428571428571|
+----+----------+---------+-----+-------------------+



In [14]:
# Row Item Difference - Lead and Lag

In [17]:
"""
    The two functions below, lag and lead, are probably the most abstract examples in this article and could be confusing at first. 
    The core concept here is essentially a subtraction between some row (e.g. current) and prior or future row(s). 
    For examples, from the table below we can say “ 13 = (2018–03–03) — (2018–02–18) “ — which is a difference of days between two dates.
"""

'\n    The two functions below, lag and lead, are probably the most abstract examples in this article and could be confusing at first. \n    The core concept here is essentially a subtraction between some row (e.g. current) and prior or future row(s). \n    For examples, from the table below we can say “ 13 = (2018–03–03) — (2018–02–18) “ — which is a difference of days between two dates.\n'

In [18]:
# Chronological view of each customer's purchases, shared by lag and lead.
purchases_by_date = w0.orderBy('date')
previous_date = F.lag('date', 1).over(purchases_by_date)
following_date = F.lead('date', 1).over(purchases_by_date)

# The earliest purchase has no previous row and the latest has no next row,
# so those two cells come out as null.
(
    df
    .withColumn('days_from_last_purchase', F.datediff('date', previous_date))
    .withColumn('days_before_next_purchase', F.datediff(following_date, 'date'))
    .show()
)

+----+----------+---------+-----+-----------------------+-------------------------+
|name|      date|  product|price|days_from_last_purchase|days_before_next_purchase|
+----+----------+---------+-----+-----------------------+-------------------------+
|Alex|2018-02-18|   Gloves|    5|                   null|                       13|
|Alex|2018-03-03|  Brushes|   30|                     13|                       30|
|Alex|2018-04-02|   Ladder|   20|                     30|                       81|
|Alex|2018-06-22|    Stool|   20|                     81|                       20|
|Alex|2018-07-12|   Bucket|    5|                     20|                       76|
|Alex|2018-09-26|Sandpaper|   10|                     76|                       14|
|Alex|2018-10-10|    Paint|   80|                     14|                       60|
|Alex|2018-12-09|   Vacuum|   40|                     60|                     null|
+----+----------+---------+-----+-----------------------+-------------------

In [21]:
## This fails: lag/lead are window functions, so they must be applied with .over(<window>)
# df.withColumn('days_from_last_purchase', F.datediff('date',F.lag('date',1)))\
#   .withColumn('days_before_next_purchase', F.datediff(F.lead('date',1),'date'))\
#   .show()

### Aggregations : Lists and Sets

    Collect the set of prices ever paid (no duplicates) and, for each price, the list of items bought at that price (duplicates allowed).

    For the sake of the example, a second purchase of Paint (`newRow` below) is added to the data set so that the output contains a duplicated item: the two Paint rows.

In [32]:
# Add a second Paint purchase so the aggregations below have a duplicate to show.
# The row is built with df's own schema so `date` stays a real date: a row
# created without a schema carries the date as a string, and union() (which
# matches columns by position) would then widen the whole `date` column to string.
newRow = spark.createDataFrame(
    [('Alex', datetime.date(2018, 10, 11), 'Paint', 80)],
    schema=df.schema,
)
df2 = df.union(newRow)

# WindowSpec.partitionBy() replaces the existing partitioning rather than
# extending it, so w0.partitionBy('price') would silently drop `name`.
# Both keys are named explicitly to keep the lists per customer and price.
per_name_and_price = Window.partitionBy('name', 'price')

(
    df2
    # collect_list keeps duplicates: every item bought at this price.
    .withColumn('items_by_price', F.collect_list('product').over(per_name_and_price))
    # collect_set removes duplicates: every distinct price this customer paid.
    .withColumn('all_prices', F.collect_set('price').over(w0))
    .show()
)
   


+----+----------+---------+-----+----------------+--------------------+
|name|      date|  product|price|  items_by_price|          all_prices|
+----+----------+---------+-----+----------------+--------------------+
|Alex|2018-07-12|   Bucket|    5|[Bucket, Gloves]|[30, 5, 20, 10, 4...|
|Alex|2018-02-18|   Gloves|    5|[Bucket, Gloves]|[30, 5, 20, 10, 4...|
|Alex|2018-09-26|Sandpaper|   10|     [Sandpaper]|[30, 5, 20, 10, 4...|
|Alex|2018-10-10|    Paint|   80|  [Paint, Paint]|[30, 5, 20, 10, 4...|
|Alex|2018-10-11|    Paint|   80|  [Paint, Paint]|[30, 5, 20, 10, 4...|
|Alex|2018-03-03|  Brushes|   30|       [Brushes]|[30, 5, 20, 10, 4...|
|Alex|2018-04-02|   Ladder|   20| [Ladder, Stool]|[30, 5, 20, 10, 4...|
|Alex|2018-06-22|    Stool|   20| [Ladder, Stool]|[30, 5, 20, 10, 4...|
|Alex|2018-12-09|   Vacuum|   40|        [Vacuum]|[30, 5, 20, 10, 4...|
+----+----------+---------+-----+----------------+--------------------+



In [33]:
# Distinct items bought at each price, one output row per (name, price).
# WindowSpec.partitionBy() replaces the partitioning instead of adding to it,
# so w0.partitionBy('price') would pool items across customers; partition on
# both keys explicitly.
per_name_and_price = Window.partitionBy('name', 'price')

(
    df2
    .withColumn('items', F.collect_set('product').over(per_name_and_price))
    # Use the column's real name (`price`); the capitalised spelling only
    # resolves while Spark's case-insensitive column matching is enabled.
    .select('name', 'price', 'items')
    # Every row of a partition carries the same set, so collapse the repeats.
    .distinct()
    .show()
)

+----+-----+----------------+
|name|Price|           items|
+----+-----+----------------+
|Alex|    5|[Bucket, Gloves]|
|Alex|   10|     [Sandpaper]|
|Alex|   80|         [Paint]|
|Alex|   30|       [Brushes]|
|Alex|   20| [Ladder, Stool]|
|Alex|   40|        [Vacuum]|
+----+-----+----------------+



### Average, Sum, Max, Max within Rows, Counts
    
    Below are 5 very common calculations in a single operation: avg + round, sum, max, max + rowsBetween,
    and count. They help us understand various purchasing behaviours of a profile.

In [34]:
# Running window: a customer's purchases in date order. The aggregates over it
# accumulate from the first purchase up to the current row.
w1 = w0.orderBy('date')

# Narrower frame: just the previous row and the current one.
last_two_rows = w1.rowsBetween(-1, Window.currentRow)

df.select(
    '*',
    F.round(F.avg('price').over(w1), 2).alias('avg_to_date'),
    F.sum('price').over(w1).alias('accumulating_sum'),
    F.max('price').over(w1).alias('max_to_date'),
    F.max('price').over(last_two_rows).alias('max_of_last2'),
    F.count('*').over(w1).alias('items_to_date'),
).show()

+----+----------+---------+-----+-----------+----------------+-----------+------------+-------------+
|name|      date|  product|price|avg_to_date|accumulating_sum|max_to_date|max_of_last2|items_to_date|
+----+----------+---------+-----+-----------+----------------+-----------+------------+-------------+
|Alex|2018-02-18|   Gloves|    5|        5.0|               5|          5|           5|            1|
|Alex|2018-03-03|  Brushes|   30|       17.5|              35|         30|          30|            2|
|Alex|2018-04-02|   Ladder|   20|      18.33|              55|         30|          30|            3|
|Alex|2018-06-22|    Stool|   20|      18.75|              75|         30|          20|            4|
|Alex|2018-07-12|   Bucket|    5|       16.0|              80|         30|          20|            5|
|Alex|2018-09-26|Sandpaper|   10|       15.0|              90|         30|          10|            6|
|Alex|2018-10-10|    Paint|   80|      24.29|             170|         80|        

### Some Complex Window Functions Problem

### Problem 1

In [10]:
# One row per (function, parameter set) together with the 0/1 result of that call.
# Named `function_rows` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the kernel session.
function_rows = [
    ['f1', 'a', 'b', 'c', 1],
    ['f1', 'b', 'd', 'm', 0],
    ['f2', 'a', 'b', 'c', 0],
    ['f2', 'b', 'd', 'm', 0],
    ['f3', 'a', 'b', 'c', 1],
    ['f3', 'b', 'd', 'm', 1],
    ['f4', 'a', 'b', 'c', 0],
    ['f4', 'b', 'd', 'm', 0],
]

# NOTE: this rebinds `df`, which held the shopping data in the earlier sections.
df = spark.createDataFrame(
    function_rows,
    ['function_name', 'param1', 'param2', 'param3', 'result'],
)
df.show()

+-------------+------+------+------+------+
|function_name|param1|param2|param3|result|
+-------------+------+------+------+------+
|           f1|     a|     b|     c|     1|
|           f1|     b|     d|     m|     0|
|           f2|     a|     b|     c|     0|
|           f2|     b|     d|     m|     0|
|           f3|     a|     b|     c|     1|
|           f3|     b|     d|     m|     1|
|           f4|     a|     b|     c|     0|
|           f4|     b|     d|     m|     0|
+-------------+------+------+------+------+



Output Required:

result_list |  function_name_lists
------------------------------------
    [1,0]   |   [f1]
    [0,0]   |   [f2,f4]
    [1,1]   |   [f3]

In [16]:
# Per-function window ordered by the parameters, so results are collected in a
# fixed order within each function.
w = Window.partitionBy("function_name").orderBy("param1", "param2", "param3")
# Same partition without an ordering: aggregates over it see the whole group.
w1 = Window.partitionBy("function_name")

# result_list grows row by row over the ordered window, so only the last row of
# each function holds the complete list. rowNumber and result3 (the highest
# row number in the partition) exist to locate that last row.
df1 = (
    df
    .withColumn("result_list", F.collect_list("result").over(w))
    .withColumn("rowNumber", F.row_number().over(w))
    .withColumn("result3", F.max("rowNumber").over(w1))
)


In [17]:
# Inspect the helper columns: the complete result_list sits on the row where rowNumber equals result3.
df1.show()

+-------------+------+------+------+------+-----------+---------+-------+
|function_name|param1|param2|param3|result|result_list|rowNumber|result3|
+-------------+------+------+------+------+-----------+---------+-------+
|           f2|     a|     b|     c|     0|        [0]|        1|      2|
|           f2|     b|     d|     m|     0|     [0, 0]|        2|      2|
|           f4|     a|     b|     c|     0|        [0]|        1|      2|
|           f4|     b|     d|     m|     0|     [0, 0]|        2|      2|
|           f1|     a|     b|     c|     1|        [1]|        1|      2|
|           f1|     b|     d|     m|     0|     [1, 0]|        2|      2|
|           f3|     a|     b|     c|     1|        [1]|        1|      2|
|           f3|     b|     d|     m|     1|     [1, 1]|        2|      2|
+-------------+------+------+------+------+-----------+---------+-------+



In [19]:
# Keep, per function, only the row whose list is complete (its row number is
# the partition's highest), and only the two columns the next step needs.
is_last_row = F.col("rowNumber") == F.col("result3")
df2 = df1.where(is_last_row).select("function_name", "result_list")
df2.show()

+-------------+-----------+
|function_name|result_list|
+-------------+-----------+
|           f2|     [0, 0]|
|           f4|     [0, 0]|
|           f1|     [1, 0]|
|           f3|     [1, 1]|
+-------------+-----------+



In [20]:
# Functions that produced the same sequence of results fall into one group.
df3 = (
    df2
    .groupBy("result_list")
    .agg(F.collect_list("function_name").alias("function_name_list"))
)

In [21]:
# Final answer: one row per distinct result sequence, listing the functions that share it.
df3.show()

+-----------+------------------+
|result_list|function_name_list|
+-----------+------------------+
|     [1, 0]|              [f1]|
|     [1, 1]|              [f3]|
|     [0, 0]|          [f2, f4]|
+-----------+------------------+



### Problem 2

#### Input:
![Untitled](https://s3-us-west-2.amazonaws.com/secure.notion-static.com/aec861e5-6496-4a0d-878a-0deeff55a0db/Untitled.png)
    

Output

![Untitled](https://s3-us-west-2.amazonaws.com/secure.notion-static.com/77514104-dccc-4116-99d4-f47db0cd947a/Untitled.png)