In [3]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F
import pyspark.sql.window as W

In [2]:
spark = SparkSession \
    .builder \
    .appName("Spark Training - DF APIs") \
    .getOrCreate()

## 1. Windows Functions
- Operate on a group of rows and return a single value for each input row
- Main package: pyspark.sql.window which has two classes - Window and WindowSpec
- Window class has APIs such as partitionBy, orderBy, rengeBetween, rowsBetween
- WindowSpec class defines the partitioning, ordering and frame boundaries. It has also above 4 APIs
- When you apply these APIs (e.g. partitionBy), a WindowSpec object will be created. Then you can apply other functions on the spec object to get the output

#### How to perform a window function
1. First partition data using Window.partitionBy

#### Typs of window functions:
- Ranking
- Analytical
- Aggregate

In [76]:
# Prepare input data
data = [
    ('James', 'Sales', 'NY', 900, 34),
    ('Alicia', 'Sales', 'NY', 8600, 56),
    ('Robert', 'Sales', 'CA', 8100, 30),
    ('John', 'Sales', 'AZ', 8600, 31),
    ('Rose', 'Sales', 'AZ', 8100, 33),
    ('Lisa', 'Finance', 'CA', 9000, 24),
    ('Deja', 'Finance', 'CA', 9900, 40),
    ('Sugie', 'Finance', 'NY', 8300, 36),
    ('Ram', 'Finance', 'NY', 7900, 53),
    ('Kyle', 'Marketing', 'CA', 8000, 25),
    ('Reid', 'Marketing', 'NY', 9100, 50)
]

schema = ['empname', 'dept', 'state', 'salary', 'age']
df = spark.createDataFrame(data=data, schema=schema)

df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|   900| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   John|    Sales|   AZ|  8600| 31|
|   Rose|    Sales|   AZ|  8100| 33|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



#### 1.1 Ranking Functions
Used to provide a ranking to the result within a partition
- `row_number():` creates sequential Row numbers
- `rank():` creates Ranks but gaps when ties are present
- `dense_rank():` creates ranks without gaps
- `percent_rank():` creates relative rank (i.e. percentile) of rows within a window partition. First row is always 0 and last row is always 1
- `ntile():` returns the ntile group id (from 1 to n inclusive) in an ordered window partition. e.g. if n = 4, the first quarter of the rows will get value 1, the second quarter will get 2, the third quarter will get 3, the last quarter will get 4
- `cume_dist():` Returns the cumulative distribution of values within a window partition i.e. the fraction of rows that are below the current row


`Note:` ordering is required

Task 1: Create a ranking of the salaries within each department

In [26]:
# 1. first create a sa windowSpec Object, with salary sorted in the order you want to rank
spec = W.Window.partitionBy('dept').orderBy(df.salary.desc())
spec

<pyspark.sql.window.WindowSpec at 0x163930340>

In [34]:
# 2. then apply the ranking functions. Remember ntile(n) takes one argument
df.select('dept','salary')\
    .withColumn('row_number_rank', F.row_number().over(spec))\
    .withColumn('rank', F.rank().over(spec))\
    .withColumn('dense_rank', F.dense_rank().over(spec))\
    .withColumn('percent_rank', F.percent_rank().over(spec))\
    .withColumn('ntile_rank', F.ntile(4).over(spec))\
    .withColumn('cume_dist_rank', F.cume_dist().over(spec))\
    .show()

+---------+------+---------------+----+----------+------------------+----------+--------------+
|     dept|salary|row_number_rank|rank|dense_rank|      percent_rank|ntile_rank|cume_dist_rank|
+---------+------+---------------+----+----------+------------------+----------+--------------+
|    Sales|  8600|              1|   1|         1|               0.0|         1|           0.4|
|    Sales|  8600|              2|   1|         1|               0.0|         1|           0.4|
|    Sales|  8100|              3|   3|         2|               0.5|         2|           0.8|
|    Sales|  8100|              4|   3|         2|               0.5|         3|           0.8|
|    Sales|   900|              5|   5|         3|               1.0|         4|           1.0|
|  Finance|  9900|              1|   1|         1|               0.0|         1|          0.25|
|  Finance|  9000|              2|   2|         2|0.3333333333333333|         2|           0.5|
|  Finance|  8300|              3|   3| 

- Notice ranks 1 and 2 are for the same salary amount. 
- row_number() doesn't check for duplicates, hence is much faster
- other rank methods do check for duplicates and are thus slower. They would assign the same ranks to duplicates

### 1.2 Analytical Window Functions
- lag(colName,offsetVal, valueToReturnWhereNoOffsetExist): Returns offset row value before the current row value. 
- lead(): Returns offset row value after the current row value

`Note:` ordering is required

In [35]:
df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|   900| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   John|    Sales|   AZ|  8600| 31|
|   Rose|    Sales|   AZ|  8100| 33|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



In [36]:
# as usual, start off by creating a spec i.e. a partition
spec = W.Window.partitionBy('dept').orderBy('salary')

In [38]:
df.select(df.dept, df.salary)\
    .withColumn('lag_prev_sal', F.lag('salary', 1, 0).over(spec))\
    .withColumn('lead_prev_sal', F.lead('salary', 1, 0).over(spec))\
    .show()

+---------+------+------------+-------------+
|     dept|salary|lag_prev_sal|lead_prev_sal|
+---------+------+------------+-------------+
|    Sales|   900|           0|         8100|
|    Sales|  8100|         900|         8100|
|    Sales|  8100|        8100|         8600|
|    Sales|  8600|        8100|         8600|
|    Sales|  8600|        8600|            0|
|  Finance|  7900|           0|         8300|
|  Finance|  8300|        7900|         9000|
|  Finance|  9000|        8300|         9900|
|  Finance|  9900|        9000|            0|
|Marketing|  8000|           0|         9100|
|Marketing|  9100|        8000|            0|
+---------+------+------------+-------------+



`lag arguments and results explained:`
- we are taking the lag1 (offset = 1) of salary, and where a lag doesn't exist (e.g. the case for all first values within a partition) we use the value 0
- similar for lead

### 1.3 Aggregate Window Functions
- We can apply these aggregate functions on the windows functions
- For these ones we dont need to apply any orderby, as this works on a group of records where ordering is not required

In [39]:
df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|   900| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   John|    Sales|   AZ|  8600| 31|
|   Rose|    Sales|   AZ|  8100| 33|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



In [41]:
# create a spec
spec = W.Window.partitionBy('dept')

df.select('dept', 'salary')\
    .withColumn('sum_sal_per_dept', F.sum('salary').over(spec))\
    .show()

+---------+------+----------------+
|     dept|salary|sum_sal_per_dept|
+---------+------+----------------+
|    Sales|   900|           34300|
|    Sales|  8600|           34300|
|    Sales|  8100|           34300|
|    Sales|  8600|           34300|
|    Sales|  8100|           34300|
|  Finance|  9000|           35100|
|  Finance|  9900|           35100|
|  Finance|  8300|           35100|
|  Finance|  7900|           35100|
|Marketing|  8000|           17100|
|Marketing|  9100|           17100|
+---------+------+----------------+



Let's get the highest & lowest salary in each department

In [46]:
spec = W.Window.partitionBy('dept').orderBy(df.salary.desc())

df.select('dept', 'salary')\
    .withColumn('highest_salary', F.first('salary').over(spec))\
    .withColumn('lowest_salary', F.last('salary').over(spec))\
    .show()

+---------+------+--------------+-------------+
|     dept|salary|highest_salary|lowest_salary|
+---------+------+--------------+-------------+
|    Sales|  8600|          8600|         8600|
|    Sales|  8600|          8600|         8600|
|    Sales|  8100|          8600|         8100|
|    Sales|  8100|          8600|         8100|
|    Sales|   900|          8600|          900|
|  Finance|  9900|          9900|         9900|
|  Finance|  9000|          9900|         9000|
|  Finance|  8300|          9900|         8300|
|  Finance|  7900|          9900|         7900|
|Marketing|  9100|          9100|         9100|
|Marketing|  8000|          9100|         8000|
+---------+------+--------------+-------------+



### 1.4 rangeBetween() and rowBetween()

`rangeBetween()`
- Takes two arguments (start, end) to define frame boundaries
- Default: unboundedPreceding and unboundedFollowing
- Both `start` and `end` are relative from the current row. e.g. 0 = current row, -1= one less than the current row, 5= 5 after the current row
- Recommended to use: `Window.unboundedPreceding`, `Window.unboundedFollowing`, `Window.currentRow` rather than using inegral values directly

`rowsBetween()`
- similar notes to above

#### Example 1: from unbounded Preceeding to unbounded Following
- Aggregates from the first row of the partition to the last row of partition
- Same results returned for both rangBetween() and rowsBetween()

In [62]:
spec_rangebtw = W.Window.partitionBy('dept')\
                    .orderBy('salary')\
                    .rangeBetween(W.Window.unboundedPreceding, W.Window.unboundedFollowing)

spec2_rowsbtw = W.Window.partitionBy('dept')\
                    .orderBy('salary')\
                    .rowsBetween(W.Window.unboundedPreceding, W.Window.unboundedFollowing)

In [68]:
df.select(df.dept, df.salary).withColumn('sum_sal', F.sum('salary').over(spec_rangebtw)).show()

+---------+------+-------+
|     dept|salary|sum_sal|
+---------+------+-------+
|    Sales|   900|  34300|
|    Sales|  8100|  34300|
|    Sales|  8100|  34300|
|    Sales|  8600|  34300|
|    Sales|  8600|  34300|
|  Finance|  7900|  35100|
|  Finance|  8300|  35100|
|  Finance|  9000|  35100|
|  Finance|  9900|  35100|
|Marketing|  8000|  17100|
|Marketing|  9100|  17100|
+---------+------+-------+



In [67]:
df.select(df.dept, df.salary).withColumn('sum_sal', F.sum('salary').over(spec2_rowsbtw)).show()

+---------+------+-------+
|     dept|salary|sum_sal|
+---------+------+-------+
|    Sales|   900|  34300|
|    Sales|  8100|  34300|
|    Sales|  8100|  34300|
|    Sales|  8600|  34300|
|    Sales|  8600|  34300|
|  Finance|  7900|  35100|
|  Finance|  8300|  35100|
|  Finance|  9000|  35100|
|  Finance|  9900|  35100|
|Marketing|  8000|  17100|
|Marketing|  9100|  17100|
+---------+------+-------+



#### Example 2: from `currentRow` to `unboundedFollowing`
- This aggregates from the current row values up to the last row values in the partition
- similar results produced for rangeBetween() and rowsBetween()

In [77]:
# create specs
spec_rangebtw = W.Window.partitionBy('dept')\
                    .orderBy('salary')\
                    .rangeBetween(W.Window.currentRow, W.Window.unboundedFollowing)

spec2_rowsbtw = W.Window.partitionBy('dept')\
                    .orderBy('salary')\
                    .rowsBetween(W.Window.currentRow, W.Window.unboundedFollowing)

In [78]:
df.select(df.dept, df.salary).withColumn('sum_sal', F.sum('salary').over(spec_rangebtw)).show()

+---------+------+-------+
|     dept|salary|sum_sal|
+---------+------+-------+
|    Sales|   900|  34300|
|    Sales|  8100|  33400|
|    Sales|  8100|  33400|
|    Sales|  8600|  17200|
|    Sales|  8600|  17200|
|  Finance|  7900|  35100|
|  Finance|  8300|  27200|
|  Finance|  9000|  18900|
|  Finance|  9900|   9900|
|Marketing|  8000|  17100|
|Marketing|  9100|   9100|
+---------+------+-------+



In [79]:
df.select(df.dept, df.salary).withColumn('sum_sal', F.sum('salary').over(spec2_rowsbtw)).show()

+---------+------+-------+
|     dept|salary|sum_sal|
+---------+------+-------+
|    Sales|   900|  34300|
|    Sales|  8100|  33400|
|    Sales|  8100|  25300|
|    Sales|  8600|  17200|
|    Sales|  8600|   8600|
|  Finance|  7900|  35100|
|  Finance|  8300|  27200|
|  Finance|  9000|  18900|
|  Finance|  9900|   9900|
|Marketing|  8000|  17100|
|Marketing|  9100|   9100|
+---------+------+-------+



Notice:
- both outputs are the same
- first value of sum_sal = total for that partition
- second value of sum_sal = total for that partition from current value to the last value (for `rows between`)

Difference:
- `rangeBetween()` assigns the same aggregated value to duplicates wheras `rowsBetween()` treats each new row as new record hence the name 

#### Example 3: from currentRow to constant value
- Might be handy if you want to calculate 12Month and 3month rolling values

In [82]:
# create specs
# range between
spec_rangebtw = W.Window.partitionBy('dept')\
                    .orderBy('salary')\
                    .rangeBetween(W.Window.currentRow, 500)

df.select(df.dept, df.salary).withColumn('sum_sal', F.sum('salary').over(spec_rangebtw)).show()

+---------+------+-------+
|     dept|salary|sum_sal|
+---------+------+-------+
|    Sales|   900|    900|
|    Sales|  8100|  33400|
|    Sales|  8100|  33400|
|    Sales|  8600|  17200|
|    Sales|  8600|  17200|
|  Finance|  7900|  16200|
|  Finance|  8300|   8300|
|  Finance|  9000|   9000|
|  Finance|  9900|   9900|
|Marketing|  8000|   8000|
|Marketing|  9100|   9100|
+---------+------+-------+



In [81]:
# rows between current row and the next two
spec2_rowsbtw = W.Window.partitionBy('dept')\
                    .orderBy('salary')\
                    .rowsBetween(W.Window.currentRow, 2)

df.select(df.dept, df.salary).withColumn('sum_sal', F.sum('salary').over(spec2_rowsbtw)).show()

+---------+------+-------+
|     dept|salary|sum_sal|
+---------+------+-------+
|    Sales|   900|  17100|
|    Sales|  8100|  24800|
|    Sales|  8100|  25300|
|    Sales|  8600|  17200|
|    Sales|  8600|   8600|
|  Finance|  7900|  25200|
|  Finance|  8300|  27200|
|  Finance|  9000|  18900|
|  Finance|  9900|   9900|
|Marketing|  8000|  17100|
|Marketing|  9100|   9100|
+---------+------+-------+

