In [1]:
import numpy as np 
import pandas as pd 
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

In [2]:
# Build a SparkSession (or pick up one that already exists) with master
# "local[*]" and the application name "ShortNSimple".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ShortNSimple")
    .getOrCreate()
)
spark

In [3]:
# Toy single-column dataset; B and C are repeated so that ties appear when
# the rows are ordered by Name.
names = ["A", "B", "B", "C", "C", "C", "D"]
data = spark.createDataFrame([[name] for name in names], ["Name"])
data.show(truncate=False)

+----+
|Name|
+----+
|A   |
|B   |
|B   |
|C   |
|C   |
|C   |
|D   |
+----+



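## Row Number

`row_number` assigns consecutive numbers (1, 2, 3, ...) in window order; rows that tie on `Name` still get distinct numbers.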

In [5]:
# Number every row 1..N in descending Name order.
# The window has no partition columns, so the whole DataFrame is treated as
# a single group; rows that tie on Name still receive distinct, consecutive
# numbers. Window.orderBy(...) is used directly (instead of going through an
# empty partitionBy()) so this cell builds its window the same way as the
# rank and dense_rank cells.
data = data.withColumn(
    "row_number",
    F.row_number().over(Window.orderBy(F.desc("Name"))),
)

data.show()

+----+----------+
|Name|row_number|
+----+----------+
|   D|         1|
|   C|         2|
|   C|         3|
|   C|         4|
|   B|         5|
|   B|         6|
|   A|         7|
+----+----------+



## Rank

`rank` gives tied rows the same rank and leaves a gap after each tie (here: 1, 2, 2, 2, 5, 5, 7).

In [7]:
# Add a "rank" column over the same descending-Name ordering: rows with the
# same Name share a rank. The resulting DataFrame is marked for caching
# before it is displayed.
data = data.withColumn(
    "rank",
    F.rank().over(Window.orderBy(F.col("Name").desc())),
).cache()

data.show()

+----+----------+----+
|Name|row_number|rank|
+----+----------+----+
|   D|         1|   1|
|   C|         2|   2|
|   C|         3|   2|
|   C|         4|   2|
|   B|         5|   5|
|   B|         6|   5|
|   A|         7|   7|
+----+----------+----+



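## Dense Rank

`dense_rank` also gives tied rows the same rank, but leaves no gaps after ties (here: 1, 2, 2, 2, 3, 3, 4).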

In [8]:
# Add a "dense_rank" column: rows with the same Name share a rank, computed
# over a window ordered by Name descending with no partition columns.
name_desc_window = Window.orderBy(F.col("Name").desc())
dense_rank_col = F.dense_rank().over(name_desc_window)
data = data.withColumn("dense_rank", dense_rank_col)

data.show()

+----+----------+----+----------+
|Name|row_number|rank|dense_rank|
+----+----------+----+----------+
|   D|         1|   1|         1|
|   C|         2|   2|         2|
|   C|         3|   2|         2|
|   C|         4|   2|         2|
|   B|         5|   5|         3|
|   B|         6|   5|         3|
|   A|         7|   7|         4|
+----+----------+----+----------+

