In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every cell in this notebook
# (getOrCreate reuses an already-running session when there is one).
spark = (
    SparkSession.builder
    .appName("WindowFunctionsExample")
    .getOrCreate()
)

In [8]:
# Column names, listed in the same order as the fields of each tuple in `data`.
schema = [
    "country",
    "week_number",
    "number_of_invoices",
    "total_quantity",
    "invoice_values",
]

# Sample rows. Every (country, week_number) pair appears twice with an identical
# invoice_values figure, so each window partition contains tied values.
data = [
    ("Portugal", 22, 19, 1525, 4975.32),
    ("Portugal", 22, 18, 1230, 4975.32),
    ("Germany", 2, 14, 1680, 6040.99),
    ("Germany", 2, 15, 1890, 6040.99),
    ("Netherlands", 49, 5, 28, 103.78),
    ("Netherlands", 49, 7, 42, 103.78),
    ("Spain", 4, 16, 730, 1909.05),
    ("Spain", 4, 15, 680, 1909.05),
    ("Netherlands", 29, 4, 1723, 3435.96),
    ("Netherlands", 29, 3, 1543, 3435.96),
    ("Germany", 25, 5, 1169, 5154.49),
    ("Germany", 25, 4, 987, 5154.49),
    ("France", 47, 9, 1644, 3987.62),
    ("France", 47, 8, 1456, 3987.62),
    ("Spain", 49, 11, 657, 2881.72),
    ("Spain", 49, 10, 590, 2881.72),
    ("France", 18, 3, 1882, 9171.42),
    ("France", 18, 2, 1673, 9171.42),
    ("Germany", 42, 18, 1719, 1383.78),
    ("Germany", 42, 17, 1532, 1383.78),
]

# Build the DataFrame from the rows above and display it.
df = spark.createDataFrame(data, schema=schema)
df.show()

+-----------+-----------+------------------+--------------+--------------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|
+-----------+-----------+------------------+--------------+--------------+
|   Portugal|         22|                19|          1525|       4975.32|
|   Portugal|         22|                18|          1230|       4975.32|
|    Germany|          2|                14|          1680|       6040.99|
|    Germany|          2|                15|          1890|       6040.99|
|Netherlands|         49|                 5|            28|        103.78|
|Netherlands|         49|                 7|            42|        103.78|
|      Spain|          4|                16|           730|       1909.05|
|      Spain|          4|                15|           680|       1909.05|
|Netherlands|         29|                 4|          1723|       3435.96|
|Netherlands|         29|                 3|          1543|       3435.96|
|    Germany|         25|

In [13]:
from pyspark.sql.window import Window
from pyspark.sql.functions import *


In [14]:
# Window spec: one partition per country, rows ordered by ascending week_number.
# No frame is set here, so Spark's default frame for an ordered window applies.
my_window = (
    Window
    .partitionBy("country")
    .orderBy(asc("week_number"))
)

In [17]:
# nth_value(..., 1) over the ordered window: adds, for every row, the first
# invoice_values entry of that row's country partition.
(
    df
    .withColumn(
        "first_value",
        nth_value("invoice_values", 1).over(my_window),
    )
    .show()
)

+-----------+-----------+------------------+--------------+--------------+-----------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|first_value|
+-----------+-----------+------------------+--------------+--------------+-----------+
|     France|         18|                 3|          1882|       9171.42|    9171.42|
|     France|         18|                 2|          1673|       9171.42|    9171.42|
|     France|         47|                 9|          1644|       3987.62|    9171.42|
|     France|         47|                 8|          1456|       3987.62|    9171.42|
|    Germany|          2|                14|          1680|       6040.99|    6040.99|
|    Germany|          2|                15|          1890|       6040.99|    6040.99|
|    Germany|         25|                 5|          1169|       5154.49|    6040.99|
|    Germany|         25|                 4|           987|       5154.49|    6040.99|
|    Germany|         42|                18

In [20]:
# Window partitioned by country and ordered by ascending week_number, with no
# explicit frame.
my_window = (
    Window
    .partitionBy("country")
    .orderBy(asc("week_number"))
)

# Third invoice value within the window frame. Because no frame is specified,
# the recorded output is null for the earliest week of each country: the frame
# does not yet hold three rows there.
# NOTE(review): the output column is labelled "nth_value_2" although n is 3.
(
    df
    .withColumn(
        "nth_value_2",
        nth_value("invoice_values", 3).over(my_window),
    )
    .show()
)

+-----------+-----------+------------------+--------------+--------------+-----------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|nth_value_2|
+-----------+-----------+------------------+--------------+--------------+-----------+
|     France|         18|                 3|          1882|       9171.42|       null|
|     France|         18|                 2|          1673|       9171.42|       null|
|     France|         47|                 9|          1644|       3987.62|    3987.62|
|     France|         47|                 8|          1456|       3987.62|    3987.62|
|    Germany|          2|                14|          1680|       6040.99|       null|
|    Germany|          2|                15|          1890|       6040.99|       null|
|    Germany|         25|                 5|          1169|       5154.49|    5154.49|
|    Germany|         25|                 4|           987|       5154.49|    5154.49|
|    Germany|         42|                18

In [21]:
# Rebind my_window with an explicit frame that spans the entire partition
# (unbounded preceding through unbounded following).
my_window = (
    Window
    .partitionBy("country")
    .orderBy(asc("week_number"))
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

# With the whole partition in the frame, every row of a country receives that
# partition's third invoice value; the recorded output no longer shows nulls.
# NOTE(review): the output column is labelled "nth_value_2" although n is 3.
(
    df
    .withColumn(
        "nth_value_2",
        nth_value("invoice_values", 3).over(my_window),
    )
    .show()
)

+-----------+-----------+------------------+--------------+--------------+-----------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|nth_value_2|
+-----------+-----------+------------------+--------------+--------------+-----------+
|     France|         18|                 3|          1882|       9171.42|    3987.62|
|     France|         18|                 2|          1673|       9171.42|    3987.62|
|     France|         47|                 9|          1644|       3987.62|    3987.62|
|     France|         47|                 8|          1456|       3987.62|    3987.62|
|    Germany|          2|                14|          1680|       6040.99|    5154.49|
|    Germany|          2|                15|          1890|       6040.99|    5154.49|
|    Germany|         25|                 5|          1169|       5154.49|    5154.49|
|    Germany|         25|                 4|           987|       5154.49|    5154.49|
|    Germany|         42|                18

In [26]:
# Same first-value column, this time written as a SQL window expression
# and handed to expr() instead of using the Window API.
(
    df
    .withColumn(
        "first_value",
        expr("first_value(invoice_values) over(partition by country order by week_number)"),
    )
    .show()
)

+-----------+-----------+------------------+--------------+--------------+-----------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|first_value|
+-----------+-----------+------------------+--------------+--------------+-----------+
|     France|         18|                 3|          1882|       9171.42|    9171.42|
|     France|         18|                 2|          1673|       9171.42|    9171.42|
|     France|         47|                 9|          1644|       3987.62|    9171.42|
|     France|         47|                 8|          1456|       3987.62|    9171.42|
|    Germany|          2|                14|          1680|       6040.99|    6040.99|
|    Germany|          2|                15|          1890|       6040.99|    6040.99|
|    Germany|         25|                 5|          1169|       5154.49|    6040.99|
|    Germany|         25|                 4|           987|       5154.49|    6040.99|
|    Germany|         42|                18

In [28]:
# last_value as a SQL window expression. The frame is widened to the whole
# partition, so each row gets the invoice value of its country's final row
# in week_number order.
(
    df
    .withColumn(
        "last_value",
        expr(
            " last_value(invoice_values) "
            "over(partition by country order by week_number "
            "rows between unbounded preceding and unbounded following)"
        ),
    )
    .show()
)

+-----------+-----------+------------------+--------------+--------------+----------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|last_value|
+-----------+-----------+------------------+--------------+--------------+----------+
|     France|         18|                 3|          1882|       9171.42|   3987.62|
|     France|         18|                 2|          1673|       9171.42|   3987.62|
|     France|         47|                 9|          1644|       3987.62|   3987.62|
|     France|         47|                 8|          1456|       3987.62|   3987.62|
|    Germany|          2|                14|          1680|       6040.99|   1383.78|
|    Germany|          2|                15|          1890|       6040.99|   1383.78|
|    Germany|         25|                 5|          1169|       5154.49|   1383.78|
|    Germany|         25|                 4|           987|       5154.49|   1383.78|
|    Germany|         42|                18|          

In [29]:
# nth_value as a SQL window expression: third invoice value of each country's
# partition, evaluated over a frame that covers the whole partition.
(
    df
    .withColumn(
        "nth_value3",
        expr(
            " nth_value(invoice_values,3) "
            "over(partition by country order by week_number "
            "rows between unbounded preceding and unbounded following)"
        ),
    )
    .show()
)

+-----------+-----------+------------------+--------------+--------------+----------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|nth_value3|
+-----------+-----------+------------------+--------------+--------------+----------+
|     France|         18|                 3|          1882|       9171.42|   3987.62|
|     France|         18|                 2|          1673|       9171.42|   3987.62|
|     France|         47|                 9|          1644|       3987.62|   3987.62|
|     France|         47|                 8|          1456|       3987.62|   3987.62|
|    Germany|          2|                14|          1680|       6040.99|   5154.49|
|    Germany|          2|                15|          1890|       6040.99|   5154.49|
|    Germany|         25|                 5|          1169|       5154.49|   5154.49|
|    Germany|         25|                 4|           987|       5154.49|   5154.49|
|    Germany|         42|                18|          

In [30]:
# Register df as the temporary view "df_table" so it can be queried with spark.sql.
df.createOrReplaceTempView("df_table")

In [34]:
# Pure-SQL version of the window functions, run against the df_table view.
# For each country partition ordered by week_number it selects:
#   fstvalue - first_value with no explicit frame,
#   lstvalue - last_value over a frame spanning the whole partition,
#   nthvalue - nth_value with n = 2 over a frame spanning the whole partition.
spark.sql("""
select country ,week_number ,invoice_values,
first_value(invoice_values) over(partition by country order by week_number ) as fstvalue,
last_value(invoice_values) over(partition by country order by week_number 
                                rows between unbounded preceding and unbounded following ) as lstvalue,
nth_value(invoice_values , 2) over(partition by country order by week_number 
                                rows between unbounded preceding and unbounded following ) as nthvalue
from df_table
          """).show()

+-----------+-----------+--------------+--------+--------+--------+
|    country|week_number|invoice_values|fstvalue|lstvalue|nthvalue|
+-----------+-----------+--------------+--------+--------+--------+
|     France|         18|       9171.42| 9171.42| 3987.62| 9171.42|
|     France|         18|       9171.42| 9171.42| 3987.62| 9171.42|
|     France|         47|       3987.62| 9171.42| 3987.62| 9171.42|
|     France|         47|       3987.62| 9171.42| 3987.62| 9171.42|
|    Germany|          2|       6040.99| 6040.99| 1383.78| 6040.99|
|    Germany|          2|       6040.99| 6040.99| 1383.78| 6040.99|
|    Germany|         25|       5154.49| 6040.99| 1383.78| 6040.99|
|    Germany|         25|       5154.49| 6040.99| 1383.78| 6040.99|
|    Germany|         42|       1383.78| 6040.99| 1383.78| 6040.99|
|    Germany|         42|       1383.78| 6040.99| 1383.78| 6040.99|
|Netherlands|         29|       3435.96| 3435.96|  103.78| 3435.96|
|Netherlands|         29|       3435.96| 3435.96