## usual

In [None]:
#rank,dense_rank,row_number,ntile,percent_rank,cume_dist,
#first_value,last_valu,nth_value
#lead,lag
#sum(),count(),min(),max(),avg()

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("window").getOrCreate()

## df

In [2]:
# Define schema
schema = ["country", "week_number", "number_of_invoices", "total_quantity", "invoice_values"]

# Data for each country (with repeated invoice values for ranking)
data = [
    ("Portugal", 22, 19, 1525, 4975.32), ("Portugal", 22, 18, 1230, 4975.32),
    ("Germany", 2, 14, 1680, 6040.99), ("Germany", 2, 15, 1890, 6040.99),
    ("Netherlands", 49, 5, 28, 103.78), ("Netherlands", 49, 7, 42, 103.78),
    ("Spain", 4, 16, 730, 1909.05), ("Spain", 4, 15, 680, 1909.05),
    ("Netherlands", 29, 4, 1723, 3435.96), ("Netherlands", 29, 3, 1543, 3435.96),
    ("Germany", 25, 5, 1169, 5154.49), ("Germany", 25, 4, 987, 5154.49),
    ("France", 47, 9, 1644, 3987.62), ("France", 47, 8, 1456, 3987.62),
    ("Spain", 49, 11, 657, 2881.72), ("Spain", 49, 10, 590, 2881.72),
    ("France", 18, 3, 1882, 9171.42), ("France", 18, 2, 1673, 9171.42),
    ("Germany", 42, 18, 1719, 1383.78), ("Germany", 42, 17, 1532, 1383.78),
    
]

df = spark.createDataFrame(data, schema=schema)

df.show()  

+-----------+-----------+------------------+--------------+--------------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|
+-----------+-----------+------------------+--------------+--------------+
|   Portugal|         22|                19|          1525|       4975.32|
|   Portugal|         22|                18|          1230|       4975.32|
|    Germany|          2|                14|          1680|       6040.99|
|    Germany|          2|                15|          1890|       6040.99|
|Netherlands|         49|                 5|            28|        103.78|
|Netherlands|         49|                 7|            42|        103.78|
|      Spain|          4|                16|           730|       1909.05|
|      Spain|          4|                15|           680|       1909.05|
|Netherlands|         29|                 4|          1723|       3435.96|
|Netherlands|         29|                 3|          1543|       3435.96|
|    Germany|         25|

In [5]:
df.createOrReplaceTempView("df_table")

In [3]:
df.orderBy("country").show()

+-----------+-----------+------------------+--------------+--------------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|
+-----------+-----------+------------------+--------------+--------------+
|     France|         18|                 3|          1882|       9171.42|
|     France|         47|                 9|          1644|       3987.62|
|     France|         18|                 2|          1673|       9171.42|
|     France|         47|                 8|          1456|       3987.62|
|    Germany|          2|                14|          1680|       6040.99|
|    Germany|         25|                 5|          1169|       5154.49|
|    Germany|         42|                18|          1719|       1383.78|
|    Germany|         25|                 4|           987|       5154.49|
|    Germany|          2|                15|          1890|       6040.99|
|    Germany|         42|                17|          1532|       1383.78|
|Netherlands|         49|

In [7]:
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- week_number: long (nullable = true)
 |-- number_of_invoices: long (nullable = true)
 |-- total_quantity: long (nullable = true)
 |-- invoice_values: double (nullable = true)



In [10]:
from pyspark.sql import Window
from pyspark.sql.functions import *

## Running Sum

In [12]:
window1 = Window.partitionBy("country") \
    .orderBy("week_number") \
        .rowsBetween(Window.unboundedPreceding,Window.currentRow)
        
df.withColumn("running_sum" , round(sum("invoice_values").over(window1))).show(3)

+-------+-----------+------------------+--------------+--------------+-----------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_sum|
+-------+-----------+------------------+--------------+--------------+-----------+
| France|         18|                 3|          1882|       9171.42|     9171.0|
| France|         18|                 2|          1673|       9171.42|    18343.0|
| France|         47|                 9|          1644|       3987.62|    22330.0|
+-------+-----------+------------------+--------------+--------------+-----------+
only showing top 3 rows



In [11]:
my_window1 = Window.partitionBy("country") \
                    .orderBy(col("week_number").asc()) \
                    .rowsBetween(Window.unboundedPreceding , Window.currentRow)
                    
df.withColumn(  "running_total" , round(sum("invoice_values").over(my_window1)) ).show()

+-----------+-----------+------------------+--------------+--------------+-------------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|running_total|
+-----------+-----------+------------------+--------------+--------------+-------------+
|     France|         18|                 3|          1882|       9171.42|       9171.0|
|     France|         18|                 2|          1673|       9171.42|      18343.0|
|     France|         47|                 9|          1644|       3987.62|      22330.0|
|     France|         47|                 8|          1456|       3987.62|      26318.0|
|    Germany|          2|                14|          1680|       6040.99|       6041.0|
|    Germany|          2|                15|          1890|       6040.99|      12082.0|
|    Germany|         25|                 5|          1169|       5154.49|      17236.0|
|    Germany|         25|                 4|           987|       5154.49|      22391.0|
|    Germany|        

In [14]:
spark.sql("""
          select * ,
          sum(invoice_values) over(partition by country order by week_number asc
          rows between unbounded preceding and current row ) as running_sum
          from df_table
          """).show()

+-----------+-----------+------------------+--------------+--------------+------------------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|       running_sum|
+-----------+-----------+------------------+--------------+--------------+------------------+
|     France|         18|                 3|          1882|       9171.42|           9171.42|
|     France|         18|                 2|          1673|       9171.42|          18342.84|
|     France|         47|                 9|          1644|       3987.62|          22330.46|
|     France|         47|                 8|          1456|       3987.62|26318.079999999998|
|    Germany|          2|                14|          1680|       6040.99|           6040.99|
|    Germany|          2|                15|          1890|       6040.99|          12081.98|
|    Germany|         25|                 5|          1169|       5154.49|          17236.47|
|    Germany|         25|                 4|           987| 

## pyspark way

In [69]:
my_window = Window.partitionBy("country") \
                    .orderBy(desc("invoice_values")) 
                        
df.select("country","invoice_values") \
.withColumn("rnk" , rank().over(my_window)) \
.withColumn("dnsrnk" , dense_rank().over(my_window)) \
.withColumn("rnum" , row_number().over(my_window)) \
.withColumn("pctrnk" , round(percent_rank().over(my_window),2)) \
.withColumn("cumdist" , round(cume_dist().over(my_window),2)) \
.withColumn("bucket" , ntile(2).over(my_window)) \
.show()


+-----------+--------------+---+------+----+------+-------+------+
|    country|invoice_values|rnk|dnsrnk|rnum|pctrnk|cumdist|bucket|
+-----------+--------------+---+------+----+------+-------+------+
|     France|       9171.42|  1|     1|   1|   0.0|    0.5|     1|
|     France|       9171.42|  1|     1|   2|   0.0|    0.5|     1|
|     France|       3987.62|  3|     2|   3|  0.67|    1.0|     2|
|     France|       3987.62|  3|     2|   4|  0.67|    1.0|     2|
|    Germany|       6040.99|  1|     1|   1|   0.0|   0.33|     1|
|    Germany|       6040.99|  1|     1|   2|   0.0|   0.33|     1|
|    Germany|       5154.49|  3|     2|   3|   0.4|   0.67|     1|
|    Germany|       5154.49|  3|     2|   4|   0.4|   0.67|     2|
|    Germany|       1383.78|  5|     3|   5|   0.8|    1.0|     2|
|    Germany|       1383.78|  5|     3|   6|   0.8|    1.0|     2|
|Netherlands|       3435.96|  1|     1|   1|   0.0|    0.5|     1|
|Netherlands|       3435.96|  1|     1|   2|   0.0|    0.5|   

In [71]:
my_window = Window.partitionBy("country") \
                    .orderBy(desc("invoice_values")) 
                        
df.select("country","invoice_values") \
.withColumn("rnk" , rank().over(my_window)) \
.where("rnk == 1") \
.show()

+-----------+--------------+---+
|    country|invoice_values|rnk|
+-----------+--------------+---+
|     France|       9171.42|  1|
|     France|       9171.42|  1|
|    Germany|       6040.99|  1|
|    Germany|       6040.99|  1|
|Netherlands|       3435.96|  1|
|Netherlands|       3435.96|  1|
|   Portugal|       4975.32|  1|
|   Portugal|       4975.32|  1|
|      Spain|       2881.72|  1|
|      Spain|       2881.72|  1|
+-----------+--------------+---+



## SQL

In [55]:
spark.sql("""
          select country,invoice_values,
          row_number() over w as rn,
          rank() over w as rnk,
          dense_rank() over w as dsrnk,
          percent_rank() over w as pctrnk,
          cume_dist() over w as distrnk,
          ntile(2) over w as bucket
          from df_table
          window w as (partition by country order by invoice_values desc)
          """).show()

+-----------+--------------+---+---+-----+------------------+------------------+------+
|    country|invoice_values| rn|rnk|dsrnk|            pctrnk|           distrnk|bucket|
+-----------+--------------+---+---+-----+------------------+------------------+------+
|     France|       9171.42|  1|  1|    1|               0.0|               0.5|     1|
|     France|       9171.42|  2|  1|    1|               0.0|               0.5|     1|
|     France|       3987.62|  3|  3|    2|0.6666666666666666|               1.0|     2|
|     France|       3987.62|  4|  3|    2|0.6666666666666666|               1.0|     2|
|    Germany|       6040.99|  1|  1|    1|               0.0|0.3333333333333333|     1|
|    Germany|       6040.99|  2|  1|    1|               0.0|0.3333333333333333|     1|
|    Germany|       5154.49|  3|  3|    2|               0.4|0.6666666666666666|     1|
|    Germany|       5154.49|  4|  3|    2|               0.4|0.6666666666666666|     2|
|    Germany|       1383.78|  5|

## student

In [54]:
schema = ["student","score"]
data =[("Alice",70),
       ("Bob",80),
       ("Charlie",90),
       ("David",60),
       ("Eve",100),
       ("Frank",50),
       ("Grace",40),
       ("Heidi",30),
       ("Ivan",20),
       ("Judy",10),
       ]
student_df = spark.createDataFrame(data , schema = schema)
student_df.createOrReplaceTempView("student")

spark.sql("""
          select *,
          row_number() over w as rn,
          rank() over w as rnk,
          dense_rank() over w as dsrnk,
          percent_rank() over w as pctrnk,
          cume_dist() over w as distrnk,
          ntile(3) over w as bucket
          from student
          window w as ( order by score desc)
          """).show()

# percent_rank() gives the percentage of scores that are less than the current score
#cume_dist()  is cumulative distribution the portion of scores that are less than or equal to current score

+-------+-----+---+---+-----+------------------+-------+------+
|student|score| rn|rnk|dsrnk|            pctrnk|distrnk|bucket|
+-------+-----+---+---+-----+------------------+-------+------+
|    Eve|  100|  1|  1|    1|               0.0|    0.1|     1|
|Charlie|   90|  2|  2|    2|0.1111111111111111|    0.2|     1|
|    Bob|   80|  3|  3|    3|0.2222222222222222|    0.3|     1|
|  Alice|   70|  4|  4|    4|0.3333333333333333|    0.4|     1|
|  David|   60|  5|  5|    5|0.4444444444444444|    0.5|     2|
|  Frank|   50|  6|  6|    6|0.5555555555555556|    0.6|     2|
|  Grace|   40|  7|  7|    7|0.6666666666666666|    0.7|     2|
|  Heidi|   30|  8|  8|    8|0.7777777777777778|    0.8|     3|
|   Ivan|   20|  9|  9|    9|0.8888888888888888|    0.9|     3|
|   Judy|   10| 10| 10|   10|               1.0|    1.0|     3|
+-------+-----+---+---+-----+------------------+-------+------+



## random

In [72]:
df.select("*").show()

+-----------+-----------+------------------+--------------+--------------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|
+-----------+-----------+------------------+--------------+--------------+
|   Portugal|         22|                19|          1525|       4975.32|
|   Portugal|         22|                18|          1230|       4975.32|
|    Germany|          2|                14|          1680|       6040.99|
|    Germany|          2|                15|          1890|       6040.99|
|Netherlands|         49|                 5|            28|        103.78|
|Netherlands|         49|                 7|            42|        103.78|
|      Spain|          4|                16|           730|       1909.05|
|      Spain|          4|                15|           680|       1909.05|
|Netherlands|         29|                 4|          1723|       3435.96|
|Netherlands|         29|                 3|          1543|       3435.96|
|    Germany|         25|

In [73]:
df.select(*).show()

SyntaxError: invalid syntax (1790677206.py, line 1)