## Usual setup

In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("windows_aggregation")
    .getOrCreate()
)

In [2]:
# Load the CSV data with a header row and let Spark infer the column types.
# NOTE(review): the path below is an absolute, machine-specific location;
# the notebook will only run where this directory exists.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\week8\window")
)

# Preview the first three rows.
df.show(3)

+-----------+-----------+------------------+--------------+--------------+
|    country|week_number|number_of_invoices|total_quantity|invoice_values|
+-----------+-----------+------------------+--------------+--------------+
|Netherlands|         26|                13|          1105|       1695.64|
|      Spain|         47|                17|           924|       3397.66|
|    Germany|         33|                 9|            46|        229.47|
+-----------+-----------+------------------+--------------+--------------+
only showing top 3 rows



In [4]:
# Print the column names and the types Spark inferred for the loaded DataFrame.
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- week_number: integer (nullable = true)
 |-- number_of_invoices: integer (nullable = true)
 |-- total_quantity: integer (nullable = true)
 |-- invoice_values: double (nullable = true)



In [12]:
# Register the DataFrame as the temp view "df_table" so the spark.sql cells can query it.
df.createOrReplaceTempView("df_table")

In [7]:
# Imports for the window-function (running sum) examples
from pyspark.sql import Window
from pyspark.sql.functions import *

## Window functions (DataFrame API)

In [10]:
# Running total per country: the frame spans from the first row of the
# partition up to and including the current row, counted row by row, so
# rows that tie on week_number still get distinct cumulative values.
# `round` and `sum` are the pyspark.sql.functions versions pulled in by the
# wildcard import, not the Python builtins.
my_window1 = (
    Window.partitionBy("Country")
    .orderBy(col("week_number").asc())
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

df.withColumn(
    "running_total",
    round(sum("invoice_values").over(my_window1)),
).show()

+-------+-----------+------------------+--------------+--------------+-------------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_total|
+-------+-----------+------------------+--------------+--------------+-------------+
| France|          3|                17|          1424|       6539.15|       6539.0|
| France|          3|                 4|           470|        307.28|       6846.0|
| France|          6|                14|           320|       1595.44|       8442.0|
| France|          7|                14|           899|        3877.5|      12319.0|
| France|         12|                 4|          1661|       7977.64|      20297.0|
| France|         15|                14|          1584|       3020.55|      23318.0|
| France|         17|                 2|           179|        468.48|      23786.0|
| France|         18|                11|           428|       1599.14|      25385.0|
| France|         18|                 3|          1882|       917

In [16]:
# No explicit frame is given here. With an ORDER BY and no frame, Spark uses
# the default RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, so rows that
# tie on week_number are peers and all receive the same cumulative value
# (compare the two week-3 rows in the output with the rowsBetween version).
# The chain is wrapped in parentheses rather than ended with a trailing
# backslash, which only parsed because a whitespace-only line followed it.
my_window1 = (
    Window.partitionBy("Country")
    .orderBy(col("week_number").asc())
)

df.withColumn(
    "running_total",
    round(sum("invoice_values").over(my_window1)),
).show()

+-------+-----------+------------------+--------------+--------------+-------------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_total|
+-------+-----------+------------------+--------------+--------------+-------------+
| France|          3|                17|          1424|       6539.15|       6846.0|
| France|          3|                 4|           470|        307.28|       6846.0|
| France|          6|                14|           320|       1595.44|       8442.0|
| France|          7|                14|           899|        3877.5|      12319.0|
| France|         12|                 4|          1661|       7977.64|      20297.0|
| France|         15|                14|          1584|       3020.55|      23318.0|
| France|         17|                 2|           179|        468.48|      23786.0|
| France|         18|                11|           428|       1599.14|      35274.0|
| France|         18|                 3|          1882|       917

## Window functions (Spark SQL)

In [15]:
# Spark SQL version of the running total: per country, in descending week
# order, sum invoice_values from the first row of the partition through the
# current row (ROWS frame, so tied weeks still accumulate row by row).
spark.sql("""
    select *,
    round(
    sum(invoice_values)
    over(partition by country order by week_number desc rows between unbounded preceding and current row ))
    as running_sum
    from df_table
          """).show()

+-------+-----------+------------------+--------------+--------------+-----------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_sum|
+-------+-----------+------------------+--------------+--------------+-----------+
| France|         52|                 4|          1511|        950.36|      950.0|
| France|         51|                 4|           592|       2905.93|     3856.0|
| France|         48|                 4|           977|       2260.87|     6117.0|
| France|         47|                 9|          1644|       3987.62|    10105.0|
| France|         47|                 2|          1907|        968.93|    11074.0|
| France|         47|                 7|          1240|        720.77|    11794.0|
| France|         45|                13|           107|         55.98|    11850.0|
| France|         45|                17|           663|        737.53|    12588.0|
| France|         42|                 1|           428|       1558.82|    14147.0|
| Fr

In [17]:
# Same query without a frame clause. With ORDER BY and no frame the default
# is a RANGE frame from unbounded preceding to the current row, so rows that
# tie on week_number (e.g. the three week-47 rows) share one running_sum.
spark.sql("""
    select *,
    round(
    sum(invoice_values)
    over(partition by country order by week_number desc  ))
    as running_sum
    from df_table
          """).show()

+-------+-----------+------------------+--------------+--------------+-----------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_sum|
+-------+-----------+------------------+--------------+--------------+-----------+
| France|         52|                 4|          1511|        950.36|      950.0|
| France|         51|                 4|           592|       2905.93|     3856.0|
| France|         48|                 4|           977|       2260.87|     6117.0|
| France|         47|                 9|          1644|       3987.62|    11794.0|
| France|         47|                 2|          1907|        968.93|    11794.0|
| France|         47|                 7|          1240|        720.77|    11794.0|
| France|         45|                13|           107|         55.98|    12588.0|
| France|         45|                17|           663|        737.53|    12588.0|
| France|         42|                 1|           428|       1558.82|    14147.0|
| Fr

## Window functions (DataFrame API): frame variations

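**Note:** the frame is always defined relative to the current row, so the offsets passed to `rowsBetween()` are counted from it (negative = preceding rows, `0` / `Window.currentRow` = the current row, positive = following rows).
When a window has an `orderBy` but no explicit frame, the default is unbounded preceding to current row, applied as a `RANGE` frame, so rows that tie on the ordering column share the same result (see cells In [16] and In [17]).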

In [18]:
# Trailing window: the three rows before the current row plus the current
# row itself (at most four rows; fewer near the start of a partition).
# NOTE: the output column is still called "running_total" although this is
# a sliding sum, not a cumulative one.
my_window1 = (
    Window.partitionBy("Country")
    .orderBy(col("week_number").asc())
    .rowsBetween(-3, Window.currentRow)
)

df.withColumn(
    "running_total",
    round(sum("invoice_values").over(my_window1)),
).show()

+-------+-----------+------------------+--------------+--------------+-------------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_total|
+-------+-----------+------------------+--------------+--------------+-------------+
| France|          3|                17|          1424|       6539.15|       6539.0|
| France|          3|                 4|           470|        307.28|       6846.0|
| France|          6|                14|           320|       1595.44|       8442.0|
| France|          7|                14|           899|        3877.5|      12319.0|
| France|         12|                 4|          1661|       7977.64|      13758.0|
| France|         15|                14|          1584|       3020.55|      16471.0|
| France|         17|                 2|           179|        468.48|      15344.0|
| France|         18|                11|           428|       1599.14|      13066.0|
| France|         18|                 3|          1882|       917

In [19]:
# Leading window: the current row plus the next two rows of the partition.
my_window1 = (
    Window.partitionBy("Country")
    .orderBy(col("week_number").asc())
    .rowsBetween(Window.currentRow, 2)
)

df.withColumn(
    "running_total",
    round(sum("invoice_values").over(my_window1)),
).show()

+-------+-----------+------------------+--------------+--------------+-------------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_total|
+-------+-----------+------------------+--------------+--------------+-------------+
| France|          3|                17|          1424|       6539.15|       8442.0|
| France|          3|                 4|           470|        307.28|       5780.0|
| France|          6|                14|           320|       1595.44|      13451.0|
| France|          7|                14|           899|        3877.5|      14876.0|
| France|         12|                 4|          1661|       7977.64|      11467.0|
| France|         15|                14|          1584|       3020.55|       5088.0|
| France|         17|                 2|           179|        468.48|      11239.0|
| France|         18|                11|           428|       1599.14|      11488.0|
| France|         18|                 3|          1882|       917

In [21]:
# Centred window: the previous row, the current row and the next row.
my_window1 = (
    Window.partitionBy("Country")
    .orderBy(col("week_number").asc())
    .rowsBetween(-1, 1)
)

df.withColumn(
    "running_total",
    round(sum("invoice_values").over(my_window1)),
).show()

+-------+-----------+------------------+--------------+--------------+-------------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_total|
+-------+-----------+------------------+--------------+--------------+-------------+
| France|          3|                17|          1424|       6539.15|       6846.0|
| France|          3|                 4|           470|        307.28|       8442.0|
| France|          6|                14|           320|       1595.44|       5780.0|
| France|          7|                14|           899|        3877.5|      13451.0|
| France|         12|                 4|          1661|       7977.64|      14876.0|
| France|         15|                14|          1584|       3020.55|      11467.0|
| France|         17|                 2|           179|        468.48|       5088.0|
| France|         18|                11|           428|       1599.14|      11239.0|
| France|         18|                 3|          1882|       917

In [22]:
# Window that excludes the current row: only the two rows immediately
# before it. The first row of each partition has an empty frame, so its
# sum is null.
my_window1 = (
    Window.partitionBy("Country")
    .orderBy(col("week_number").asc())
    .rowsBetween(-2, -1)
)

df.withColumn(
    "running_total",
    round(sum("invoice_values").over(my_window1)),
).show()

+-------+-----------+------------------+--------------+--------------+-------------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_total|
+-------+-----------+------------------+--------------+--------------+-------------+
| France|          3|                17|          1424|       6539.15|         null|
| France|          3|                 4|           470|        307.28|       6539.0|
| France|          6|                14|           320|       1595.44|       6846.0|
| France|          7|                14|           899|        3877.5|       1903.0|
| France|         12|                 4|          1661|       7977.64|       5473.0|
| France|         15|                14|          1584|       3020.55|      11855.0|
| France|         17|                 2|           179|        468.48|      10998.0|
| France|         18|                11|           428|       1599.14|       3489.0|
| France|         18|                 3|          1882|       917

In [23]:
# Window that excludes the current row: only the two rows immediately
# after it.
my_window1 = (
    Window.partitionBy("Country")
    .orderBy(col("week_number").asc())
    .rowsBetween(1, 2)
)

df.withColumn(
    "running_total",
    round(sum("invoice_values").over(my_window1)),
).show()

+-------+-----------+------------------+--------------+--------------+-------------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_total|
+-------+-----------+------------------+--------------+--------------+-------------+
| France|          3|                17|          1424|       6539.15|       1903.0|
| France|          3|                 4|           470|        307.28|       5473.0|
| France|          6|                14|           320|       1595.44|      11855.0|
| France|          7|                14|           899|        3877.5|      10998.0|
| France|         12|                 4|          1661|       7977.64|       3489.0|
| France|         15|                14|          1584|       3020.55|       2068.0|
| France|         17|                 2|           179|        468.48|      10771.0|
| France|         18|                11|           428|       1599.14|       9888.0|
| France|         18|                 3|          1882|       917

In [30]:
# Whole-partition frame: every row sees all rows of its country, so each
# row carries the same per-country total.
my_window1 = (
    Window.partitionBy("Country")
    .orderBy(col("week_number").asc())
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

df.withColumn(
    "running_total",
    round(sum("invoice_values").over(my_window1)),
).show()

+-------+-----------+------------------+--------------+--------------+-------------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_total|
+-------+-----------+------------------+--------------+--------------+-------------+
| France|          3|                17|          1424|       6539.15|      88241.0|
| France|          3|                 4|           470|        307.28|      88241.0|
| France|          6|                14|           320|       1595.44|      88241.0|
| France|          7|                14|           899|        3877.5|      88241.0|
| France|         12|                 4|          1661|       7977.64|      88241.0|
| France|         15|                14|          1584|       3020.55|      88241.0|
| France|         17|                 2|           179|        468.48|      88241.0|
| France|         18|                11|           428|       1599.14|      88241.0|
| France|         18|                 3|          1882|       917

## Window functions (Spark SQL): frame variations

In [24]:
# Sliding sum per country in descending week order: the two rows before the
# current row plus the current row (at most three rows per frame).
spark.sql("""
    select *,
    round(
    sum(invoice_values)
    over(partition by country order by week_number desc rows between 2 preceding and current row ))
    as running_sum
    from df_table
          """).show()

+-------+-----------+------------------+--------------+--------------+-----------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_sum|
+-------+-----------+------------------+--------------+--------------+-----------+
| France|         52|                 4|          1511|        950.36|      950.0|
| France|         51|                 4|           592|       2905.93|     3856.0|
| France|         48|                 4|           977|       2260.87|     6117.0|
| France|         47|                 9|          1644|       3987.62|     9154.0|
| France|         47|                 2|          1907|        968.93|     7217.0|
| France|         47|                 7|          1240|        720.77|     5677.0|
| France|         45|                13|           107|         55.98|     1746.0|
| France|         45|                17|           663|        737.53|     1514.0|
| France|         42|                 1|           428|       1558.82|     2352.0|
| Fr

In [25]:
# Sum of the current row and the single row that follows it, per country in
# descending week order.
spark.sql("""
    select *,
    round(
    sum(invoice_values)
    over(partition by country order by week_number desc rows between  current row and 1 following))
    as running_sum
    from df_table
          """).show()

+-------+-----------+------------------+--------------+--------------+-----------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_sum|
+-------+-----------+------------------+--------------+--------------+-----------+
| France|         52|                 4|          1511|        950.36|     3856.0|
| France|         51|                 4|           592|       2905.93|     5167.0|
| France|         48|                 4|           977|       2260.87|     6248.0|
| France|         47|                 9|          1644|       3987.62|     4957.0|
| France|         47|                 2|          1907|        968.93|     1690.0|
| France|         47|                 7|          1240|        720.77|      777.0|
| France|         45|                13|           107|         55.98|      794.0|
| France|         45|                17|           663|        737.53|     2296.0|
| France|         42|                 1|           428|       1558.82|     2136.0|
| Fr

In [26]:
# Sum of only the two rows before the current row (current row excluded).
# The first row of each partition has an empty frame, so its result is null.
spark.sql("""
    select *,
    round(
    sum(invoice_values)
    over(partition by country order by week_number desc rows between 2 preceding and 1 preceding ))
    as running_sum
    from df_table
          """).show()

+-------+-----------+------------------+--------------+--------------+-----------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_sum|
+-------+-----------+------------------+--------------+--------------+-----------+
| France|         52|                 4|          1511|        950.36|       null|
| France|         51|                 4|           592|       2905.93|      950.0|
| France|         48|                 4|           977|       2260.87|     3856.0|
| France|         47|                 9|          1644|       3987.62|     5167.0|
| France|         47|                 2|          1907|        968.93|     6248.0|
| France|         47|                 7|          1240|        720.77|     4957.0|
| France|         45|                13|           107|         55.98|     1690.0|
| France|         45|                17|           663|        737.53|      777.0|
| France|         42|                 1|           428|       1558.82|      794.0|
| Fr

In [27]:
# Sum of only the two rows after the current row (current row excluded),
# per country in descending week order.
spark.sql("""
    select *,
    round(
    sum(invoice_values)
    over(partition by country order by week_number desc rows between 1 following and 2 following ))
    as running_sum
    from df_table
          """).show()

+-------+-----------+------------------+--------------+--------------+-----------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_sum|
+-------+-----------+------------------+--------------+--------------+-----------+
| France|         52|                 4|          1511|        950.36|     5167.0|
| France|         51|                 4|           592|       2905.93|     6248.0|
| France|         48|                 4|           977|       2260.87|     4957.0|
| France|         47|                 9|          1644|       3987.62|     1690.0|
| France|         47|                 2|          1907|        968.93|      777.0|
| France|         47|                 7|          1240|        720.77|      794.0|
| France|         45|                13|           107|         55.98|     2296.0|
| France|         45|                17|           663|        737.53|     2136.0|
| France|         42|                 1|           428|       1558.82|     2534.0|
| Fr

In [28]:
# Whole-partition frame: every row gets the total invoice_values of its
# country. The ORDER BY only affects the displayed order here, since the
# frame always covers the entire partition.
spark.sql("""
    select *,
    round(
    sum(invoice_values)
    over(partition by country order by week_number desc rows between unbounded preceding and unbounded following ))
    as running_sum
    from df_table
          """).show()

+-------+-----------+------------------+--------------+--------------+-----------+
|country|week_number|number_of_invoices|total_quantity|invoice_values|running_sum|
+-------+-----------+------------------+--------------+--------------+-----------+
| France|         52|                 4|          1511|        950.36|    88241.0|
| France|         51|                 4|           592|       2905.93|    88241.0|
| France|         48|                 4|           977|       2260.87|    88241.0|
| France|         47|                 9|          1644|       3987.62|    88241.0|
| France|         47|                 2|          1907|        968.93|    88241.0|
| France|         47|                 7|          1240|        720.77|    88241.0|
| France|         45|                13|           107|         55.98|    88241.0|
| France|         45|                17|           663|        737.53|    88241.0|
| France|         42|                 1|           428|       1558.82|    88241.0|
| Fr