In [1]:
import os, sys
# Point the Spark/Hadoop client at the cluster configuration and pin the Python
# interpreter used by both the driver and the executors. These are set before
# the pyspark imports and before any session is created.
# NOTE(review): HADOOP_USER_NAME is hard-coded to a personal account — consider
# supplying it from the launching environment instead.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'YARN_CONF_DIR': '/etc/hadoop/conf',
    'PYSPARK_PYTHON': 'python3.9',
    'PYSPARK_DRIVER_PYTHON': 'python3.9',
    'HADOOP_USER_NAME': 'ssenigov',
})

from pyspark.sql import SparkSession
from pyspark import SparkConf

In [None]:
# Application settings: a descriptive app name, with YARN as the master.
conf = SparkConf()
conf.setAppName('first_last_window_funcs')
conf.setMaster('yarn')

# Build the session from that configuration (getOrCreate reuses an existing
# session when one is already active).
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [3]:
# Inline sample data: eight orders (id + ISO date string) spread over
# September and October 2025, exposed to later cells as temp view `tbl_orders`.
sql = """
   select order_id, order_date 
    from values 
     (1, '2025-09-08'), (2, '2025-09-12'), 
     (5, '2025-09-14'), (6, '2025-09-24'), 
     (3, '2025-10-04'), (4, '2025-10-09'), 
     (7, '2025-10-22'), (8, '2025-10-28')
   as (order_id, order_date) """
orders = spark.sql(sql)
orders.createOrReplaceTempView('tbl_orders')

In [4]:
# For every order, derive its month key (first 7 chars of the date string,
# i.e. 'YYYY-MM') and attach the earliest order date within that month using
# first_value over a window partitioned by month and ordered by date.
# The result is registered as temp view `tbl_orders_2` and then displayed.
sql = """  
  with cte as (
  select 
    order_date, 
    substr(order_date, 1, 7) as order_mon
   from tbl_orders 
  )
  select 
    order_date, order_mon,
    first_value(order_date) over(partition by order_mon order by order_date)
      as date_first_mon_order
   from cte
  order by order_date """
orders_with_first = spark.sql(sql)
orders_with_first.createOrReplaceTempView('tbl_orders_2')
spark.table("tbl_orders_2").show()

[Stage 4:>                                                          (0 + 1) / 1]

+----------+---------+--------------------+
|order_date|order_mon|date_first_mon_order|
+----------+---------+--------------------+
|2025-09-08|  2025-09|          2025-09-08|
|2025-09-12|  2025-09|          2025-09-08|
|2025-09-14|  2025-09|          2025-09-08|
|2025-09-24|  2025-09|          2025-09-08|
|2025-10-04|  2025-10|          2025-10-04|
|2025-10-09|  2025-10|          2025-10-04|
|2025-10-22|  2025-10|          2025-10-04|
|2025-10-28|  2025-10|          2025-10-04|
+----------+---------+--------------------+



                                                                                

In [5]:
# Add days_diff: the number of days between each order and the first order of
# the same month (0 for the first order itself). The result is registered as
# temp view `tbl_orders_3` and then displayed.
sql = """  
  select 
    order_date, order_mon, date_first_mon_order,
    datediff(order_date, date_first_mon_order) days_diff
   from tbl_orders_2
  order by order_date """
orders_with_diff = spark.sql(sql)
orders_with_diff.createOrReplaceTempView('tbl_orders_3')
spark.table("tbl_orders_3").show()

+----------+---------+--------------------+---------+
|order_date|order_mon|date_first_mon_order|days_diff|
+----------+---------+--------------------+---------+
|2025-09-08|  2025-09|          2025-09-08|        0|
|2025-09-12|  2025-09|          2025-09-08|        4|
|2025-09-14|  2025-09|          2025-09-08|        6|
|2025-09-24|  2025-09|          2025-09-08|       16|
|2025-10-04|  2025-10|          2025-10-04|        0|
|2025-10-09|  2025-10|          2025-10-04|        5|
|2025-10-22|  2025-10|          2025-10-04|       18|
|2025-10-28|  2025-10|          2025-10-04|       24|
+----------+---------+--------------------+---------+



In [6]:
# Per month, gather the day offsets of every order except the first one
# (days_diff > 0 drops the month's first order, whose offset is 0).
# collect_list makes no promise about element order after the group-by
# shuffle, so the array is sorted explicitly with sort_array to keep the
# displayed result stable from run to run.
spark.sql("""  
  select 
    order_mon, 
    sort_array(collect_list(days_diff)) days_diffs
   from tbl_orders_3
   where days_diff > 0
  group by order_mon 
  order by order_mon """).show()

+---------+-----------+
|order_mon| days_diffs|
+---------+-----------+
|  2025-09| [4, 6, 16]|
|  2025-10|[5, 18, 24]|
+---------+-----------+



In [8]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()