In [1]:
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one; otherwise start a new one
# registered under this application name.
spark = (
    SparkSession.builder
    .appName("Window_fun_First_Last")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/27 12:48:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/27 12:48:56 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


24/02/27 12:49:07 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [2]:
from pyspark.sql.functions import *

In [8]:
# Sample transactions as (customer_id, transaction_date, amount) tuples
data = [
    ("C1", "2023-06-01", 100.0),
    ("C1", "2023-06-02", 150.0),
    ("C1", "2023-06-03", 200.0),
    ("C2", "2023-06-02", 50.0),
    ("C2", "2023-06-03", 75.0),
    ("C2", "2023-06-04", 100.0),
]

# Build the DataFrame and convert the string transaction_date column to a
# date column in a single chained expression.
df = spark.createDataFrame(
    data, schema=["customer_id", "transaction_date", "amount"]
).withColumn("transaction_date", to_date("transaction_date"))

df.show()

+-----------+----------------+------+
|customer_id|transaction_date|amount|
+-----------+----------------+------+
|         C1|      2023-06-01| 100.0|
|         C1|      2023-06-02| 150.0|
|         C1|      2023-06-03| 200.0|
|         C2|      2023-06-02|  50.0|
|         C2|      2023-06-03|  75.0|
|         C2|      2023-06-04| 100.0|
+-----------+----------------+------+



##### Window Functions

In [9]:
from pyspark.sql.window import Window

In [10]:
# One window per customer, ordered by transaction date.
#
# first()/last() are only deterministic over an ORDERED window; with
# partitionBy alone they return whichever row happens to come first/last
# after the shuffle. Once an ordering is specified, the default frame ends at
# the current row, which would make last() return the current row's own date,
# so the frame is widened explicitly to cover the whole partition.
windowSpec = (
    Window.partitionBy("customer_id")
    .orderBy("transaction_date")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

In [11]:
# Keep every original column and append, for each row, the first and last
# transaction_date evaluated over windowSpec.
result_df = df.select(
    "*",
    first("transaction_date").over(window=windowSpec).alias("first_transaction_date"),
    last("transaction_date").over(window=windowSpec).alias("last_transaction_date"),
)

result_df.show()

+-----------+----------------+------+----------------------+---------------------+
|customer_id|transaction_date|amount|first_transaction_date|last_transaction_date|
+-----------+----------------+------+----------------------+---------------------+
|         C1|      2023-06-01| 100.0|            2023-06-01|           2023-06-03|
|         C1|      2023-06-02| 150.0|            2023-06-01|           2023-06-03|
|         C1|      2023-06-03| 200.0|            2023-06-01|           2023-06-03|
|         C2|      2023-06-02|  50.0|            2023-06-02|           2023-06-04|
|         C2|      2023-06-03|  75.0|            2023-06-02|           2023-06-04|
|         C2|      2023-06-04| 100.0|            2023-06-02|           2023-06-04|
+-----------+----------------+------+----------------------+---------------------+



Find the first and last transaction date for each customer

In [12]:
# Project only the customer id plus the windowed first/last dates, then
# de-duplicate so each distinct (customer, first, last) combination appears once.
result_df = df.select(
    "customer_id",
    first("transaction_date").over(window=windowSpec).alias("first_transaction_date"),
    last("transaction_date").over(window=windowSpec).alias("last_transaction_date"),
).distinct()

result_df.show()

+-----------+----------------------+---------------------+
|customer_id|first_transaction_date|last_transaction_date|
+-----------+----------------------+---------------------+
|         C1|            2023-06-01|           2023-06-03|
|         C2|            2023-06-02|           2023-06-04|
+-----------+----------------------+---------------------+



How to perform the same transformation using Spark SQL

- Convert the DataFrame to a temporary view

In [13]:
# Register df as a temporary view named "transaction" so it can be
# referenced by name from spark.sql(...) queries.
df.createOrReplaceTempView("transaction")

In [15]:
# SQL window function
#
# FIRST/LAST need an explicit ORDER BY inside the window to be deterministic;
# with PARTITION BY alone the result depends on physical row order. With an
# ORDER BY the default frame stops at the current row, so the frame is
# widened to the whole partition to make LAST return the partition's final date.
spark.sql(
    """SELECT DISTINCT customer_id,
    FIRST(transaction_date) OVER (
        PARTITION BY customer_id
        ORDER BY transaction_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS first_transaction_date,
    LAST(transaction_date) OVER (
        PARTITION BY customer_id
        ORDER BY transaction_date
        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
    ) AS last_transaction_date
FROM transaction
ORDER BY customer_id"""
).show()

+-----------+----------------------+---------------------+
|customer_id|first_transaction_date|last_transaction_date|
+-----------+----------------------+---------------------+
|         C1|            2023-06-01|           2023-06-03|
|         C2|            2023-06-02|           2023-06-04|
+-----------+----------------------+---------------------+

