# Capgemini (Hard Level) PySpark Interview Question

You are provided with a DataFrame containing customer purchase data. Your task is to:
- Identify and remove duplicate rows based on the `customer_id` column.
- If duplicates exist for a `customer_id`, retain only the row with the most recent `purchase_date`.

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *

In [0]:
# Sample purchases as (customer_id, product, purchase_date) tuples; every
# customer appears twice so the de-duplication step has something to remove.
data = [
    (1, "Laptop", "2024-08-01"),
    (1, "Mouse", "2024-08-05"),
    (2, "Keyboard", "2024-08-02"),
    (2, "Monitor", "2024-08-03"),
]

# Column names, matched positionally to the tuple fields above.
columns = ["customer_id", "product", "purchase_date"]

# NOTE(review): `spark` is not defined in this file; presumably the
# SparkSession injected by the notebook runtime -- confirm.
df = spark.createDataFrame(data, schema=columns)

In [0]:
# Replace the string-typed purchase_date column with a date-typed one.
# No format is passed to to_date(), so Spark's default date parsing applies.
purchase_date_as_date = to_date(col('purchase_date'))
df = df.withColumn('purchase_date', purchase_date_as_date)

# Show the converted DataFrame in the notebook.
df.display()

customer_id,product,purchase_date
1,Laptop,2024-08-01
1,Mouse,2024-08-05
2,Keyboard,2024-08-02
2,Monitor,2024-08-03


In [0]:
# Rank each customer's purchases from newest to oldest. `product` is a
# secondary sort key so that two purchases by the same customer on the same
# date always rank in the same order; with purchase_date alone, row_number()
# may assign rank 1 to either tied row and the kept row can vary between runs.
window_criteria = Window.partitionBy('customer_id').orderBy(
    col('purchase_date').desc(),
    col('product').asc(),
)

# Keep only the top-ranked (most recent) row per customer_id, then drop the
# helper ranking column so the result has the same columns as the input.
(df.withColumn('row_num', row_number().over(window_criteria))
    .filter(col('row_num') == 1)
    .drop('row_num')
    .display()
)

customer_id,product,purchase_date
1,Mouse,2024-08-05
2,Monitor,2024-08-03
