### Interview Question

- Let's assume our dataset contains duplicate records for a particular key. We need to remove the duplicates while keeping, for each column, the maximum value found among the duplicate records. The steps below develop the logic for this requirement.

In [1]:
from pyspark.sql import SparkSession

# Obtain a SparkSession for this notebook, registered under the
# application name "Max_Over()".
spark = (
    SparkSession.builder
    .appName("Max_Over()")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/23 11:07:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


- Sample data: product records in which the same `Product_id` appears more than once

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [8]:
# Five product rows; Product_id 100 appears twice and 200 three times,
# each time with a different Price / DiscountPercent.
sample_data = (
    (100, "Mobile", 5000, 10),
    (100, "Mobile", 7000, 7),
    (200, "Laptop", 20000, 4),
    (200, "Laptop", 25000, 8),
    (200, "Laptop", 22000, 12),
)

# Explicit schema built field by field; every column is non-nullable.
defSchema = (
    StructType()
    .add("Product_id", IntegerType(), nullable=False)
    .add("Product_name", StringType(), nullable=False)
    .add("Price", IntegerType(), nullable=False)
    .add("DiscountPercent", IntegerType(), nullable=False)
)

df = spark.createDataFrame(sample_data, defSchema)
df.show()

+----------+------------+-----+---------------+
|Product_id|Product_name|Price|DiscountPercent|
+----------+------------+-----+---------------+
|       100|      Mobile| 5000|             10|
|       100|      Mobile| 7000|              7|
|       200|      Laptop|20000|              4|
|       200|      Laptop|25000|              8|
|       200|      Laptop|22000|             12|
+----------+------------+-----+---------------+



#### Max Over() window function

In [9]:
from pyspark.sql import Window
from pyspark.sql.functions import max, col

In [10]:
# Partition rows by Product_id with no ordering, so the aggregate is
# evaluated over all rows that share the same Product_id.
windowSpec = Window.partitionBy("Product_id")

# Window expressions for the per-product maxima (here `max` is the
# pyspark.sql.functions aggregate, not Python's built-in).
max_price_expr = max("Price").over(windowSpec)
max_discount_expr = max("DiscountPercent").over(windowSpec)

# Append both maxima to every original row; no rows are removed yet.
dfMax = (
    df
    .withColumn("max_price", max_price_expr)
    .withColumn("maxDiscountPercent", max_discount_expr)
)

dfMax.show()

+----------+------------+-----+---------------+---------+------------------+
|Product_id|Product_name|Price|DiscountPercent|max_price|maxDiscountPercent|
+----------+------------+-----+---------------+---------+------------------+
|       100|      Mobile| 5000|             10|     7000|                10|
|       100|      Mobile| 7000|              7|     7000|                10|
|       200|      Laptop|20000|              4|    25000|                12|
|       200|      Laptop|25000|              8|    25000|                12|
|       200|      Laptop|22000|             12|    25000|                12|
+----------+------------+-----+---------------+---------+------------------+



In [11]:
# Inspect the class of the object returned by Window.partitionBy(...).
type(windowSpec)

pyspark.sql.window.WindowSpec

#### Select Max Columns

In [12]:
# Keep the identifying columns as they are and expose the computed maxima
# under the original column names (Price, DiscountPercent).
dfSel = dfMax.select(
    "Product_id",
    "Product_name",
    dfMax["max_price"].alias("Price"),
    dfMax["maxDiscountPercent"].alias("DiscountPercent"),
)
dfSel.show()

+----------+------------+-----+---------------+
|Product_id|Product_name|Price|DiscountPercent|
+----------+------------+-----+---------------+
|       100|      Mobile| 7000|             10|
|       100|      Mobile| 7000|             10|
|       200|      Laptop|25000|             12|
|       200|      Laptop|25000|             12|
|       200|      Laptop|25000|             12|
+----------+------------+-----+---------------+



#### Remove Duplicates

In [15]:
# Rows are compared on all columns; for this sample every row of a product
# is now identical, so one row per product remains.
dfOut = dfSel.distinct()
dfOut.show()

+----------+------------+-----+---------------+
|Product_id|Product_name|Price|DiscountPercent|
+----------+------------+-----+---------------+
|       100|      Mobile| 7000|             10|
|       200|      Laptop|25000|             12|
+----------+------------+-----+---------------+



### Complete code

In [16]:
from pyspark.sql import Window
from pyspark.sql.functions import col, max  # NOTE: this `max` shadows Python's built-in max

# Step 1: window that groups rows by Product_id (no ordering).
windowSpec = Window.partitionBy("Product_id")

# Step 2: add the per-product maximum of Price and DiscountPercent
# to every row as max_price / maxDiscountPercent.
dfMax = (
    df
    .withColumn("max_price", max("Price").over(windowSpec))
    .withColumn("maxDiscountPercent", max("DiscountPercent").over(windowSpec))
)

# Step 3: keep the key columns and publish the maxima under the
# original column names.
dfSel = dfMax.select(
    "Product_id",
    "Product_name",
    col("max_price").alias("Price"),
    col("maxDiscountPercent").alias("DiscountPercent"),
)

# Step 4: remove rows that are identical across all selected columns.
dfOut = dfSel.dropDuplicates()
dfOut.show()

+----------+------------+-----+---------------+
|Product_id|Product_name|Price|DiscountPercent|
+----------+------------+-----+---------------+
|       100|      Mobile| 7000|             10|
|       200|      Laptop|25000|             12|
+----------+------------+-----+---------------+

