# DROP() + DISTINCT() + UNIQUE()

In [11]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = (
    SparkSession.builder
    .appName("example-drop-distinct-unique")
    .getOrCreate()
)

In [12]:

data = [("James", "Sales", 3000), \
        ("Michael", "Sales", 4600), \
        ("Robert", "Sales", 4100), \
        ("Maria", "Finance", 3000), \
        ("James", "Sales", 3000), \
        ("Scott", "Finance", 3300), \
        ("Jen", "Finance", 3900), \
        ("Jeff", "Marketing", 3000), \
        ("Kumar", "Marketing", 2000), \
        ("Saif", "Sales", 4100)]

columns= ["employee_name", "department", "salary"]

df = spark.createDataFrame(data = data, schema = columns)

df.printSchema()
df.show(truncate=False)


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [13]:
# Get Distinct Rows (By Comparing All Columns)

distinctDF = df.distinct()
print("Distinct count: "+str(distinctDF.count()))
distinctDF.show(truncate=False)


Distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Jen          |Finance   |3900  |
|Michael      |Sales     |4600  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Jeff         |Marketing |3000  |
|Saif         |Sales     |4100  |
|Maria        |Finance   |3000  |
+-------------+----------+------+



In [14]:
# you can also run dropDuplicates() function which returns a new DataFrame after removing duplicate rows
df2 = df.dropDuplicates()
print("Distinct count: "+str(df2.count()))
df2.show(truncate=False)

Distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Jen          |Finance   |3900  |
|Michael      |Sales     |4600  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Jeff         |Marketing |3000  |
|Saif         |Sales     |4100  |
|Maria        |Finance   |3000  |
+-------------+----------+------+



In [15]:
# PySpark Distinct of Selected Multiple Columns

dropDisDF = df.dropDuplicates(["department","salary"])
print("Distinct count of department & salary : "+str(dropDisDF.count()))
dropDisDF.show(truncate=False)


Distinct count of department & salary : 8
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Jen          |Finance   |3900  |
|Maria        |Finance   |3000  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Jeff         |Marketing |3000  |
+-------------+----------+------+



Which of the following code blocks returns only rows from DataFrame transactionsDf in which values in column productId are unique?

In [27]:
data = [(1, 3, 4, 25, 1, None, 1587915332),
         (2, 6, 7, 2, 2, None, 1586815312),
         (3, 3, None, 25, 3, None, 1585824821),
         (4, None, None, 3, 2, None, 1583244275),
         (5, None, None, None, 2, None, 1575285427),
         (6, 3, 2, 25, 2, None, 1572733275)]

schema = StructType([StructField('transactionId', IntegerType(), True),
                     StructField('predError', IntegerType(), True),
                     StructField('value', IntegerType(), True),
                     StructField('storeId', IntegerType(), True),
                     StructField('productId', IntegerType(), True),
                     StructField('f', IntegerType(), True),
                     StructField('transactionDate', LongType(), True)])

transactionsDf = spark.createDataFrame(data=data, schema=schema)
transactionsDf.show()

+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            1|        3|    4|     25|        1|null|     1587915332|
|            2|        6|    7|      2|        2|null|     1586815312|
|            3|        3| null|     25|        3|null|     1585824821|
|            4|     null| null|      3|        2|null|     1583244275|
|            5|     null| null|   null|        2|null|     1575285427|
|            6|        3|    2|     25|        2|null|     1572733275|
+-------------+---------+-----+-------+---------+----+---------------+



In [20]:
transactionsDf.dropDuplicates(subset=["productId"]).show()

+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            1|        3|    4|     25|        1|null|     1587915332|
|            3|        3| null|     25|        3|null|     1585824821|
|            2|        6|    7|      2|        2|null|     1586815312|
+-------------+---------+-----+-------+---------+----+---------------+



Which of the following code blocks returns a DataFrame where columns predError and productId are removed from DataFrame transactionsDf?
>
Sample of DataFrame transactionsDf:
>
- `+————-+———+—–+——-+———+—-+`
- `|transactionId|predError|value|storeId|productId|f |`
- `+————-+———+—–+——-+———+—-+`
- `|1 |3 |4 |25 |1 |null|`
- `|2 |6 |7 |2 |2 |null|`
- `|3 |3 |null |25 |3 |null|`
- `+————-+———+—–+——-+———+—-+`
>
- `transactionsDf.withColumnRemoved("predError", "productId")`
- `transactionsDf.drop(["predError", "productId", "associateId"])`
- `transactionsDf.drop("predError", "productId", "associateId")`
- `transactionsDf.dropColumns("predError", "productId", "associateId")`
- `transactionsDf.drop(col("predError", "productId"))`

In [23]:
# TypeError: col should be a string or a Column
transactionsDf.drop(["predError", "productId", "associateId"])

# AttributeError: 'DataFrame' object has no attribute 'dropColumns'
transactionsDf.dropColumns("predError", "productId", "associateId")    

# TypeError: _() takes 1 positional argument but 2 were given
transactionsDf.drop(col("predError", "productId"))

TypeError: col should be a string or a Column

In [29]:
transactionsDf.drop("predError", "productId", "associateId").show()

+-------------+-----+-------+----+---------------+
|transactionId|value|storeId|   f|transactionDate|
+-------------+-----+-------+----+---------------+
|            1|    4|     25|null|     1587915332|
|            2|    7|      2|null|     1586815312|
|            3| null|     25|null|     1585824821|
|            4| null|      3|null|     1583244275|
|            5| null|   null|null|     1575285427|
|            6|    2|     25|null|     1572733275|
+-------------+-----+-------+----+---------------+

