# DROP() + DISTINCT() + UNIQUE()

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = (
    SparkSession.builder
    .appName("example-drop-distinct-unique")
    .getOrCreate()
)

In [2]:

data = [("James", "Sales", 3000), \
        ("Michael", "Sales", 4600), \
        ("Robert", "Sales", 4100), \
        ("Maria", "Finance", 3000), \
        ("James", "Sales", 3000), \
        ("Scott", "Finance", 3300), \
        ("Jen", "Finance", 3900), \
        ("Jeff", "Marketing", 3000), \
        ("Kumar", "Marketing", 2000), \
        ("Saif", "Sales", 4100)]

columns= ["employee_name", "department", "salary"]

df = spark.createDataFrame(data = data, schema = columns)

df.printSchema()
df.show(truncate=False)


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [3]:
# Get Distinct Rows (By Comparing All Columns)

distinctDF = df.distinct()
print("Distinct count: "+str(distinctDF.count()))
distinctDF.show(truncate=False)


Distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Jen          |Finance   |3900  |
|Michael      |Sales     |4600  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Jeff         |Marketing |3000  |
|Saif         |Sales     |4100  |
|Maria        |Finance   |3000  |
+-------------+----------+------+



In [4]:
# you can also run dropDuplicates() function which returns a new DataFrame after removing duplicate rows
df2 = df.dropDuplicates()
print("Distinct count: "+str(df2.count()))
df2.show(truncate=False)

Distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Jen          |Finance   |3900  |
|Michael      |Sales     |4600  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Jeff         |Marketing |3000  |
|Saif         |Sales     |4100  |
|Maria        |Finance   |3000  |
+-------------+----------+------+



In [5]:
# PySpark Distinct of Selected Multiple Columns

dropDisDF = df.dropDuplicates(["department","salary"])
print("Distinct count of department & salary : "+str(dropDisDF.count()))
dropDisDF.show(truncate=False)


Distinct count of department & salary : 8
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Jen          |Finance   |3900  |
|Maria        |Finance   |3000  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Jeff         |Marketing |3000  |
+-------------+----------+------+



In [23]:
df.first().employee_name


'James'

In [11]:
from pyspark.sql.functions import approx_count_distinct

df.agg(approx_count_distinct(col("department"),0).alias("department"))


Py4JError: An error occurred while calling z:org.apache.spark.sql.functions.approx_count_distinct. Trace:
py4j.Py4JException: Method approx_count_distinct([class org.apache.spark.sql.Column, class java.lang.Integer]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:341)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:362)
	at py4j.Gateway.invoke(Gateway.java:289)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:251)
	at java.lang.Thread.run(Thread.java:748)



Which of the following code blocks returns only rows from DataFrame transactionsDf in which values in column productId are unique?

In [3]:
data = [(1, 3, 4, 25, 1, None, 1587915332),
         (2, 6, 7, 2, 2, None, 1586815312),
         (3, 3, None, 25, 3, None, 1585824821),
         (4, None, None, 3, 2, None, 1583244275),
         (5, None, None, None, 2, None, 1575285427),
         (6, 3, 2, 25, 2, None, 1572733275)]

schema = StructType([StructField('transactionId', IntegerType(), True),
                     StructField('predError', IntegerType(), True),
                     StructField('value', IntegerType(), True),
                     StructField('storeId', IntegerType(), True),
                     StructField('productId', IntegerType(), True),
                     StructField('f', IntegerType(), True),
                     StructField('transactionDate', LongType(), True)])

transactionsDf = spark.createDataFrame(data=data, schema=schema)
transactionsDf.show()

+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            1|        3|    4|     25|        1|null|     1587915332|
|            2|        6|    7|      2|        2|null|     1586815312|
|            3|        3| null|     25|        3|null|     1585824821|
|            4|     null| null|      3|        2|null|     1583244275|
|            5|     null| null|   null|        2|null|     1575285427|
|            6|        3|    2|     25|        2|null|     1572733275|
+-------------+---------+-----+-------+---------+----+---------------+



In [None]:
transactionsDf.dropDuplicates(subset=["productId"]).show()

Which of the following code blocks returns a DataFrame where columns predError and productId are removed from DataFrame transactionsDf?
>
Sample of DataFrame transactionsDf:
>
- `+————-+———+—–+——-+———+—-+`
- `|transactionId|predError|value|storeId|productId|f |`
- `+————-+———+—–+——-+———+—-+`
- `|1 |3 |4 |25 |1 |null|`
- `|2 |6 |7 |2 |2 |null|`
- `|3 |3 |null |25 |3 |null|`
- `+————-+———+—–+——-+———+—-+`
>
- `transactionsDf.withColumnRemoved("predError", "productId")`
- `transactionsDf.drop(["predError", "productId", "associateId"])`
- `transactionsDf.drop("predError", "productId", "associateId")`
- `transactionsDf.dropColumns("predError", "productId", "associateId")`
- `transactionsDf.drop(col("predError", "productId"))`

In [None]:
transactionsDf.drop("predError", "productId", "associateId").show()

In [None]:
# TypeError: col should be a string or a Column
transactionsDf.drop(["predError", "productId", "associateId"])

# AttributeError: 'DataFrame' object has no attribute 'dropColumns'
transactionsDf.dropColumns("predError", "productId", "associateId")    

# TypeError: _() takes 1 positional argument but 2 were given
transactionsDf.drop(col("predError", "productId"))

In [10]:
transactionsDf.select("storeId").distinct().show()

+-------+
|storeId|
+-------+
|   null|
|      3|
|     25|
|      2|
+-------+



In [4]:
# TypeError: 'Column' object is not callable
transactionsDf["storeId"].distinct()

# AnalysisException: filter expression '`storeId`' of type int is not a boolean
transactionsDf.filter("storeId").distinct()

# TypeError: 'Column' object is not callable
transactionsDf.select(col("storeId").distinct())

# TypeError: distinct() takes 1 positional argument but 2 were given
transactionsDf.distinct("storeId")

TypeError: 'Column' object is not callable