# distinct().count() + countDistinct()

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the Spark session for this notebook, named "example-count-distinct".
spark = SparkSession.builder.appName("example-count-distinct").getOrCreate()

In [2]:
# Sample employee records as (name, department, salary) tuples.
# ("James", "Sales", 3000) appears twice, giving exactly one fully duplicated row.
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]
columns = ["Name", "Dept", "Salary"]

# Build the DataFrame from the in-memory rows and display it.
df = spark.createDataFrame(data, schema=columns)
df.show()


+-------+---------+------+
|   Name|     Dept|Salary|
+-------+---------+------+
|  James|    Sales|  3000|
|Michael|    Sales|  4600|
| Robert|    Sales|  4100|
|  Maria|  Finance|  3000|
|  James|    Sales|  3000|
|  Scott|  Finance|  3300|
|    Jen|  Finance|  3900|
|   Jeff|Marketing|  3000|
|  Kumar|Marketing|  2000|
|   Saif|    Sales|  4100|
+-------+---------+------+



## Using DataFrame distinct() and count()
The DataFrame above has 10 rows in total, one of which duplicates another row in every column. Performing a distinct count (`distinct().count()`) on this DataFrame should therefore return 9.

In [3]:
# distinct() drops rows that are duplicated across every column;
# count() then returns how many rows remain.
distinct_row_count = df.distinct().count()
print("Distinct Count: " + str(distinct_row_count))

Distinct Count: 9


## Using countDistinct() SQL Function
>
DataFrame `distinct()` returns a new DataFrame after eliminating duplicate rows (distinct on all columns). If you want a distinct count over selected columns only, use the PySpark SQL function `countDistinct()`. This function returns the number of distinct elements in a group.
>
In order to use this function, you need to import it first.

In [5]:
from pyspark.sql.functions import countDistinct

# Aggregate column counting the unique (Dept, Salary) combinations.
dept_salary_distinct = countDistinct("Dept", "Salary")

# Selecting the aggregate produces a one-row DataFrame holding the count.
df2 = df.select(dept_salary_distinct)
df2.show()


+----------------------------+
|count(DISTINCT Dept, Salary)|
+----------------------------+
|                           8|
+----------------------------+



Note that `countDistinct()` returns a Column, so selecting it gives you a DataFrame rather than a plain number; you need to collect the result to get the value out of the DataFrame. This function can be used to get the distinct count over any number of selected columns, or over all columns.

In [6]:
# df2 holds a single row with a single column; fetch that row to the driver
# and read its first field to get the count as a plain value.
first_row = df2.collect()[0]
print("Distinct Count of Department & Salary: "+ str(first_row[0]))

Distinct Count of Department & Salary: 8


## Using SQL to get Count Distinct

In [7]:
df.createOrReplaceTempView("EMP")
spark.sql("select distinct(count(*)) from EMP").show()

+--------+
|count(1)|
+--------+
|      10|
+--------+



In [18]:
# DataFrame.select() interprets a plain string as a column name, so passing
# "count(distinct Dept)" directly fails to resolve on a fresh run.
# Wrapping the text in expr() makes Spark parse it as a SQL expression instead.
df.select(expr("count(distinct Dept)")).show()

+--------------------+
|count(DISTINCT Dept)|
+--------------------+
|                   3|
+--------------------+



In [19]:
# Number of distinct departments, using the countDistinct() column function.
dept_count_df = df.select(countDistinct("Dept"))
dept_count_df.show()

+--------------------+
|count(DISTINCT Dept)|
+--------------------+
|                   3|
+--------------------+



In [20]:
# selectExpr() takes SQL expression strings, so the aggregate can be written
# in SQL syntax directly.
dept_count_sql = "count(distinct Dept)"
df.selectExpr(dept_count_sql).show()

+--------------------+
|count(DISTINCT Dept)|
+--------------------+
|                   3|
+--------------------+



In [None]:
# countDistinct is a Python DataFrame-API helper, not a Spark SQL function, so
# using it inside a SQL expression string fails with
# AnalysisException: Undefined function: 'countDistinct'.
# The failure is the point of this cell, so it is caught and printed rather
# than left to abort a top-to-bottom run of the notebook.
try:
    df.selectExpr("countDistinct(Dept)").show()
except pyspark.sql.utils.AnalysisException as exc:
    print("AnalysisException: " + str(exc))