[PySpark Aggregate Functions](https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/)

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a SparkSession that runs locally
# with 5 worker threads.
spark = (
    SparkSession.builder
    .master("local[5]")
    .appName("Aggregate Functions")
    .getOrCreate()
)

# Bare last expression: display the session as the cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/16 21:04:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Sample rows in (employee_name, department, salary) order.
# ("James", "Sales", 3000) is present twice, so aggregates that keep
# duplicates and aggregates that drop them give different results.
simpleData = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

# Column names only; the column types are inferred from the data.
schema = ["employee_name", "department", "salary"]

df = spark.createDataFrame(data=simpleData, schema=schema)

# Show the inferred schema, then every row without truncating cell values.
df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)



                                                                                

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [3]:
from pyspark.sql import functions

# approx_count_distinct() - approximate number of distinct values in a column.
# The result is an estimate, not an exact count; the tolerated error can be
# tuned with the optional `rsd` argument.
salary_estimate_row = df.select(functions.approx_count_distinct("salary")).collect()[0]
salary_estimate_row[0]

23/12/16 21:04:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

6

In [4]:
# avg() - arithmetic mean of the values in the input column.
# A global aggregate yields a single row; take its only field as a scalar.
avg_salary_row = df.select(functions.avg("salary")).collect()[0]
avg_salary_row[0]

3400.0

In [5]:
# collect_list() - gathers every value of the column into one array,
# keeping duplicates ("James" shows up twice in the result).
employee_names = df.select(functions.collect_list("employee_name"))
employee_names.show(truncate=False)

+---------------------------------------------------------------------+
|collect_list(employee_name)                                          |
+---------------------------------------------------------------------+
|[James, Michael, Robert, Maria, James, Scott, Jen, Jeff, Kumar, Saif]|
+---------------------------------------------------------------------+



In [6]:
# collect_set() - gathers the values of the column into one array with
# duplicates removed; the order of the elements is not guaranteed.
departments = df.select(functions.collect_set("department"))
departments.show(truncate=False)

+---------------------------+
|collect_set(department)    |
+---------------------------+
|[Finance, Sales, Marketing]|
+---------------------------+



In [10]:
# count_distinct() - number of distinct value combinations across the given
# columns; here, distinct (department, salary) pairs.
# countDistinct() is a legacy alias of count_distinct(); the PySpark docs
# recommend calling count_distinct() (available since Spark 3.2) directly.
dept_salary_distinct = df.select(functions.count_distinct("department", "salary"))
dept_salary_distinct.show(truncate=False)

# Reuse the same DataFrame to extract the scalar instead of rebuilding the
# identical aggregate a second time.
dept_salary_distinct.collect()[0][0]

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|8                                 |
+----------------------------------+



8

In [12]:
# count() - number of rows whose value in the given column is not null.
# collect() is kept un-indexed so the cell output shows the full Row.
salary_count = df.select(functions.count("salary"))
salary_count.collect()

[Row(count(salary)=10)]

In [14]:
# first() - first value of the column that Spark sees
# last()  - last value of the column that Spark sees
# Neither skips nulls by default; pass ignorenulls=True to get the first /
# last non-null value instead. Both depend on row order, which is not
# guaranteed after a shuffle.
for pick_edge_value in (functions.first, functions.last):
    df.select(pick_edge_value("salary")).show(truncate=False)

+-------------+
|first(salary)|
+-------------+
|3000         |
+-------------+

+------------+
|last(salary)|
+------------+
|4100        |
+------------+



In [15]:
# min(), max(), skewness(), kurtosis() - one single-row table per aggregate,
# printed in that order. mean() is an alias of avg() and is not repeated here.
for salary_aggregate in (
    functions.min,
    functions.max,
    functions.skewness,
    functions.kurtosis,
):
    df.select(salary_aggregate("salary")).show(truncate=False)

+-----------+
|min(salary)|
+-----------+
|2000       |
+-----------+

+-----------+
|max(salary)|
+-----------+
|4600       |
+-----------+

+--------------------+
|skewness(salary)    |
+--------------------+
|-0.12041791181069571|
+--------------------+

+-------------------+
|kurtosis(salary)   |
+-------------------+
|-0.6467803030303032|
+-------------------+



In [16]:
# Spread of the salary column, as sample and population statistics.
#   stddev()   is an alias of stddev_samp()
#   variance() is an alias of var_samp()
# The *_samp variants divide by (n - 1); the *_pop variants divide by n.
stddev_columns = (
    functions.stddev("salary"),
    functions.stddev_samp("salary"),
    functions.stddev_pop("salary"),
)
variance_columns = (
    functions.variance("salary"),
    functions.var_samp("salary"),
    functions.var_pop("salary"),
)

# One table per family: standard deviations first, then variances.
for spread_columns in (stddev_columns, variance_columns):
    df.select(*spread_columns).show(truncate=False)

+-----------------+-------------------+------------------+
|stddev(salary)   |stddev_samp(salary)|stddev_pop(salary)|
+-----------------+-------------------+------------------+
|765.9416862050705|765.9416862050705  |726.636084983398  |
+-----------------+-------------------+------------------+

+-----------------+-----------------+---------------+
|var_samp(salary) |var_samp(salary) |var_pop(salary)|
+-----------------+-----------------+---------------+
|586666.6666666666|586666.6666666666|528000.0       |
+-----------------+-----------------+---------------+



In [17]:
# sum()          - total of every salary value (duplicates included)
# sum_distinct() - total of the distinct salary values only
# sum_distinct() replaces sumDistinct(), which is deprecated since Spark 3.2.
df.select(
    functions.sum("salary"),
    functions.sum_distinct("salary"),
).show(truncate=False)



+-----------+--------------------+
|sum(salary)|sum(DISTINCT salary)|
+-----------+--------------------+
|34000      |20900               |
+-----------+--------------------+



In [18]:
# Stop the Spark session now that the walkthrough is finished; `spark` and
# `df` can no longer run Spark jobs after this cell.
spark.stop()