# PySpark Aggregate Functions

- approx_count_distinct
- avg
- collect_list
- collect_set
- countDistinct
- count
- grouping
- first
- last
- kurtosis
- max
- min
- mean
- skewness
- stddev
- stddev_samp
- stddev_pop
- sum
- sumDistinct
- variance, var_samp, var_pop

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build the notebook's SparkSession under the application name "example-08".
# getOrCreate() hands back an already-running session when there is one.
spark = SparkSession.builder.appName("example-08").getOrCreate()

In [2]:
# Sample employee records: (employee_name, department, state, salary, age, bonus).
# Note that the salary 90000 appears twice (James and Maria), which is what makes
# the "distinct" aggregates further down differ from the plain ones.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

# Only column names are supplied; Spark infers the column types from the data.
schema = ["employee_name", "department", "state", "salary", "age", "bonus"]

df = spark.createDataFrame(simpleData, schema)

df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [3]:
# groupBy + aggregate: total salary per department.
(
    df
    .groupBy("department")
    .sum("salary")
    .show(truncate=False)
)

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|Sales     |257000     |
|Finance   |351000     |
|Marketing |171000     |
+----------+-----------+



In [4]:
# approx_count_distinct: estimated number of distinct salary values.
# collect()[0][0] pulls the single scalar out of the one-row, one-column result.
print(
    "approx_count_distinct: "
    + str(df.select(approx_count_distinct("salary")).collect()[0][0])
)


approx_count_distinct: 8


In [5]:
# avg: arithmetic mean of the salary column, extracted as a plain Python value.
print(
    "avg: "
    + str(df.select(avg("salary")).collect()[0][0])
)

avg: 86555.55555555556


In [6]:
# collect_list: gathers every salary into one array, duplicates included.
(
    df
    .select(collect_list("salary"))
    .show(truncate=False)
)

+---------------------------------------------------------------+
|collect_list(salary)                                           |
+---------------------------------------------------------------+
|[90000, 86000, 81000, 90000, 99000, 83000, 79000, 80000, 91000]|
+---------------------------------------------------------------+



In [7]:
# collect_set: like collect_list but with duplicates removed; do not rely on
# the order of the elements in the resulting array.
(
    df
    .select(collect_set("salary"))
    .show(truncate=False)
)

+--------------------------------------------------------+
|collect_set(salary)                                     |
+--------------------------------------------------------+
|[79000, 83000, 91000, 99000, 90000, 80000, 86000, 81000]|
+--------------------------------------------------------+



In [8]:
# countDistinct: number of distinct (department, salary) combinations.
df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
# The same figure as a plain Python value: first column of the only row.
print(
    "Distinct Count of Department & Salary: "
    + str(df2.collect()[0][0])
)

+----------------------------------+
|count(DISTINCT department, salary)|
+----------------------------------+
|9                                 |
+----------------------------------+

Distinct Count of Department & Salary: 9


In [9]:
# count: number of rows with a non-null salary.
# Only the row is indexed here (collect()[0]), so the whole Row object is
# printed rather than the bare number; add a second [0] to get the scalar.
print(
    "count: "
    + str(df.select(count("salary")).collect()[0])
)

count: Row(count(salary)=9)


In [3]:
# first: salary taken from the first row of the DataFrame; the result depends
# on row order, which Spark only guarantees after an explicit sort.
(
    df
    .select(first("salary"))
    .show(truncate=False)
)

+-------------+
|first(salary)|
+-------------+
|90000        |
+-------------+



In [10]:
# DataFrame.first() returns the first Row; read its salary field by name.
df.first()["salary"]

90000

In [11]:
# last: salary taken from the last row of the DataFrame (order-dependent,
# just like first).
(
    df
    .select(last("salary"))
    .show(truncate=False)
)

+------------+
|last(salary)|
+------------+
|91000       |
+------------+



In [12]:
# kurtosis: kurtosis of the salary distribution.
(
    df
    .select(kurtosis("salary"))
    .show(truncate=False)
)

+-------------------+
|kurtosis(salary)   |
+-------------------+
|-0.6275168662506321|
+-------------------+



In [13]:
# max: largest salary. `max` here is the Spark column function pulled in by the
# wildcard import from pyspark.sql.functions, not Python's builtin max.
(
    df
    .select(max("salary"))
    .show(truncate=False)
)

+-----------+
|max(salary)|
+-----------+
|99000      |
+-----------+



In [14]:
# min: smallest salary. `min` here is the Spark column function pulled in by the
# wildcard import from pyspark.sql.functions, not Python's builtin min.
(
    df
    .select(min("salary"))
    .show(truncate=False)
)

+-----------+
|min(salary)|
+-----------+
|79000      |
+-----------+



In [15]:
# mean: alias of avg; the result column is reported as avg(salary).
(
    df
    .select(mean("salary"))
    .show(truncate=False)
)

+-----------------+
|avg(salary)      |
+-----------------+
|86555.55555555556|
+-----------------+



In [16]:
# skewness: skewness of the salary distribution.
(
    df
    .select(skewness("salary"))
    .show(truncate=False)
)


+------------------+
|skewness(salary)  |
+------------------+
|0.5530468967432594|
+------------------+



In [17]:
# stddev / stddev_samp / stddev_pop side by side.
# stddev is an alias of stddev_samp (sample standard deviation), so the first
# two columns are identical; stddev_pop is the population standard deviation.
(
    df
    .select(
        stddev("salary"),
        stddev_samp("salary"),
        stddev_pop("salary"),
    )
    .show(truncate=False)
)

+-------------------+-------------------+------------------+
|stddev_samp(salary)|stddev_samp(salary)|stddev_pop(salary)|
+-------------------+-------------------+------------------+
|6540.472290116195  |6540.472290116195  |6166.416411338492 |
+-------------------+-------------------+------------------+



In [18]:
# sum: total of all salaries. `sum` here is the Spark column function pulled in
# by the wildcard import from pyspark.sql.functions, not Python's builtin sum.
(
    df
    .select(sum("salary"))
    .show(truncate=False)
)

+-----------+
|sum(salary)|
+-----------+
|779000     |
+-----------+



In [19]:
# sumDistinct: total of the distinct salary values only, so the repeated 90000
# is counted once (689000 here versus 779000 for the plain sum).
(
    df
    .select(sumDistinct("salary"))
    .show(truncate=False)
)

+--------------------+
|sum(DISTINCT salary)|
+--------------------+
|689000              |
+--------------------+



In [20]:
# variance / var_samp / var_pop side by side.
# variance is an alias of var_samp (sample variance), so the first two columns
# are identical; var_pop is the population variance.
(
    df
    .select(
        variance("salary"),
        var_samp("salary"),
        var_pop("salary"),
    )
    .show(truncate=False)
)

+-------------------+-------------------+--------------------+
|var_samp(salary)   |var_samp(salary)   |var_pop(salary)     |
+-------------------+-------------------+--------------------+
|4.277777777777778E7|4.277777777777778E7|3.8024691358024694E7|
+-------------------+-------------------+--------------------+



The code block displayed below contains an error. The code block should return the average of rows in column value grouped by unique storeId. Find the error.

Code block:

transactionsDf.agg("storeId").avg("value")

In [3]:
# Answer: agg should be replaced by groupBy, i.e.
#     transactionsDf.groupBy("storeId").avg("value")
# Illustrated here on the employee DataFrame. Note that groupBy() with no
# columns treats the whole DataFrame as a single group, so this prints the
# overall average salary; pass a column name to get one average per group.
(
    df
    .groupBy()
    .avg("salary")
    .show(truncate=False)
)

+-----------------+
|avg(salary)      |
+-----------------+
|86555.55555555556|
+-----------------+



Which of the following code blocks returns a DataFrame showing the mean value of column “value” of DataFrame transactionsDf, grouped by its column storeId?
>
- `transactionsDf.groupBy(col(storeId).avg())`
- `transactionsDf.groupBy("storeId").avg(col("value"))`
- `transactionsDf.groupBy("storeId").agg(avg("value"))`
- `transactionsDf.groupBy("storeId").agg(average("value"))`
- `transactionsDf.groupBy("value").average()`

In [4]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Sample transactions: (transactionId, predError, value, storeId, productId, f,
# transactionDate). None marks a missing value; column f is missing throughout.
data = [
    (1, 3, 4, 25, 1, None, 1587915332),
    (2, 6, 7, 2, 2, None, 1586815312),
    (3, 3, None, 25, 3, None, 1585824821),
    (4, None, None, 3, 2, None, 1583244275),
    (5, None, None, None, 2, None, 1575285427),
    (6, 3, 2, 25, 2, None, 1572733275),
]

# Explicit schema: every column is a nullable integer, except transactionDate,
# which is a nullable long.
schema = (
    StructType()
    .add('transactionId', IntegerType(), True)
    .add('predError', IntegerType(), True)
    .add('value', IntegerType(), True)
    .add('storeId', IntegerType(), True)
    .add('productId', IntegerType(), True)
    .add('f', IntegerType(), True)
    .add('transactionDate', LongType(), True)
)

transactionsDf = spark.createDataFrame(data, schema)

In [7]:
# Correct option: group by storeId, then aggregate with the avg column function.
# Nulls are skipped when averaging (storeId 25: 4 and 2 -> 3.0); a group whose
# values are all null averages to null.
(
    transactionsDf
    .groupBy("storeId")
    .agg(avg("value"))
    .show()
)


+-------+----------+
|storeId|avg(value)|
+-------+----------+
|   null|      null|
|      3|      null|
|     25|       3.0|
|      2|       7.0|
+-------+----------+



In [6]:
# Equivalent shortcut: GroupedData.avg() accepts the column name directly and
# produces the same result as agg(avg("value")).
(
    transactionsDf
    .groupBy("storeId")
    .avg("value")
    .show()
)

+-------+----------+
|storeId|avg(value)|
+-------+----------+
|   null|      null|
|      3|      null|
|     25|       3.0|
|      2|       7.0|
+-------+----------+



In [None]:
# The remaining answer options are all invalid, each for a different reason.
# Written as bare statements, the first exception would abort the cell and the
# later options would never be evaluated. Each option is therefore wrapped in a
# lambda (so nothing is evaluated until it is called) and run on its own, and
# the exception it raises is printed instead of propagated.
wrong_options = [
    # Expected: NameError: name 'storeId' is not defined
    ('transactionsDf.groupBy(col(storeId).avg())',
     lambda: transactionsDf.groupBy(col(storeId).avg())),
    # Expected: TypeError: Column is not iterable
    ('transactionsDf.groupBy("storeId").avg(col("value"))',
     lambda: transactionsDf.groupBy("storeId").avg(col("value"))),
    # Expected: NameError: name 'average' is not defined
    ('transactionsDf.groupBy("storeId").agg(average("value"))',
     lambda: transactionsDf.groupBy("storeId").agg(average("value"))),
    # Expected: AttributeError: 'GroupedData' object has no attribute 'average'
    ('transactionsDf.groupBy("value").average()',
     lambda: transactionsDf.groupBy("value").average()),
]

for option, attempt in wrong_options:
    try:
        attempt()
    except Exception as err:  # broad on purpose: the error itself is the demonstration
        print("{} -> {}: {}".format(option, type(err).__name__, err))
    else:
        print("{} -> no exception raised".format(option))

You are given the following DataFrame.
>
```
+---+------+-----+
|Key|  Name|Score|
+---+------+-----+
|  1| Apple| 0.76|
|  1|Orange| 0.98|
|  1|Banana| 0.24|
|  2| Apple| 0.11|
|  2|Banana| 0.99|
+---+------+-----+
```
>
And you are asked to transform and generate the following output.
>
```
+---+-----------------------------------------------+
|Key|Collection                                     |
+---+-----------------------------------------------+
|1  |[[Apple, 0.76], [Orange, 0.98], [Banana, 0.24]]|
|2  |[[Apple, 0.11], [Banana, 0.99]]                |
+---+-----------------------------------------------+
```
>
Choose the correct code to generate the above output.

- `df.groupBy("Key", "Name", "Score") .agg(collect_list(struct("Name", "Score")).alias("Collection")) .show(truncate=0)`
- `df.groupBy("Key") .agg(collect_list(struct("Name", "Score")).alias("Collection")) .show(truncate=0)`
- `df.groupBy("Key") .agg(struct(struct("Name", "Score")).alias("Collection")) .show(truncate=0)`
- `df.groupBy("Key") .agg(arrayt(struct("Name", "Score")).alias("Collection")) .show(truncate=0)`

In [3]:
# Rebuild `df` for this exercise: one (Key, Name, Score) tuple per row.
data = [
    (1, 'Apple', 0.76),
    (1, 'Orange', 0.98),
    (1, 'Banana', 0.24),
    (2, 'Apple', 0.11),
    (2, 'Banana', 0.99),
]

# Only column names are supplied; Spark infers the column types from the data.
columns = ["Key", "Name", "Score"]

df = spark.createDataFrame(data, columns)

df.printSchema()
df.show()

root
 |-- Key: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Score: double (nullable = true)

+---+------+-----+
|Key|  Name|Score|
+---+------+-----+
|  1| Apple| 0.76|
|  1|Orange| 0.98|
|  1|Banana| 0.24|
|  2| Apple| 0.11|
|  2|Banana| 0.99|
+---+------+-----+



In [4]:
# Wrong option: grouping by Key, Name and Score puts every row in its own
# group, so each Collection holds a single (Name, Score) struct.
(
    df
    .groupBy("Key", "Name", "Score")
    .agg(collect_list(struct("Name", "Score")).alias("Collection"))
    .show(truncate=0)
)


+---+------+-----+----------------+
|Key|Name  |Score|Collection      |
+---+------+-----+----------------+
|2  |Apple |0.11 |[[Apple, 0.11]] |
|2  |Banana|0.99 |[[Banana, 0.99]]|
|1  |Apple |0.76 |[[Apple, 0.76]] |
|1  |Orange|0.98 |[[Orange, 0.98]]|
|1  |Banana|0.24 |[[Banana, 0.24]]|
+---+------+-----+----------------+



In [5]:
# Correct option: group by Key only and collect each row's (Name, Score)
# struct into one array per key.
(
    df
    .groupBy("Key")
    .agg(collect_list(struct("Name", "Score")).alias("Collection"))
    .show(truncate=0)
)


+---+-----------------------------------------------+
|Key|Collection                                     |
+---+-----------------------------------------------+
|1  |[[Apple, 0.76], [Orange, 0.98], [Banana, 0.24]]|
|2  |[[Apple, 0.11], [Banana, 0.99]]                |
+---+-----------------------------------------------+



In [None]:
# The two remaining answer options are invalid. Written as bare statements,
# the first exception would abort the cell and the second option would never
# be evaluated. Each option is therefore wrapped in a lambda and run on its
# own, and the exception it raises is printed instead of propagated.
wrong_options = [
    # Expected: AnalysisException: expression '`Name`' is neither present in
    # the group by, nor is it an aggregate function (struct is not an aggregate)
    ('agg(struct(struct("Name", "Score")))',
     lambda: df.groupBy("Key").agg(struct(struct("Name", "Score")).alias("Collection")).show(truncate=0)),
    # Expected: NameError: name 'arrayt' is not defined
    ('agg(arrayt(struct("Name", "Score")))',
     lambda: df.groupBy("Key").agg(arrayt(struct("Name", "Score")).alias("Collection")).show(truncate=0)),
]

for option, attempt in wrong_options:
    try:
        attempt()
    except Exception as err:  # broad on purpose: the error itself is the demonstration
        print("{} -> {}: {}".format(option, type(err).__name__, err))
    else:
        print("{} -> no exception raised".format(option))

Select all valid expressions to calculate the sum of Quantity for each InvoiceNo in the following DataFrame.
>
- `df.groupBy("InvoiceNo").agg("sum(Quantity)")`
- `df.groupBy("InvoiceNo").agg(sum("Quantity"))`
- `df.select("InvoiceNo", expr("sum(Quantity)"))`
- `df.groupBy("InvoiceNo").select("InvoiceNo", expr("sum(Quantity)"))`

In [2]:
# Rebuild `df` for this exercise: (InvoiceNo, StockCode, Quantity) rows.
# Every field, Quantity included, is stored as a string.
data_list = [
    ('536365', '85123A', '6'),
    ('536365', '85123B', '5'),
    ('536366', '85123C', '2'),
    ('536366', '85123D', '6'),
    ('536366', '85123E', '8'),
]

df = spark.createDataFrame(data_list, ['InvoiceNo', 'StockCode', 'Quantity'])

In [3]:
# Valid option: aggregate with the sum column function. Quantity holds strings,
# so Spark casts it for the sum and the totals come back as doubles (11.0, 16.0).
(
    df
    .groupBy("InvoiceNo")
    .agg(sum("Quantity"))
    .show()
)

+---------+-------------+
|InvoiceNo|sum(Quantity)|
+---------+-------------+
|   536365|         11.0|
|   536366|         16.0|
+---------+-------------+



In [5]:
# Invalid option, kept to show the failure: agg() does not parse a plain SQL
# string, so "sum(Quantity)" is rejected with
# "AssertionError: all exprs should be Column".
(
    df
    .groupBy("InvoiceNo")
    .agg("sum(Quantity)")
    .show()
)

AssertionError: all exprs should be Column