## Basic aggregations

### The following code creates a Spark DataFrame `df` with four columns named emp_id, region, sales, and customer:

In [11]:
import pyspark
from pyspark.sql import SparkSession
# Start a Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("Aggregations")
    .getOrCreate()
)

# Sample sales records, one tuple per row, paired with the column names
df = spark.createDataFrame(
    [
        (1, "north", 100, "walmart"),
        (2, "south", 300, "apple"),
        (3, "west", 200, "google"),
        (1, "east", 200, "google"),
        (2, "north", 100, "walmart"),
        (3, "west", 300, "apple"),
        (1, "north", 200, "walmart"),
        (2, "east", 500, "google"),
        (3, "west", 400, "apple"),
    ],
    schema=["emp_id", "region", "sales", "customer"],
)

In [12]:
# Print the DataFrame as a text table; n=20 and truncate=True are show()'s
# documented defaults, written out explicitly here
df.show(n=20, truncate=True)

+------+------+-----+--------+
|emp_id|region|sales|customer|
+------+------+-----+--------+
|     1| north|  100| walmart|
|     2| south|  300|   apple|
|     3|  west|  200|  google|
|     1|  east|  200|  google|
|     2| north|  100| walmart|
|     3|  west|  300|   apple|
|     1| north|  200| walmart|
|     2|  east|  500|  google|
|     3|  west|  400|   apple|
+------+------+-----+--------+



### The `sum` function adds all the values in the column sales, giving an output value of 2300:


In [13]:
# Aggregate over the whole DataFrame (no grouping): total of the sales column
sales_sum_df = df.agg({"sales": "sum"})
sales_sum_df.show()


+----------+
|sum(sales)|
+----------+
|      2300|
+----------+



### The following code calculates the minimum value in the column sales:

In [14]:
# Aggregate over the whole DataFrame (no grouping): smallest value in sales
sales_min_df = df.agg({"sales": "min"})
sales_min_df.show()

+----------+
|min(sales)|
+----------+
|       100|
+----------+



### The following code calculates the maximum value in the column sales

In [15]:
# Aggregate over the whole DataFrame (no grouping): largest value in sales
sales_max_df = df.agg({"sales": "max"})
sales_max_df.show()

+----------+
|max(sales)|
+----------+
|       500|
+----------+



### The following code counts the non-null values in the column sales, which here equals the number of rows in the dataset:

In [16]:
# count skips nulls; the sales column has none in this data, so the result
# matches the number of rows
sales_count_df = df.agg({"sales": "count"})
sales_count_df.show()


+------------+
|count(sales)|
+------------+
|           9|
+------------+



### The following code calculates the average of all the values in the column sales

In [17]:
# Average of the sales column; Spark labels the "mean" result as avg(sales)
sales_avg_df = df.agg({"sales": "mean"})
sales_avg_df.show()


+------------------+
|        avg(sales)|
+------------------+
|255.55555555555554|
+------------------+



### The following code shows how to apply two aggregate functions simultaneously

In [18]:
# One pass over the DataFrame with a column -> function mapping:
# average of sales alongside the count of customer values
overview_df = df.agg(
    {
        "sales": "mean",
        "customer": "count",
    }
)
overview_df.show()

+------------------+---------------+
|        avg(sales)|count(customer)|
+------------------+---------------+
|255.55555555555554|              9|
+------------------+---------------+

