In [1]:
import os
import sys

# Locations of the JDK and the Spark distribution used by this notebook.
# NOTE(review): these absolute paths are machine-specific — adjust them when
# running the notebook on a different host.
os.environ["JAVA_HOME"] = "/usr/java/jdk1.8.0_161/jre"
os.environ["SPARK_HOME"] = "/home/ec2-user/spark-2.4.4-bin-hadoop2.7"

# PYSPARK_PYTHON must name a Python interpreter executable, not a directory.
# Use the interpreter running this kernel so the driver and any Python workers
# Spark launches run the same Python.
os.environ["PYSPARK_PYTHON"] = sys.executable

# Make the pyspark and py4j archives bundled with Spark importable.
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.7-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

# GroupBy and Aggregate Functions

GroupBy allows you to group rows together based on the value of a column.

Once you've performed the GroupBy operation, you can apply an aggregate function to the grouped data.

An aggregate function aggregates multiple rows of data into a single output, such as taking the sum of inputs, or counting the number of inputs.

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Get the notebook's Spark session, creating it if none exists yet.
# This may take a little while on a local computer.
spark = (
    SparkSession.builder
    .appName("GroupBy")
    .getOrCreate()
)

Read in the customer sales data

In [4]:
# Load the sales CSV: the first row supplies the column names, and Spark is
# asked to infer each column's type from the data.
df = spark.read.csv("sales_info.csv", header=True, inferSchema=True)

In [5]:
# Print the loaded rows as a text table.
df.show()

+---------+-------+-----+
|  Company|Quarter|Sales|
+---------+-------+-----+
|   Google|     Q1|  200|
|   Google|     Q2|  120|
|   Google|     Q3|  340|
|   Google|     Q4|  300|
|Microsoft|     Q2|  300|
|Microsoft|     Q3|  124|
|Microsoft|     Q4|  243|
|Microsoft|     Q1|  330|
| Facebook|     Q3|  870|
| Facebook|     Q2|  750|
|    Apple|     Q1|  650|
|    Apple|     Q2|  550|
|    Apple|     Q3|  750|
|    Apple|     Q4|  850|
+---------+-------+-----+



Let's group together by company!

In [8]:
# Grouping alone produces no table yet: the cell's value is a GroupedData handle.
df.groupBy("Company")

<pyspark.sql.group.GroupedData at 0x7fa057cc6310>

This returns a GroupedData object, on which you can call various methods.

In [9]:
# Mean per company (the result column is named avg(Sales))
df.groupBy("Company").mean().show()

+---------+----------+
|  Company|avg(Sales)|
+---------+----------+
|   Google|     240.0|
|Microsoft|    249.25|
|    Apple|     700.0|
| Facebook|     810.0|
+---------+----------+



In [10]:
# Number of rows per company
df.groupBy("Company").count().show()

+---------+-----+
|  Company|count|
+---------+-----+
|   Google|    4|
|Microsoft|    4|
|    Apple|    4|
| Facebook|    2|
+---------+-----+



In [11]:
# Largest Sales value per company
df.groupBy("Company").max().show()

+---------+----------+
|  Company|max(Sales)|
+---------+----------+
|   Google|       340|
|Microsoft|       330|
|    Apple|       850|
| Facebook|       870|
+---------+----------+



In [12]:
# Smallest Sales value per company
df.groupBy("Company").min().show()

+---------+----------+
|  Company|min(Sales)|
+---------+----------+
|   Google|       120|
|Microsoft|       124|
|    Apple|       550|
| Facebook|       750|
+---------+----------+



In [13]:
# Total Sales per company
df.groupBy("Company").sum().show()

+---------+----------+
|  Company|sum(Sales)|
+---------+----------+
|   Google|       960|
|Microsoft|       997|
|    Apple|      2800|
| Facebook|      1620|
+---------+----------+



## Functions
There are a variety of functions you can import from pyspark.sql.functions. 

In [14]:
from pyspark.sql.functions import countDistinct, avg, stddev 

In [15]:
# Count how many distinct values appear in the Sales column.
df.select(countDistinct('Sales')).show()

+---------------------+
|count(DISTINCT Sales)|
+---------------------+
|                   12|
+---------------------+



In [17]:
# Show just the Sales column.
df.select('Sales').show()

+-----+
|Sales|
+-----+
|  200|
|  120|
|  340|
|  300|
|  300|
|  124|
|  243|
|  330|
|  870|
|  750|
|  650|
|  550|
|  750|
|  850|
+-----+



In [18]:
# Average of Sales over the whole DataFrame (no grouping).
df.select(avg('Sales')).show()

+----------+
|avg(Sales)|
+----------+
|     455.5|
+----------+



In [19]:
# Standard deviation of Sales; the result column is named stddev_samp(Sales).
df.select(stddev('Sales')).show()

+------------------+
|stddev_samp(Sales)|
+------------------+
| 271.4634838738409|
+------------------+



#### Temporarily changing the column name

Applying these functions to the data gives weird column names; use the `.alias()` method to temporarily change the column name:

In [22]:
# Same aggregate as before, with the result column renamed to 'std_sales'.
df.select(stddev('Sales').alias('std_sales')).show()

+-----------------+
|        std_sales|
+-----------------+
|271.4634838738409|
+-----------------+



In [28]:
# Keep the aliased result in its own one-column DataFrame for reuse.
df2 = df.select(stddev('Sales').alias('std_sales'))

In [29]:
# Print the stored standard-deviation result.
df2.show()

+-----------------+
|        std_sales|
+-----------------+
|271.4634838738409|
+-----------------+



#### Rounding off digits

In [23]:
from pyspark.sql.functions import format_number

In [30]:
# Format std_sales to 2 decimal places and alias the result back to
# 'std_sales' so the column keeps its name.
# NOTE(review): per the PySpark docs, format_number returns formatted text
# (a string column), not a numeric value — confirm before doing arithmetic on it.
df2.select(format_number('std_sales',2).alias('std_sales')).show()

+---------+
|std_sales|
+---------+
|   271.46|
+---------+



## Order By

As the name suggests, this method is used to sort the DataFrame.

In [31]:
# Sort the rows by Sales, smallest value first.
df.orderBy('Sales').show()

+---------+-------+-----+
|  Company|Quarter|Sales|
+---------+-------+-----+
|   Google|     Q2|  120|
|Microsoft|     Q3|  124|
|   Google|     Q1|  200|
|Microsoft|     Q4|  243|
|   Google|     Q4|  300|
|Microsoft|     Q2|  300|
|Microsoft|     Q1|  330|
|   Google|     Q3|  340|
|    Apple|     Q2|  550|
|    Apple|     Q1|  650|
|    Apple|     Q3|  750|
| Facebook|     Q2|  750|
|    Apple|     Q4|  850|
| Facebook|     Q3|  870|
+---------+-------+-----+



By default, orderBy sorts the data in ascending order; for descending order, call `.desc()` on the column.

In [33]:
# Sort by Sales from largest to smallest and print only the top 3 rows.
df.orderBy(df["Sales"].desc()).show(n=3)

+--------+-------+-----+
| Company|Quarter|Sales|
+--------+-------+-----+
|Facebook|     Q3|  870|
|   Apple|     Q4|  850|
|Facebook|     Q2|  750|
+--------+-------+-----+
only showing top 3 rows

