# GROUP BY and AGG (Aggregate methods)

### Start a simple Spark Session

In [1]:
import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.SparkSession


In [2]:
// Build a SparkSession with default settings, or reuse one that already exists.
val spark = SparkSession.builder().getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@25552da


### Create a DataFrame by reading a CSV file with the Spark Session

In [3]:
// Read Sales.csv into a DataFrame: the first line supplies the column names
// and Spark infers each column's type from the data.
val df = spark.read.options(Map("header" -> "true", "inferSchema" -> "true")).csv("Sales.csv")

df: org.apache.spark.sql.DataFrame = [Company: string, Person: string ... 1 more field]


### Show Schema

In [4]:
// Print the column names, types and nullability that were inferred for df.
// printSchema has a side effect (it prints), so it is called with explicit parentheses.
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: integer (nullable = true)



### Show

In [12]:
// Print the rows of df as a formatted text table.
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|  200|
|   GOOG|Charlie|  120|
|   GOOG|  Frank|  340|
|   MSFT|   Tina|  600|
|   MSFT|    Amy|  124|
|   MSFT|Vanessa|  243|
|     FB|   Carl|  870|
|     FB|  Sarah|  350|
+-------+-------+-----+



### Group By Categorical Columns

In [8]:
// groupBy on its own only yields a RelationalGroupedDataset (see the result below);
// it is normally chained straight into an aggregation rather than stored in a val.
df.groupBy("Company")

res3: org.apache.spark.sql.RelationalGroupedDataset = RelationalGroupedDataset: [grouping expressions: [Company: string], value: [Company: string, Person: string ... 1 more field], type: GroupBy]


### Mean

In [10]:
// With no column arguments, mean() averages the numeric columns per Company;
// Sales is the only numeric column in this DataFrame.
df.groupBy("Company").mean().show()

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+



### Count

In [11]:
// Number of rows in each Company group.
df.groupBy("Company").count().show()

+-------+-----+
|Company|count|
+-------+-----+
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



### Max

In [13]:
// Largest Sales value per Company (Sales is the only numeric column).
df.groupBy("Company").max().show()

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   GOOG|       340|
|     FB|       870|
|   MSFT|       600|
+-------+----------+



### Min

In [14]:
// Smallest Sales value per Company (Sales is the only numeric column).
df.groupBy("Company").min().show()

+-------+----------+
|Company|min(Sales)|
+-------+----------+
|   GOOG|       120|
|     FB|       350|
|   MSFT|       124|
+-------+----------+



### Sum

In [15]:
// Total Sales per Company (Sales is the only numeric column).
df.groupBy("Company").sum().show()

+-------+----------+
|Company|sum(Sales)|
+-------+----------+
|   GOOG|       660|
|     FB|      1220|
|   MSFT|       967|
+-------+----------+



## Other Aggregate Functions

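These functions are defined in `org.apache.spark.sql.functions`; if your shell or notebook has not already imported them, run `import org.apache.spark.sql.functions._` first. The full list is documented here:

http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions$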

### countDistinct

In [16]:
// Exact number of distinct Sales values across the whole DataFrame (no grouping).
df.select(countDistinct("Sales")).show()

+---------------------+
|count(DISTINCT Sales)|
+---------------------+
|                    8|
+---------------------+



### sumDistinct

In [17]:
// Sum of the distinct Sales values across the whole DataFrame.
// NOTE(review): Spark 3.2+ deprecates sumDistinct in favour of sum_distinct — confirm
// against the Spark version in use.
df.select(sumDistinct("Sales")).show()

+-------------------+
|sum(DISTINCT Sales)|
+-------------------+
|               2847|
+-------------------+



### variance

In [18]:
// Sample variance of Sales; the result column is named var_samp(Sales).
df.select(variance("Sales")).show()

+-----------------+
|  var_samp(Sales)|
+-----------------+
|67235.55357142855|
+-----------------+



### stddev

In [19]:
// Sample standard deviation of Sales; the result column is named stddev_samp(Sales).
df.select(stddev("Sales")).show()

+------------------+
|stddev_samp(Sales)|
+------------------+
|259.29819430807567|
+------------------+



### avg

In [20]:
// Average of Sales over all rows (no grouping).
df.select(avg("Sales")).show()

+----------+
|avg(Sales)|
+----------+
|   355.875|
+----------+



### min

In [21]:
// Smallest Sales value over all rows (no grouping).
df.select(min("Sales")).show()

+----------+
|min(Sales)|
+----------+
|       120|
+----------+



### max

In [22]:
// Largest Sales value over all rows (no grouping).
df.select(max("Sales")).show()

+----------+
|max(Sales)|
+----------+
|       870|
+----------+



### sum

In [23]:
// Total of Sales over all rows (no grouping).
df.select(sum("Sales")).show()

+----------+
|sum(Sales)|
+----------+
|      2847|
+----------+



## OrderBy

### Ascending

In [25]:
// Sort the rows by Sales; passing just a column name sorts in ascending order.
df.orderBy("Sales").show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|  120|
|   MSFT|    Amy|  124|
|   GOOG|    Sam|  200|
|   MSFT|Vanessa|  243|
|   GOOG|  Frank|  340|
|     FB|  Sarah|  350|
|   MSFT|   Tina|  600|
|     FB|   Carl|  870|
+-------+-------+-----+



### Descending

In [28]:
// Sort the rows by Sales, largest first: .desc on the column reverses the order.
// NOTE(review): the $"..." column syntax presumably needs `import spark.implicits._`
// in scope (spark-shell provides it automatically) — confirm for your environment.
df.orderBy($"Sales".desc).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|  870|
|   MSFT|   Tina|  600|
|     FB|  Sarah|  350|
|   GOOG|  Frank|  340|
|   MSFT|Vanessa|  243|
|   GOOG|    Sam|  200|
|   MSFT|    Amy|  124|
|   GOOG|Charlie|  120|
+-------+-------+-----+



## Closing Spark Session

In [29]:
// Shut down the SparkSession now that the examples are finished.
spark.stop()

## Thank You!