In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Build (or reuse, if one already exists) a Hive-enabled session that runs
# against a local master under the application name 'aggDataFrames'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('aggDataFrames')
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [4]:
# Bare expression: the session object is rendered as this cell's output.
spark

In [5]:
# Directory (relative path) that holds the retail_db CSV files.
DATA_DIR = 'data/retail_db'


def read_retail_csv(table_name):
    """Load ``<DATA_DIR>/<table_name>.csv`` into a DataFrame.

    Every table in this notebook is read the same way: the first row is
    used as the header and column types are inferred from the data.

    Parameters
    ----------
    table_name : str
        File name without the ``.csv`` extension, e.g. ``'orders'``.

    Returns
    -------
    pyspark.sql.DataFrame
        The contents of the CSV file.
    """
    csv_path = '{}/{}.csv'.format(DATA_DIR, table_name)
    return spark.read.csv(csv_path, header=True, inferSchema=True)


orders = read_retail_csv('orders')
order_items = read_retail_csv('order_items')
customers = read_retail_csv('customers')
products = read_retail_csv('products')

# Total Aggregation

In [6]:
# Whole-table aggregate: number of non-null order_id values in orders.
(
    orders
    .select(count('order_id'))
    .show()
)

+---------------+
|count(order_id)|
+---------------+
|          68883|
+---------------+



In [7]:
# Largest line-item subtotal across all order items.
# `max` here is pyspark.sql.functions.max (star-imported in the first cell),
# not the Python builtin.
largest_subtotal = max('order_item_subtotal').alias('total_maximum')
order_items.select(largest_subtotal).show()

+-------------+
|total_maximum|
+-------------+
|      1999.99|
+-------------+



In [8]:
# Smallest line-item subtotal across all order items.
# `min` here is pyspark.sql.functions.min (star-imported in the first cell),
# not the Python builtin.
smallest_subtotal = min('order_item_subtotal').alias('total_minimum')
order_items.select(smallest_subtotal).show()

+-------------+
|total_minimum|
+-------------+
|         9.99|
+-------------+



In [9]:
# Mean line-item subtotal across all order items.
mean_subtotal = avg('order_item_subtotal').alias('total_avg')
order_items.select(mean_subtotal).show()

+------------------+
|         total_avg|
+------------------+
|199.32066533864273|
+------------------+



In [10]:
# Grand total of all line-item subtotals.
# `sum` here is pyspark.sql.functions.sum (star-imported in the first cell),
# not the Python builtin.
grand_total = sum('order_item_subtotal').alias('total_sum')
order_items.select(grand_total).show()

+------------------+
|         total_sum|
+------------------+
|3.43226199299836E7|
+------------------+



# groupBy

In [11]:
# Per-order total: group the line items by order id and add up their
# subtotals using the sum() shortcut on the grouped data.
(
    order_items
    .groupBy('order_item_order_id')
    .sum('order_item_subtotal')
    .show()
)

+-------------------+------------------------+
|order_item_order_id|sum(order_item_subtotal)|
+-------------------+------------------------+
|                148|                  479.99|
|                463|       829.9200000000001|
|                471|      169.98000000000002|
|                496|      441.95000000000005|
|               1088|      249.97000000000003|
|               1580|                  299.95|
|               1591|                  439.86|
|               1645|      1509.7900000000002|
|               2366|                  299.97|
|               2659|       724.9100000000001|
|               2866|                  569.96|
|               3175|                  209.97|
|               3749|                  143.97|
|               3794|                  299.95|
|               3918|       829.9300000000001|
|               3997|                  579.95|
|               4101|                  129.99|
|               4519|                   79.98|
|            

### agg() function

Calling agg() right after groupBy() helps in the following cases:


1. It lets us apply built-in functions (such as round()) to the aggregated column, as shown below.

In [12]:
# Per-order total rounded to 2 decimal places. Inside agg() the aggregate is
# an ordinary Column expression, so it can be wrapped in round().
# `round` and `sum` are the pyspark.sql.functions versions (star-imported in
# the first cell), not the Python builtins.
(
    order_items
    .groupBy('order_item_order_id')
    .agg(round(sum('order_item_subtotal'), 2))
    .show()
)

+-------------------+----------------------------------+
|order_item_order_id|round(sum(order_item_subtotal), 2)|
+-------------------+----------------------------------+
|                148|                            479.99|
|                463|                            829.92|
|                471|                            169.98|
|                496|                            441.95|
|               1088|                            249.97|
|               1580|                            299.95|
|               1591|                            439.86|
|               1645|                           1509.79|
|               2366|                            299.97|
|               2659|                            724.91|
|               2866|                            569.96|
|               3175|                            209.97|
|               3749|                            143.97|
|               3794|                            299.95|
|               3918|          

In [13]:
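# round() cannot be chained directly onto groupBy(): the GroupedData object it returns has no round() method,
# so the rounding has to be applied inside agg(). The following call would fail:
# order_items.groupBy('order_item_order_id').round(sum('order_item_subtotal'),2)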

2. It lets us give the aggregated column an alias, which produces a cleaner column name.

In [14]:
# Per-order total rounded to 2 decimal places and exposed under the
# readable column name 'order_expense'.
# `round` and `sum` are the pyspark.sql.functions versions (star-imported in
# the first cell), not the Python builtins.
order_expense = round(sum('order_item_subtotal'), 2).alias('order_expense')
(
    order_items
    .groupBy('order_item_order_id')
    .agg(order_expense)
    .show()
)

+-------------------+-------------+
|order_item_order_id|order_expense|
+-------------------+-------------+
|                148|       479.99|
|                463|       829.92|
|                471|       169.98|
|                496|       441.95|
|               1088|       249.97|
|               1580|       299.95|
|               1591|       439.86|
|               1645|      1509.79|
|               2366|       299.97|
|               2659|       724.91|
|               2866|       569.96|
|               3175|       209.97|
|               3749|       143.97|
|               3794|       299.95|
|               3918|       829.93|
|               3997|       579.95|
|               4101|       129.99|
|               4519|        79.98|
|               4818|       399.98|
|               4900|       179.97|
+-------------------+-------------+
only showing top 20 rows

