# Basic GroupBy and Aggregation

In [1]:
from pyspark.sql import SparkSession
# NOTE: `sum`, `min` and `max` imported below shadow Python's built-ins of the
# same name for the rest of the notebook; these versions build Spark column
# aggregate expressions rather than reducing Python iterables.
from pyspark.sql.functions import avg, sum, count, min, max

In [2]:
# Obtain the Spark session for this notebook (existing one is reused if present),
# registered under the application name 'Grouping'.
spark = (
    SparkSession.builder
    .appName('Grouping')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/08 16:47:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/08 16:47:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/01/08 16:47:30 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Sample records: each tuple is one (Name, Department, Salary) row.
columns = ["Name", "Department", "Salary"]
data = [
    ("Alice", "Sales", 3000),
    ("Bob", "Sales", 4000),
    ("Alice", "HR", 2000),
    ("Bob", "HR", 2500),
    ("Charlie", "Sales", 3000),
    ("Charlie", "HR", 2200),
]

# Only column names are supplied; Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)

# Group by Department and calculate the average salary

In [4]:
# Mean of Salary within each Department, shown under the column "Average_Salary".
average_salary = avg("Salary").alias("Average_Salary")
df.groupBy("Department").agg(average_salary).show()

                                                                                

+----------+------------------+
|Department|    Average_Salary|
+----------+------------------+
|     Sales|3333.3333333333335|
|        HR|2233.3333333333335|
+----------+------------------+



# Count the rows in each department

In [5]:
# Number of rows per Department; count("*") counts every row in the group.
(
    df.groupBy("Department")
    .agg(count("*").alias("Count"))
    .show()
)


+----------+-----+
|Department|Count|
+----------+-----+
|     Sales|    3|
|        HR|    3|
+----------+-----+



# Sum the salaries in each department

In [6]:
# Total of Salary within each Department, shown under the column "Total_Salary".
# `sum` is the pyspark.sql.functions aggregate imported at the top of the
# notebook, not Python's built-in sum.
total_salary = sum("Salary").alias("Total_Salary")
df.groupBy("Department").agg(total_salary).show()

+----------+------------+
|Department|Total_Salary|
+----------+------------+
|     Sales|       10000|
|        HR|        6700|
+----------+------------+



25/01/08 16:47:45 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


# Multiple Aggregations in a Single GroupBy


In [7]:
# Aggregate expressions evaluated together in a single agg() call; each alias
# becomes the name of the corresponding output column.
salary_stats = [
    avg("Salary").alias("Average_Salary"),
    sum("Salary").alias("Total_Salary"),
    min("Salary").alias("Min_Salary"),
    max("Salary").alias("Max_Salary"),
]
df.groupBy("Department").agg(*salary_stats).show()


+----------+------------------+------------+----------+----------+
|Department|    Average_Salary|Total_Salary|Min_Salary|Max_Salary|
+----------+------------------+------------+----------+----------+
|     Sales|3333.3333333333335|       10000|      3000|      4000|
|        HR|2233.3333333333335|        6700|      2000|      2500|
+----------+------------------+------------+----------+----------+

