In [7]:
import pandas as pd

# Employee records as (name, department, salary) rows. Some names repeat
# (e.g. 'Alice', 'Bob'), so grouping by name aggregates over several rows.
employee_rows = [
    ('Alice', 'HR', 50000),
    ('Bob', 'IT', 60000),
    ('Charlie', 'Finance', 55000),
    ('Alice', 'IT', 62000),
    ('Eve', 'HR', 52000),
    ('Frank', 'Finance', 58000),
    ('Alice', 'IT', 61000),
    ('Hannah', 'HR', 53000),
    ('Ian', 'Finance', 57000),
    ('Bob', 'IT', 60000),
]

# Pivot the rows into the column-oriented dict that pd.DataFrame consumes.
names, departments, salaries = (list(column) for column in zip(*employee_rows))
data = {'name': names, 'department': departments, 'salary': salaries}

df = pd.DataFrame(data)

# Write without the pandas index so the CSV holds only the three data columns.
df.to_csv('groupbyAgg.csv', index=False)
df

Unnamed: 0,name,department,salary
0,Alice,HR,50000
1,Bob,IT,60000
2,Charlie,Finance,55000
3,Alice,IT,62000
4,Eve,HR,52000
5,Frank,Finance,58000
6,Alice,IT,61000
7,Hannah,HR,53000
8,Ian,Finance,57000
9,Bob,IT,60000


In [8]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name "EmployeeData".
session_builder = SparkSession.builder.appName("EmployeeData")
spark = session_builder.getOrCreate()

In [9]:
# Load the CSV produced by the pandas cell: the first row supplies the column
# names, and column types are inferred from the data rather than left as strings.
csv_reader = spark.read.options(header=True, inferSchema=True)
df_pyspark = csv_reader.csv("groupbyAgg.csv")

In [10]:
# Print the loaded rows as a text table to confirm the CSV was read with the
# expected name / department / salary columns.
df_pyspark.show()

+-------+----------+------+
|   name|department|salary|
+-------+----------+------+
|  Alice|        HR| 50000|
|    Bob|        IT| 60000|
|Charlie|   Finance| 55000|
|  Alice|        IT| 62000|
|    Eve|        HR| 52000|
|  Frank|   Finance| 58000|
|  Alice|        IT| 61000|
| Hannah|        HR| 53000|
|    Ian|   Finance| 57000|
|    Bob|        IT| 60000|
+-------+----------+------+



In [11]:
# Print each column's name, inferred type and nullability.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: integer (nullable = true)



In [21]:
# Group once by employee name, then show the max, min and sum of every numeric
# column (salary is the only one in this schema) as three separate tables.
grouped_by_name = df_pyspark.groupBy('name')
grouped_by_name.max().show()
grouped_by_name.min().show()
grouped_by_name.sum().show()

+-------+-----------+
|   name|max(salary)|
+-------+-----------+
| Hannah|      53000|
|Charlie|      55000|
|    Bob|      60000|
|  Alice|      62000|
|    Eve|      52000|
|    Ian|      57000|
|  Frank|      58000|
+-------+-----------+

+-------+-----------+
|   name|min(salary)|
+-------+-----------+
| Hannah|      53000|
|Charlie|      55000|
|    Bob|      60000|
|  Alice|      50000|
|    Eve|      52000|
|    Ian|      57000|
|  Frank|      58000|
+-------+-----------+

+-------+-----------+
|   name|sum(salary)|
+-------+-----------+
| Hannah|      53000|
|Charlie|      55000|
|    Bob|     120000|
|  Alice|     173000|
|    Eve|      52000|
|    Ian|      57000|
|  Frank|      58000|
+-------+-----------+



In [13]:
# Per-department total of every numeric column (salary is the only one here).
department_totals = df_pyspark.groupBy('department').sum()
department_totals.show()

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|        HR|     155000|
|   Finance|     170000|
|        IT|     243000|
+----------+-----------+



In [15]:
# Number of employee rows in each department.
department_counts = df_pyspark.groupBy('department').count()
department_counts.show()

+----------+-----+
|department|count|
+----------+-----+
|        HR|    3|
|   Finance|    3|
|        IT|    4|
+----------+-----+



In [22]:
# Mean salary per department; the result column is named avg(salary).
department_avg_salary = df_pyspark.groupBy('department').avg('salary')
department_avg_salary.show()

+----------+------------------+
|department|       avg(salary)|
+----------+------------------+
|        HR|51666.666666666664|
|   Finance|56666.666666666664|
|        IT|           60750.0|
+----------+------------------+



In [23]:
# Salary total over all rows: an empty groupBy() treats the whole DataFrame
# as a single group, so the aggregate yields one row.
df_pyspark.groupBy().agg({'salary': 'sum'}).show()

+-----------+
|sum(salary)|
+-----------+
|     568000|
+-----------+

