In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the Spark session used by every cell below; getOrCreate() hands back
# an already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("GroupBy_Agg")
    .getOrCreate()
)

In [3]:
# Bare expression: asks the notebook to display the session object created above.
spark

In [4]:
# Load the employee CSV. header=True takes column names from the first row and
# inferSchema=True lets Spark derive each column's type from the data.
# Note: the salary header carries a trailing space, so that column is literally
# named "salary " wherever it is referenced in this notebook.
employee = spark.read.csv("emp1.csv", header=True, inferSchema=True)

In [5]:
# Print the loaded rows as a text table. The data contains nulls and repeated
# rows (visible in the output), which feed into every aggregate computed below.
employee.show()

+------+-------+----+-------+----+---+
|emp_id|   name|dept|salary | age|sex|
+------+-------+----+-------+----+---+
|     1|  argha|sale|  10000|  25|  M|
|     2|  sohag|tech|  20000|  25|  M|
|     7|  soura|tech|  30000|  25|  M|
|     4| shivam|tech|  10000|  26|  F|
|     5|  lohit|  hr|  40000|  26|  F|
|     6|prakash|  hr|  50000|  26|  M|
|     8|sourabh|tech|  60000|null|  F|
|    10|  biswa|sale|  50000|null|  M|
|     7|   null|tech|  30000|  25|  M|
|     6|prakash|null|  50000|  26|  M|
|     4| shivam|tech|  10000|  26|  F|
+------+-------+----+-------+----+---+



In [6]:
# Print the column names and the types that schema inference assigned to them.
employee.printSchema()

root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary : integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)



In [7]:
# Per-department totals. Called without column arguments, sum() is applied to
# every numeric column, so emp_id and age are summed alongside salary.
(
    employee
    .groupBy("dept")
    .sum()
    .show()
)

+----+-----------+------------+--------+
|dept|sum(emp_id)|sum(salary )|sum(age)|
+----+-----------+------------+--------+
|null|          6|       50000|      26|
|  hr|         11|       90000|      52|
|sale|         11|       60000|      25|
|tech|         32|      160000|     127|
+----+-----------+------------+--------+



In [8]:
# Per-department averages over every numeric column. avg() is the same
# aggregation as mean() and produces the same avg(...) column names.
employee.groupBy("dept").avg().show()

+----+-----------------+------------------+--------+
|dept|      avg(emp_id)|      avg(salary )|avg(age)|
+----+-----------------+------------------+--------+
|null|              6.0|           50000.0|    26.0|
|  hr|              5.5|           45000.0|    26.0|
|sale|              5.5|           30000.0|    25.0|
|tech|5.333333333333333|26666.666666666668|    25.4|
+----+-----------------+------------------+--------+



In [9]:
# Number of rows in each department; rows whose dept is null form their own group.
(
    employee
    .groupBy("dept")
    .count()
    .show()
)

+----+-----+
|dept|count|
+----+-----+
|null|    1|
|  hr|    2|
|sale|    2|
|tech|    6|
+----+-----+



In [10]:
# NOTE(review): this cell repeats the previous cell's count-per-department
# query verbatim and produces the same table.
employee.groupBy("dept").count().show()

+----+-----+
|dept|count|
+----+-----+
|null|    1|
|  hr|    2|
|sale|    2|
|tech|    6|
+----+-----+



In [16]:
# Show only the salary column (its name includes the header's trailing space).
# NOTE(review): the AttributeError recorded under this cell does not match this
# code -- presumably it is stale output from an earlier version of the cell that
# called .sum() on the DataFrame. Re-run the cell to refresh the output.
employee.select("salary ").show()

AttributeError: 'DataFrame' object has no attribute 'sum'

In [17]:
# Whole-table salary total. The dict maps a column name to the name of the
# aggregate function to apply to it.
employee.agg({"salary ": "sum"}).show()

+------------+
|sum(salary )|
+------------+
|      360000|
+------------+



In [18]:
# Highest salary across the whole table, using the same column -> aggregate
# dict form of agg().
employee.agg({"salary ": "max"}).show()

+------------+
|max(salary )|
+------------+
|       60000|
+------------+



In [19]:
# Group once by department and keep the grouped handle in data_group.
data_group = employee.groupBy("dept")
# Highest salary in each department. The key uses the schema's exact spelling
# ("salary ", lower-case, with the trailing space) instead of relying on Spark's
# default case-insensitive column resolution, so the cell also works when
# spark.sql.caseSensitive is enabled and the result column is named
# max(salary ), consistent with the other cells in this notebook.
data_group.agg({"salary ": "max"}).show()

+----+------------+
|dept|max(Salary )|
+----+------------+
|null|       50000|
|  hr|       50000|
|sale|       50000|
|tech|       60000|
+----+------------+



In [23]:
# Highest salary per department. Passing the column name to max() aggregates
# only salary, rather than taking the max of every numeric column and then
# selecting one of them; the result still has the columns dept and max(salary ).
employee.groupBy("dept").max("salary ").show()

+----+------------+
|dept|max(salary )|
+----+------------+
|null|       50000|
|  hr|       50000|
|sale|       50000|
|tech|       60000|
+----+------------+



# Importing Aggregate functions 

In [24]:
from pyspark.sql.functions import avg, stddev, count, countDistinct

In [26]:
# Mean salary over the whole table, relabelled "Avg Salary" for display.
employee.select(
    avg("salary ").alias("Avg Salary")
).show()

+------------------+
|        Avg Salary|
+------------------+
|32727.272727272728|
+------------------+



In [29]:
from pyspark.sql.functions import format_number

In [32]:
# Standard deviation of salary, kept as a one-row DataFrame with a single
# column named "std".
std_data = employee.select(stddev("salary ").alias("std"))

# Format that value with two decimals and thousands separators for display.
std_data.select(
    format_number("std", 2).alias("final")
).show()

+---------+
|    final|
+---------+
|18,488.33|
+---------+



In [34]:
# orderBy: sort rows by salary, lowest first. Ascending is the default order;
# it is spelled out here to contrast with the descending sort that follows.
employee.orderBy("salary ", ascending=True).show()

+------+-------+----+-------+----+---+
|emp_id|   name|dept|salary | age|sex|
+------+-------+----+-------+----+---+
|     4| shivam|tech|  10000|  26|  F|
|     4| shivam|tech|  10000|  26|  F|
|     1|  argha|sale|  10000|  25|  M|
|     2|  sohag|tech|  20000|  25|  M|
|     7|  soura|tech|  30000|  25|  M|
|     7|   null|tech|  30000|  25|  M|
|     5|  lohit|  hr|  40000|  26|  F|
|     6|prakash|  hr|  50000|  26|  M|
|    10|  biswa|sale|  50000|null|  M|
|     6|prakash|null|  50000|  26|  M|
|     8|sourabh|tech|  60000|null|  F|
+------+-------+----+-------+----+---+



In [42]:
# Sort rows by salary, highest first.
employee.orderBy("salary ", ascending=False).show()

+------+-------+----+-------+----+---+
|emp_id|   name|dept|salary | age|sex|
+------+-------+----+-------+----+---+
|     8|sourabh|tech|  60000|null|  F|
|    10|  biswa|sale|  50000|null|  M|
|     6|prakash|  hr|  50000|  26|  M|
|     6|prakash|null|  50000|  26|  M|
|     5|  lohit|  hr|  40000|  26|  F|
|     7|   null|tech|  30000|  25|  M|
|     7|  soura|tech|  30000|  25|  M|
|     2|  sohag|tech|  20000|  25|  M|
|     1|  argha|sale|  10000|  25|  M|
|     4| shivam|tech|  10000|  26|  F|
|     4| shivam|tech|  10000|  26|  F|
+------+-------+----+-------+----+---+



In [43]:
# Indexing the DataFrame by column name yields a Column expression (see the
# repr below), not the column's values.
employee["salary "]

Column<'salary '>