In [0]:
from pyspark.sql import SparkSession


# Start (or reuse) the Spark session used by the cells in this notebook.
spark = SparkSession.builder.appName("example").getOrCreate()

# Sample employee rows; each tuple follows the column order listed in `schema`.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, 24000),
    ("Scott", "Finance", "NY", 83000, 36, 19000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]

# Only column names are supplied here; the column types are inferred by Spark.
schema = ["employee_name", "department", "state", "salary", "age", "bonus"]

# Build the DataFrame, then print its schema followed by its rows.
df = spark.createDataFrame(simpleData, schema)
df.printSchema()
df.show()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [0]:
# Group the employees by department, then total the salary column per group.
department_groups = df.groupBy("department")
sumdata = department_groups.sum("salary")
sumdata.show()

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|     Sales|     257000|
|   Finance|     351000|
| Marketing|     171000|
+----------+-----------+



In [0]:

# Per-department salary statistics: one table is printed per aggregate.
salary_by_department = df.groupBy("department")

salary_by_department.min("salary").show()    # lowest salary in each department
salary_by_department.max("salary").show()    # highest salary in each department
salary_by_department.avg("salary").show()    # average salary in each department
salary_by_department.mean("salary").show()   # mean() prints the same avg(salary) table as avg()
salary_by_department.count().show()          # number of rows in each department


+----------+-----------+
|department|min(salary)|
+----------+-----------+
|     Sales|      81000|
|   Finance|      79000|
| Marketing|      80000|
+----------+-----------+

+----------+-----------+
|department|max(salary)|
+----------+-----------+
|     Sales|      90000|
|   Finance|      99000|
| Marketing|      91000|
+----------+-----------+

+----------+-----------------+
|department|      avg(salary)|
+----------+-----------------+
|     Sales|85666.66666666667|
|   Finance|          87750.0|
| Marketing|          85500.0|
+----------+-----------------+

+----------+-----------------+
|department|      avg(salary)|
+----------+-----------------+
|     Sales|85666.66666666667|
|   Finance|          87750.0|
| Marketing|          85500.0|
+----------+-----------------+

+----------+-----+
|department|count|
+----------+-----+
|     Sales|    3|
|   Finance|    4|
| Marketing|    2|
+----------+-----+



In [0]:
# Grouping by more than one column: pass every grouping column to groupBy().
# Employee names are unique in this sample, so each (employee_name, department)
# group contains exactly one row.
by_employee_and_department = df.groupBy("employee_name", "department")

by_employee_and_department.min("salary").show()
by_employee_and_department.max("salary").show()
by_employee_and_department.avg("salary").show()
by_employee_and_department.mean("salary").show()
by_employee_and_department.count().show()


+-------------+----------+-----------+
|employee_name|department|min(salary)|
+-------------+----------+-----------+
|        James|     Sales|      90000|
|      Michael|     Sales|      86000|
|       Robert|     Sales|      81000|
|        Maria|   Finance|      90000|
|        Raman|   Finance|      99000|
|        Scott|   Finance|      83000|
|          Jen|   Finance|      79000|
|         Jeff| Marketing|      80000|
|        Kumar| Marketing|      91000|
+-------------+----------+-----------+

+-------------+----------+-----------+
|employee_name|department|max(salary)|
+-------------+----------+-----------+
|        James|     Sales|      90000|
|      Michael|     Sales|      86000|
|       Robert|     Sales|      81000|
|        Maria|   Finance|      90000|
|        Raman|   Finance|      99000|
|        Scott|   Finance|      83000|
|          Jen|   Finance|      79000|
|         Jeff| Marketing|      80000|
|        Kumar| Marketing|      91000|
+-------------+---------

In [0]:
# Pivot demo: the same per-department salary sum, with and without pivot().
dept_groups = df.groupBy("department")

# Without pivot: one row per department and a single sum(salary) column.
dept_groups.sum("salary").show()

# With pivot: each distinct employee_name becomes its own column; cells with
# no matching row are shown as null.
dept_groups.pivot("employee_name").sum("salary").show()

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|     Sales|     257000|
|   Finance|     351000|
| Marketing|     171000|
+----------+-----------+

+----------+-----+-----+-----+-----+-----+-------+-----+------+-----+
|department|James| Jeff|  Jen|Kumar|Maria|Michael|Raman|Robert|Scott|
+----------+-----+-----+-----+-----+-----+-------+-----+------+-----+
|     Sales|90000| null| null| null| null|  86000| null| 81000| null|
|   Finance| null| null|79000| null|90000|   null|99000|  null|83000|
| Marketing| null|80000| null|91000| null|   null| null|  null| null|
+----------+-----+-----+-----+-----+-----+-------+-----+------+-----+



In [0]:
# Employee sample with None placed in several fields, used to demonstrate
# null handling.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, None, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Raman", "Finance", "CA", 99000, 40, None),
    ("Scott", "Finance", "NY", None, 36, 44000),
    ("Jen", "Finance", "NY", 55000, 53, 15000),
    ("Jeff", None, "CA", 80000, 25, 18000),
    (None, "Marketing", "NY", 91000, 50, 21000),
]

# Column names only; the column types are inferred from the data.
schema = ["employee_name", "department", "state", "salary", "age", "bonus"]
dfa = spark.createDataFrame(simpleData, schema)

# Show every row first, including the ones that carry a null ...
dfa.show()
# ... then only the rows that survive dropna() with its default arguments.
dfa.dropna().show()



+-------------+----------+-----+------+----+-----+
|employee_name|department|state|salary| age|bonus|
+-------------+----------+-----+------+----+-----+
|        James|     Sales|   NY| 90000|  34|10000|
|      Michael|     Sales|   NY| 86000|  56|20000|
|       Robert|     Sales|   CA| 81000|null|23000|
|        Maria|   Finance|   CA| 90000|  24|23000|
|        Raman|   Finance|   CA| 99000|  40| null|
|        Scott|   Finance|   NY|  null|  36|44000|
|          Jen|   Finance|   NY| 55000|  53|15000|
|         Jeff|      null|   CA| 80000|  25|18000|
|         null| Marketing|   NY| 91000|  50|21000|
+-------------+----------+-----+------+----+-----+

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|          Jen|   Finance|   NY| 550

In [0]:
# Dict-style agg(): the key is the column name, the value is the aggregate to apply.
salary_sum_spec = {"salary": "sum"}

# Salary total within each department.
df.groupBy("department").agg(salary_sum_spec).show()

# Salary total over the whole DataFrame: agg() applied without any grouping.
df.agg(salary_sum_spec).show()


+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|     Sales|     257000|
|   Finance|     351000|
| Marketing|     171000|
+----------+-----------+

+-----------+
|sum(salary)|
+-----------+
|     779000|
+-----------+



In [0]:
# Display the rows of `df` again as a reference point for the cells that follow.
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [0]:
# Order the rows by salary; ascending is the default direction.
df.orderBy("salary").show()

# Descending order, requested through the column's desc() expression.
df.orderBy(df.salary.desc()).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        Raman|   Finance|   CA| 99000| 40|24000|
+-------------+----------+-----+------+---+-----+

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        James|     Sales|   NY| 90000| 34|10000|
|        Maria|   Finance|   CA| 90000| 24|23000|