# GROUPBY()

In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the session for this notebook: getOrCreate() hands back an existing
# session when one is available and builds a new one otherwise.
session_builder = SparkSession.builder.appName("example-groupby")
spark = session_builder.getOrCreate()

In [20]:
# Sample employee records laid out as (name, department, country, salary).
# Note that every field, including salary, is a string.
data_list = [
    ("David", "Account", "United States", "6500"),
    ("Ravi", "Account", "India", "5500"),
    ("John", "Software", "India", "6500"),
    ("Rosy", "Software", "India", "8200"),
    ("Abdul", "Support", "Brazil", "4800"),
]

# Column names are supplied up front; column types are still inferred from the data.
df = spark.createDataFrame(
    data_list,
    schema=["name", "department", "country", "salary"],
)


In [5]:
# Head count and salary total for every (department, country) pair.
(
    df.groupBy("department", "country")
    .agg(
        expr("count(*) as NumEmployee"),
        expr("sum(salary) as TotalSalary"),
    )
    .show()
)

+----------+-------------+-----------+-----------+
|department|      country|NumEmployee|TotalSalary|
+----------+-------------+-----------+-----------+
|   Account|        India|          1|     5500.0|
|   Support|       Brazil|          1|     4800.0|
|   Account|United States|          1|     6500.0|
|  Software|        India|          2|    14700.0|
+----------+-------------+-----------+-----------+



In [17]:
mylist = [1002, 3001, 4002, 2003, 2002, 3004, 1003, 4006]
df = spark.createDataFrame(mylist, IntegerType()).toDF("value")
df.withColumn("key", col("value") % 1000) \
  .groupBy("key") \
  .agg(expr("count(key) as count"), expr("sum(key) as sum")) \
  .show()


+---+-----+---+
|key|count|sum|
+---+-----+---+
|  1|    1|  1|
|  6|    1|  6|
|  3|    2|  6|
|  4|    1|  4|
|  2|    3|  6|
+---+-----+---+



In [18]:
df.withColumn("key", col("value") % 1000) \
  .groupBy("key") \
  .agg(expr("count(key) as count"), expr("sum(key) as sum")) \
  .orderBy(col("key").desc()) \
  .limit(1) \
  .select("count", "sum") \
  .show()


+-----+---+
|count|sum|
+-----+---+
|    1|  6|
+-----+---+



In [27]:
# Row count per (country, department) through a SQL expression. No alias is
# given, so the result column keeps the auto-generated name.
(
    df.groupBy("country", "department")
    .agg(expr("count(*)"))
    .show()
)

+-------------+----------+--------+
|      country|department|count(1)|
+-------------+----------+--------+
|        India|   Account|       1|
|United States|   Account|       1|
|        India|  Software|       2|
|       Brazil|   Support|       1|
+-------------+----------+--------+



In [28]:
# Same row count using the built-in count() shortcut on the grouped data;
# the result column is named "count".
grouped_by_country = df.groupBy("country", "department")
grouped_by_country.count().show()

+-------------+----------+-----+
|      country|department|count|
+-------------+----------+-----+
|        India|   Account|    1|
|United States|   Account|    1|
|        India|  Software|    2|
|       Brazil|   Support|    1|
+-------------+----------+-----+



In [29]:
# Grouping columns swapped: the key columns appear in the order they are
# passed to groupBy().
grouped_by_department = df.groupBy("department", "country")
grouped_by_department.count().show()

+----------+-------------+-----+
|department|      country|count|
+----------+-------------+-----+
|   Account|        India|    1|
|   Support|       Brazil|    1|
|   Account|United States|    1|
|  Software|        India|    2|
+----------+-------------+-----+



In [None]:
# Counter-example: expr() parses ONE SQL expression, so a comma-separated
# column list is rejected by the SQL parser with
#   mismatched input ',' expecting {<EOF>, ...
# Pass each grouping column to groupBy() as its own argument instead.
# The error is caught and printed so the demonstration stays visible without
# aborting a full run of the notebook.
from pyspark.sql.utils import ParseException

try:
    df.groupBy(expr("country, department")).count().show()
except ParseException as parse_error:
    print(f"ParseException: {parse_error}")
