In [1]:
import os, sys
# Environment for the Spark client, set here ahead of the session creation
# further down: Hadoop/YARN configuration directories, the Python interpreter
# name for PySpark (worker and driver), and the Hadoop user name.
# NOTE(review): HADOOP_USER_NAME is hard-coded to one specific user — confirm
# this is intended before sharing or reusing the notebook.
os.environ.update({
    'HADOOP_CONF_DIR': '/etc/hadoop/conf',
    'YARN_CONF_DIR': '/etc/hadoop/conf',
    'PYSPARK_PYTHON': 'python3.9',
    'PYSPARK_DRIVER_PYTHON': 'python3.9',
    'HADOOP_USER_NAME': 'ssenigov',
})

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [None]:
# Build the Spark configuration step by step: application name, then YARN as
# the master.
conf = SparkConf()
conf.setAppName('CollectSet_List')
conf.setMaster('yarn')

# Obtain a session for that configuration (an existing one is reused by
# getOrCreate) and keep a handle on its SparkContext.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Print the application id next to a label left-justified to 40 characters.
print(f"{'app_id':<40}", sc.applicationId)

In [3]:
# Column names, in the same order as the fields of each tuple in `countries`.
schema = ["country", "country_name", "population", "continent"]

# Sample rows: (short code, country name, population, continent code).
# NOTE(review): population values look like millions of inhabitants — confirm.
countries = [
    ('fra', 'France', 68, 'EU'),
    ('bel', 'Belgium', 11, 'EU'),
    ('usa', 'United States', 334, 'NA'),
    ('swe', 'Sweden', 10, 'EU'),
    ('ita', 'Italy', 59, 'EU'),
    ('chi', 'China', 1411, 'AS'),
    ('ind', 'India', 1425, 'AS'),
    ('bra', 'Brasil', 220, 'SA'),
    ('mex', 'Mexico', 126, 'CA'),
    ('rom', 'Romania', 19, 'EU'),
    ('jpn', 'Japan', 123, 'AS'),
]

# Build the DataFrame from the in-memory rows; column types are left to Spark
# because the schema only supplies names.
df_countries = spark.createDataFrame(countries, schema)

In [4]:
from pyspark.sql.functions import avg, col, count, collect_set, collect_list
from pyspark.sql.types import IntegerType

# One output row per continent with:
#   countries          - the distinct country names collected into an array
#   average_population - the mean population, cast to an integer column
#   number_countries   - how many rows have a country value
# NOTE(review): Spark's documentation describes collect_set as
# non-deterministic in element order, so the arrays may be listed in a
# different order between runs.
df_grouped = (
    df_countries
    .groupBy("continent")
    .agg(
        collect_set(col("country_name")).alias("countries"),
        avg(col("population")).cast(IntegerType()).alias("average_population"),
        count(col("country")).alias("number_countries"),
    )
)

# Display up to 20 rows with long cell values truncated, then the schema.
df_grouped.show(n=20, truncate=True)

df_grouped.printSchema()



+---------+--------------------+------------------+----------------+
|continent|           countries|average_population|number_countries|
+---------+--------------------+------------------+----------------+
|       NA|     [United States]|               334|               1|
|       EU|[France, Italy, S...|                33|               5|
|       SA|            [Brasil]|               220|               1|
|       CA|            [Mexico]|               126|               1|
|       AS|[India, Japan, Ch...|               986|               3|
+---------+--------------------+------------------+----------------+

root
 |-- continent: string (nullable = true)
 |-- countries: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- average_population: integer (nullable = true)
 |-- number_countries: long (nullable = false)



                                                                                

In [5]:
# Stop the Spark session (`spark`) now that the notebook's work is done.
spark.stop()