# SORT() + SORT_ARRAY()

In [21]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the active SparkSession if there is one; otherwise create it under this app name.
spark = SparkSession.builder.appName("example-sort-array").getOrCreate()

In [22]:
# Three sample items as (itemId, attributes, supplier) tuples.
# `attributes` is a Python list of strings, so it becomes an array column.
data = [
    ("1", ["blue", "winter", "cozy"], "Sports Company Inc."),
    ("2", ["red", "summer", "fresh", "cooling"], "YetiX"),
    ("3", ["green", "summer", "travel"], "Sports Company Inc."),
]

# Only column names are given; the column types are inferred from the data.
itemsDf = spark.createDataFrame(data, schema=["itemId", "attributes", "supplier"])

# Print the inferred schema first, then the rows.
itemsDf.printSchema()
itemsDf.show()

root
 |-- itemId: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- supplier: string (nullable = true)

+------+--------------------+-------------------+
|itemId|          attributes|           supplier|
+------+--------------------+-------------------+
|     1|[blue, winter, cozy]|Sports Company Inc.|
|     2|[red, summer, fre...|              YetiX|
|     3|[green, summer, t...|Sports Company Inc.|
+------+--------------------+-------------------+



Which of the following code blocks reorders the values inside the arrays in column `attributes` of DataFrame `itemsDf` in reverse alphabetical order (from the last to the first letter of the alphabet)?

In [20]:
# Overwrite `attributes` with each array sorted in descending order (asc=False);
# truncate=False prints the full array instead of an abbreviated cell.
(
    itemsDf
    .withColumn("attributes", sort_array("attributes", asc=False))
    .show(truncate=False)
)


+------+-----------------------------+-------------------+
|itemId|attributes                   |supplier           |
+------+-----------------------------+-------------------+
|1     |[winter, cozy, blue]         |Sports Company Inc.|
|2     |[summer, red, fresh, cooling]|YetiX              |
|3     |[travel, summer, green]      |Sports Company Inc.|
+------+-----------------------------+-------------------+



In [19]:
# For comparison: sort_array without `asc` sorts each array in ascending order.
(
    itemsDf
    .select(sort_array("attributes"))
    .show(truncate=False)
)

+-----------------------------+
|sort_array(attributes, true) |
+-----------------------------+
|[blue, cozy, winter]         |
|[cooling, fresh, red, summer]|
|[green, summer, travel]      |
+-----------------------------+



In [None]:
# Incorrect answer options, kept to show how each one fails.
# Each option runs inside its own try/except so that the cell still executes to
# the end (e.g. under Restart & Run All) and prints one line per failure,
# instead of aborting at the first error and never reaching the other options.
# The except clauses are deliberately broad: the error itself is the demonstration.

# Observed: Py4JJavaError: An error occurred while calling o113.showString
try:
    itemsDf.withColumn('attributes', sort_array(col('attributes').desc())).show()
except Exception as err:
    # Only the first line is printed; JVM-backed errors carry a long stack trace.
    print(f"{type(err).__name__}: {err}".splitlines()[0])

# Observed: An error occurred while calling o131.showString.
try:
    itemsDf.withColumn('attributes', sort_array(desc('attributes'))).show()
except Exception as err:
    print(f"{type(err).__name__}: {err}".splitlines()[0])

# Observed: NameError: name 'sort' is not defined
try:
    itemsDf.withColumn('attributes', sort(col('attributes'), asc=False))
except Exception as err:
    print(f"{type(err).__name__}: {err}".splitlines()[0])


You are given a DataFrame as shown below.
>
```
+-------+----+----------+
|BatchID|Year|CourseName|
+-------+----+----------+
|     X1|2020|     Scala|
|     X2|2020|    Python|
|     X3|null|      Java|
|     X4|2021|     Scala|
|     X5|null|    Python|
|     X6|2021|     Spark|
+-------+----+----------+
```
>
You want to sort this DataFrame by Year in descending order and you want all the nulls to appear first.
>
You are given some code blocks to do this. Choose the incorrect option.

In [23]:
# Sample batches as (BatchID, Year, CourseName) tuples.
# Year is None for X3 and X5 so that null ordering can be demonstrated.
data = [
    ("X1", 2020, "Scala"),
    ("X2", 2020, "Python"),
    ("X3", None, "Java"),
    ("X4", 2021, "Scala"),
    ("X5", None, "Python"),
    ("X6", 2021, "Spark"),
]

# Only column names are given; the column types are inferred from the data.
df = spark.createDataFrame(data, schema=["BatchID", "Year", "CourseName"])

# Print the inferred schema first, then the rows.
df.printSchema()
df.show()

root
 |-- BatchID: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- CourseName: string (nullable = true)

+-------+----+----------+
|BatchID|Year|CourseName|
+-------+----+----------+
|     X1|2020|     Scala|
|     X2|2020|    Python|
|     X3|null|      Java|
|     X4|2021|     Scala|
|     X5|null|    Python|
|     X6|2021|     Spark|
+-------+----+----------+



In [25]:
# Option: orderBy() with the desc_nulls_first() function.
# Null years come first, then the remaining years in descending order.
(
    df
    .orderBy(desc_nulls_first("Year"))
    .show()
)


+-------+----+----------+
|BatchID|Year|CourseName|
+-------+----+----------+
|     X5|null|    Python|
|     X3|null|      Java|
|     X4|2021|     Scala|
|     X6|2021|     Spark|
|     X2|2020|    Python|
|     X1|2020|     Scala|
+-------+----+----------+



In [26]:
# Option: sort() with the desc_nulls_first() function.
# Null years come first, then the remaining years in descending order.
(
    df
    .sort(desc_nulls_first("Year"))
    .show()
)


+-------+----+----------+
|BatchID|Year|CourseName|
+-------+----+----------+
|     X3|null|      Java|
|     X5|null|    Python|
|     X4|2021|     Scala|
|     X6|2021|     Spark|
|     X1|2020|     Scala|
|     X2|2020|    Python|
+-------+----+----------+



In [29]:
# Option: sort() with the Column method desc_nulls_first() on df.Year.
# Null years come first, then the remaining years in descending order.
(
    df
    .sort(df.Year.desc_nulls_first())
    .show()
)


+-------+----+----------+
|BatchID|Year|CourseName|
+-------+----+----------+
|     X3|null|      Java|
|     X5|null|    Python|
|     X4|2021|     Scala|
|     X6|2021|     Spark|
|     X1|2020|     Scala|
|     X2|2020|    Python|
+-------+----+----------+



In [None]:
# Incorrect option: calling desc_nulls_first by name inside an expr() string fails.
# Observed: AnalysisException: Undefined function: 'desc_nulls_first'.
# This function is neither a registered temporary function nor a permanent function
# registered in the database 'default'.
# The call is wrapped in try/except so the cell displays the error and still lets
# the notebook run to completion. The except clause is deliberately broad: the
# error itself is the demonstration.
try:
    df.orderBy(expr("desc_nulls_first(Year)")).show()
except Exception as err:
    # Only the first line is printed; the full message can span many lines.
    print(f"{type(err).__name__}: {err}".splitlines()[0])