# PySpark Column Functions

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = (
    SparkSession.builder
    .appName("example-columns-functions")
    .getOrCreate()
)

In [2]:
data=[("James", "Bond", "100", None),
      ("Ann", "Varsa", "200", 'F'),
      ("Tom Cruise", "XXX", "400",''),
      ("Tom Brand", None, "400", 'M')] 

columns = ["fname", "lname", "id", "gender"]

df = spark.createDataFrame(data,columns)

In [3]:
# alias() – Set’s name to Column
df.select(df.fname.alias("first_name"), df.lname.alias("last_name")).show()

# Another example
df.select(expr(" fname ||','|| lname").alias("fullName")).show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     James|     Bond|
|       Ann|    Varsa|
|Tom Cruise|      XXX|
| Tom Brand|     null|
+----------+---------+

+--------------+
|      fullName|
+--------------+
|    James,Bond|
|     Ann,Varsa|
|Tom Cruise,XXX|
|          null|
+--------------+



In [4]:
# asc() & desc() – Sort the DataFrame columns by Ascending or Descending order.
df.sort(df.fname.asc()).show()
df.sort(df.fname.desc()).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|       Ann|Varsa|200|     F|
|     James| Bond|100|  null|
| Tom Brand| null|400|     M|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| null|400|     M|
|     James| Bond|100|  null|
|       Ann|Varsa|200|     F|
+----------+-----+---+------+



In [5]:
# asc_nulls_first
df.select(df.lname).orderBy(df.lname.asc_nulls_first()).show()

# desc_nulls_first
df.select(df.lname).orderBy(df.lname.desc_nulls_first()).show()

+-----+
|lname|
+-----+
| null|
| Bond|
|Varsa|
|  XXX|
+-----+

+-----+
|lname|
+-----+
| null|
|  XXX|
|Varsa|
| Bond|
+-----+



In [6]:
# asc_nulls_last
df.select(df.lname).orderBy(df.lname.asc_nulls_last()).show()

# desc_nulls_last
df.select(df.lname).orderBy(df.lname.desc_nulls_last()).show()

+-----+
|lname|
+-----+
| Bond|
|Varsa|
|  XXX|
| null|
+-----+

+-----+
|lname|
+-----+
|  XXX|
|Varsa|
| Bond|
| null|
+-----+



In [7]:
# sort asc column lname with null values last
df.sort(asc_nulls_last("lname")).show()

# sort desc column lname with null values last
df.sort("lname", ascending=False).show()

#df.orderBy("lname").desc_nulls_last().show() ### Error
#df.desc_nulls_last("lname").show() ### Error
#df.orderBy("lname").asc_nulls_last().show() ### Error

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  null|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
| Tom Brand| null|400|     M|
+----------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
|       Ann|Varsa|200|     F|
|     James| Bond|100|  null|
| Tom Brand| null|400|     M|
+----------+-----+---+------+



In [8]:
# cast() & astype() – Used to convert the data Type
df.select(df.fname,df.id.cast("int")).printSchema()

root
 |-- fname: string (nullable = true)
 |-- id: integer (nullable = true)



In [9]:
# between() – Returns a Boolean expression when a column values in between lower and upper bound.
df.filter(df.id.between(100,300)).show()

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  null|
|  Ann|Varsa|200|     F|
+-----+-----+---+------+



In [10]:
# contains() – Checks if a DataFrame column value contains a a value specified in this function.
df.filter(df.fname.contains("Cruise")).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [11]:
# startswith() & endswith() – Checks if the value of the DataFrame Column starts and ends with a String respectively
df.filter(df.fname.startswith("T")).show()
df.filter(df.fname.endswith("Cruise")).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| null|400|     M|
+----------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [12]:
# isNull & isNotNull() – Checks if the DataFrame column has NULL or non NULL values.
df.filter(df.lname.isNull()).show()
df.filter(df.lname.isNotNull()).show()

+---------+-----+---+------+
|    fname|lname| id|gender|
+---------+-----+---+------+
|Tom Brand| null|400|     M|
+---------+-----+---+------+

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  null|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [13]:
# like() & rlike() – Similar to SQL LIKE expression
df.select(df.fname,df.lname,df.id).filter(df.fname.like("%om")).show()

+-----+-----+---+
|fname|lname| id|
+-----+-----+---+
+-----+-----+---+



In [14]:
# substr() – Returns a Column after getting sub string from the Column
df.select(df.fname.substr(1,2).alias("substr")).show()

+------+
|substr|
+------+
|    Ja|
|    An|
|    To|
|    To|
+------+



In [15]:
# when() & otherwise() – It is similar to SQL Case When, executes sequence of expressions until it matches the condition and returns a value when match.
df.select(df.fname,df.lname, when(df.gender=="M","Male") \
                            .when(df.gender=="F","Female") \
                            .when(df.gender==None ,"") \
                            .otherwise(df.gender).alias("new_gender") \
         ).show()

+----------+-----+----------+
|     fname|lname|new_gender|
+----------+-----+----------+
|     James| Bond|      null|
|       Ann|Varsa|    Female|
|Tom Cruise|  XXX|          |
| Tom Brand| null|      Male|
+----------+-----+----------+



In [16]:
# isin() – Check if value presents in a List.
li=["100","200"]

df.select(df.fname,df.lname,df.id) \
  .filter(df.id.isin(li)) \
  .show()

+-----+-----+---+
|fname|lname| id|
+-----+-----+---+
|James| Bond|100|
|  Ann|Varsa|200|
+-----+-----+---+



In [17]:
data=[(("James","Bond"),["Java","C#"],{'hair':'black','eye':'brown'}),
      (("Ann","Varsa"),[".NET","Python"],{'hair':'brown','eye':'black'}),
      (("Tom Cruise",""),["Python","Scala"],{'hair':'red','eye':'grey'}),
      (("Tom Brand",None),["Perl","Ruby"],{'hair':'black','eye':'blue'})]

schema = StructType([
                        StructField('name', StructType([
                                                            StructField('fname', StringType(), True),
                                                            StructField('lname', StringType(), True)]
                                                        )   
                                    ),
                        StructField('languages', ArrayType(StringType()),True),
                        StructField('properties', MapType(StringType(),StringType()),True)
                    ])

df2 = spark.createDataFrame(data,schema)
df2.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [20]:
# getField() – To get the value by key from MapType column and by stuct child name from StructType column
df2.select(df2.properties.getField("hair")).show()

# getField from Struct
df2.select(df2.name.getField("fname")).show()

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
|Tom Cruise|
| Tom Brand|
+----------+



In [2]:
data = [(1, 3, 4, 25, 1, None, 1587915332),
         (2, 6, 7, 2, 2, None, 1586815312),
         (3, 3, None, 25, 3, None, 1585824821),
         (4, None, None, 3, 2, None, 1583244275),
         (5, None, None, None, 2, None, 1575285427),
         (6, 3, 2, 25, 2, None, 1572733275)]

schema = StructType([StructField('transactionId', IntegerType(), True),
                     StructField('predError', IntegerType(), True),
                     StructField('value', IntegerType(), True),
                     StructField('storeId', IntegerType(), True),
                     StructField('productId', IntegerType(), True),
                     StructField('f', IntegerType(), True),
                     StructField('transactionDate', LongType(), True)])

transactionsDf = spark.createDataFrame(data=data, schema=schema)

Which of the following code blocks displays the 10 rows with the smallest values of column value in DataFrame transactionsDf in a nicely formatted way?

- `transactionsDf.sort(asc(value)).show(10)`
- `transactionsDf.sort(col("value")).show(10)`
- `transactionsDf.sort(col("value").desc()).head()`
- `transactionsDf.sort(col("value").asc()).print(10)`
- `transactionsDf.orderBy("value").asc().show(10)`

In [35]:
# ascending:  default True. 
transactionsDf.sort(col("value")).show(10)

# return the same result
transactionsDf.sort(col("value").asc()).show(10)

# return the same result
transactionsDf.sort(asc("value")).show(10)


+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            4|     null| null|      3|        2|null|     1583244275|
|            3|        3| null|     25|        3|null|     1585824821|
|            5|     null| null|   null|        2|null|     1575285427|
|            6|        3|    2|     25|        2|null|     1572733275|
|            1|        3|    4|     25|        1|null|     1587915332|
|            2|        6|    7|      2|        2|null|     1586815312|
+-------------+---------+-----+-------+---------+----+---------------+

+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            4|     null| null|      3|        2|null|     1583244275|
|    

In [28]:
transactionsDf.sort(col("value").desc()).head()

Row(transactionId=2, predError=6, value=7, storeId=2, productId=2, f=None, transactionDate=1586815312)

In [27]:
# NameError: name 'value' is not defined
transactionsDf.sort(asc(value)).show(10)

# AttributeError: 'DataFrame' object has no attribute 'print'
transactionsDf.sort(col("value").asc()).print(10)

# AttributeError: 'DataFrame' object has no attribute 'asc'
transactionsDf.orderBy("value").asc().show(10)

NameError: name 'value' is not defined

In [4]:
transactionsDf.sort("storeId", asc("productId")).show()


+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            5|     null| null|   null|        2|null|     1575285427|
|            2|        6|    7|      2|        2|null|     1586815312|
|            4|     null| null|      3|        2|null|     1583244275|
|            1|        3|    4|     25|        1|null|     1587915332|
|            6|        3|    2|     25|        2|null|     1572733275|
|            3|        3| null|     25|        3|null|     1585824821|
+-------------+---------+-----+-------+---------+----+---------------+



In [8]:
transactionsDf.sort("storeId", desc("productId")).show()


+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            5|     null| null|   null|        2|null|     1575285427|
|            2|        6|    7|      2|        2|null|     1586815312|
|            4|     null| null|      3|        2|null|     1583244275|
|            3|        3| null|     25|        3|null|     1585824821|
|            6|        3|    2|     25|        2|null|     1572733275|
|            1|        3|    4|     25|        1|null|     1587915332|
+-------------+---------+-----+-------+---------+----+---------------+



In [10]:
transactionsDf.sort("storeId").sort(desc("productId")).show()

+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            3|        3| null|     25|        3|null|     1585824821|
|            4|     null| null|      3|        2|null|     1583244275|
|            5|     null| null|   null|        2|null|     1575285427|
|            6|        3|    2|     25|        2|null|     1572733275|
|            2|        6|    7|      2|        2|null|     1586815312|
|            1|        3|    4|     25|        1|null|     1587915332|
+-------------+---------+-----+-------+---------+----+---------------+



In [None]:
# NameError: name 'storeId' is not defined
transactionsDf.sort(col(storeId)).desc(col(productId))

# AttributeError: 'DataFrame' object has no attribute 'order_by'
transactionsDf.order_by(col(storeId), desc(col(productId)))
