## Pyspark Functions
1. array_contains
2. split
3. cast
4. explode
5. map_keys
6. map_values
7. when-otherwise
8. alias
9. asc
10. desc
11. like

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (getOrCreate reuses an existing
# session when there is one) under the application name below.
spark = (
    SparkSession.builder
    .appName('pyspark-by-examples')
    .getOrCreate()
)

# Sample rows as (name, known languages, properties) tuples.
# The rows include a None element, empty strings, a missing array/map and an
# empty map, so the outputs below show how each function treats those cases.
arrayData = [
    ('James', ['Java', 'Scala'], {'hair': 'black', 'eye': 'brown'}),
    ('Michael', ['Spark', 'Java', None], {'hair': 'brown', 'eye': None}),
    ('Robert', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
    ('Washington', None, None),
    ('Jefferson', ['1', '2'], {}),
]

# Build the demo DataFrame. Only column names are supplied, so the column
# types (string, array of strings, map of string to string) are inferred
# from the Python values.
df = spark.createDataFrame(
    data=arrayData,
    schema=['name', 'knownLanguages', 'properties'],
)
# Print the inferred schema, then every row without truncating wide cells.
df.printSchema()
df.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+----------+-------------------+-----------------------------+
|name      |knownLanguages     |properties                   |
+----------+-------------------+-----------------------------+
|James     |[Java, Scala]      |{eye -> brown, hair -> black}|
|Michael   |[Spark, Java, NULL]|{eye -> NULL, hair -> brown} |
|Robert    |[CSharp, ]         |{eye -> , hair -> red}       |
|Washington|NULL               |NULL                         |
|Jefferson |[1, 2]             |{}                           |
+----------+-------------------+-----------------------------+



In [2]:
## syntax : array_contains(column_name, value) -> boolean type
## array_contains checks whether the value is present in the array column:
## it returns true if present, false if absent, and NULL when the array itself is NULL.
# NOTE: later cells use names brought in by this wildcard import
# (split, explode, map_keys, map_values, when), so it is kept as-is.
from pyspark.sql.functions import *
df1 = df.withColumn('Array_contains_java', array_contains(col('knownLanguages'), 'Java'))
# printSchema() and show() write to stdout themselves and return None, so they
# are called directly; wrapping them in print() emits a stray "None" line.
df1.printSchema()
df1.show(truncate=False)

root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- Array_contains_java: boolean (nullable = true)

None
+----------+-------------------+-----------------------------+-------------------+
|name      |knownLanguages     |properties                   |Array_contains_java|
+----------+-------------------+-----------------------------+-------------------+
|James     |[Java, Scala]      |{eye -> brown, hair -> black}|true               |
|Michael   |[Spark, Java, NULL]|{eye -> NULL, hair -> brown} |true               |
|Robert    |[CSharp, ]         |{eye -> , hair -> red}       |false              |
|Washington|NULL               |NULL                         |NULL               |
|Jefferson |[1, 2]             |{}                           |false              |
+----------+-------------------+-----------------------------+-------------------+

In [3]:
# Sample person rows as (firstname, middlename, lastname, dob) tuples.
# dob is a 'YYYY-MM-DD' string; some middle/last names are empty strings.
# (The original statement repeated the assignment target as `data=data = [...]`;
# a single assignment binds the same list.)
data = [
    ('James', '', 'Smith', '1991-04-01'),
    ('Michael', 'Rose', '', '2000-05-19'),
    ('Robert', '', 'Williams', '1978-09-05'),
    ('Maria', 'Anne', 'Jones', '1967-12-01'),
    ('Jen', 'Mary', 'Brown', '1980-02-17'),
]

# Column names, in the same order as the tuple fields above.
columns = ["firstname", "middlename", "lastname", "dob"]
# Create the people DataFrame; every column, including dob, is inferred as a string.
df2 = spark.createDataFrame(data=data, schema=columns)
df2.printSchema()
df2.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)

+---------+----------+--------+----------+
|firstname|middlename|lastname|dob       |
+---------+----------+--------+----------+
|James    |          |Smith   |1991-04-01|
|Michael  |Rose      |        |2000-05-19|
|Robert   |          |Williams|1978-09-05|
|Maria    |Anne      |Jones   |1967-12-01|
|Jen      |Mary      |Brown   |1980-02-17|
+---------+----------+--------+----------+



In [4]:
## syntax: split(column_name, sep_value) -> array
# Split dob on '-' into [year, month, day]. Single elements are read from the
# array either with getItem(i) or with [i]; both forms are shown below.
dob_parts = split(col('dob'), '-')
df3 = (
    df2
    .withColumn('split_dob', dob_parts)
    .withColumn('year', dob_parts.getItem(0))
    .withColumn('month', dob_parts[1])
    .withColumn('day', dob_parts.getItem(2))
)
df3.printSchema()
df3.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- split_dob: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)

+---------+----------+--------+----------+--------------+----+-----+---+
|firstname|middlename|lastname|dob       |split_dob     |year|month|day|
+---------+----------+--------+----------+--------------+----+-----+---+
|James    |          |Smith   |1991-04-01|[1991, 04, 01]|1991|04   |01 |
|Michael  |Rose      |        |2000-05-19|[2000, 05, 19]|2000|05   |19 |
|Robert   |          |Williams|1978-09-05|[1978, 09, 05]|1978|09   |05 |
|Maria    |Anne      |Jones   |1967-12-01|[1967, 12, 01]|1967|12   |01 |
|Jen      |Mary      |Brown   |1980-02-17|[1980, 02, 17]|1980|02   |17 |
+---------+----------+--------+----------+--------------+----+-----+---+

In [5]:
## syntax: column_name.cast(data_type)
# Convert the string date parts to integers. The type is passed by name; the
# printed schema shows 'integer' for all three columns, whichever spelling is used.
df4 = (
    df3
    .withColumn('year', col('year').cast('integer'))
    .withColumn('month', col('month').cast('integer'))
    .withColumn('day', col('day').cast('Integer'))
)
df4.printSchema()


root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- split_dob: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



In [6]:
## 4. explode - one output row per element of the array column
# The generated column gets the default name 'col'. A row whose array is NULL
# (Washington here) produces no output rows at all.
# NOTE(review): this rebinds df4, which the cast cell also assigns.
df4 = df.select(
    'name',
    'properties',
    explode(col('knownLanguages')),
)
df4.show(truncate=False)

+---------+-----------------------------+------+
|name     |properties                   |col   |
+---------+-----------------------------+------+
|James    |{eye -> brown, hair -> black}|Java  |
|James    |{eye -> brown, hair -> black}|Scala |
|Michael  |{eye -> NULL, hair -> brown} |Spark |
|Michael  |{eye -> NULL, hair -> brown} |Java  |
|Michael  |{eye -> NULL, hair -> brown} |NULL  |
|Robert   |{eye -> , hair -> red}       |CSharp|
|Robert   |{eye -> , hair -> red}       |      |
|Jefferson|{}                           |1     |
|Jefferson|{}                           |2     |
+---------+-----------------------------+------+



In [7]:
## 5. map_keys - returns all keys of a map column as an array
# A NULL map gives NULL; an empty map gives an empty array.
df5 = df.withColumn(
    'keys',
    map_keys(col('properties')),
)
df5.show(truncate=False)

+----------+-------------------+-----------------------------+-----------+
|name      |knownLanguages     |properties                   |keys       |
+----------+-------------------+-----------------------------+-----------+
|James     |[Java, Scala]      |{eye -> brown, hair -> black}|[eye, hair]|
|Michael   |[Spark, Java, NULL]|{eye -> NULL, hair -> brown} |[eye, hair]|
|Robert    |[CSharp, ]         |{eye -> , hair -> red}       |[eye, hair]|
|Washington|NULL               |NULL                         |NULL       |
|Jefferson |[1, 2]             |{}                           |[]         |
+----------+-------------------+-----------------------------+-----------+



In [8]:
## 6. map_values - returns all values of a map column as an array
# NULL values inside the map are kept as NULL elements; a NULL map gives NULL.
df6 = df.withColumn(
    'values',
    map_values(col('properties')),
)
df6.show(truncate=False)

+----------+-------------------+-----------------------------+--------------+
|name      |knownLanguages     |properties                   |values        |
+----------+-------------------+-----------------------------+--------------+
|James     |[Java, Scala]      |{eye -> brown, hair -> black}|[brown, black]|
|Michael   |[Spark, Java, NULL]|{eye -> NULL, hair -> brown} |[NULL, brown] |
|Robert    |[CSharp, ]         |{eye -> , hair -> red}       |[, red]       |
|Washington|NULL               |NULL                         |NULL          |
|Jefferson |[1, 2]             |{}                           |[]            |
+----------+-------------------+-----------------------------+--------------+



In [10]:
## 7. when-otherwise - conditional column values
# Rows where the condition is true get "True"; every other row, including the
# one whose array is NULL, falls through to otherwise() and gets "False".
# The new column holds strings, not booleans.
df7 = df.withColumn(
    'java_contains',
    when(array_contains(col('knownLanguages'), 'Java'), "True").otherwise("False"),
)
df7.show(truncate=False)

+----------+-------------------+-----------------------------+-------------+
|name      |knownLanguages     |properties                   |java_contains|
+----------+-------------------+-----------------------------+-------------+
|James     |[Java, Scala]      |{eye -> brown, hair -> black}|True         |
|Michael   |[Spark, Java, NULL]|{eye -> NULL, hair -> brown} |True         |
|Robert    |[CSharp, ]         |{eye -> , hair -> red}       |False        |
|Washington|NULL               |NULL                         |False        |
|Jefferson |[1, 2]             |{}                           |False        |
+----------+-------------------+-----------------------------+-------------+



In [15]:
## 8. alias - give a column a different name in the result
# java_contains is selected under the new name java_know; the other two
# columns keep their names.
df8 = df7.select(
    'name',
    'knownLanguages',
    df7['java_contains'].alias('java_know'),
)
df8.show(truncate=False)

+----------+-------------------+---------+
|name      |knownLanguages     |java_know|
+----------+-------------------+---------+
|James     |[Java, Scala]      |True     |
|Michael   |[Spark, Java, NULL]|True     |
|Robert    |[CSharp, ]         |False    |
|Washington|NULL               |False    |
|Jefferson |[1, 2]             |False    |
+----------+-------------------+---------+



In [16]:
# Person rows extended with gender and salary:
# (firstname, middlename, lastname, dob, gender, salary).
# NOTE(review): this rebinds `data` and `columns` from the earlier people cell.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

# Column names, in the same order as the tuple fields above.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]

# DataFrame used by the sorting and filtering examples below.
df9 = spark.createDataFrame(
    data=data,
    schema=columns,
)

In [17]:
# Display every row of df9 without truncating cell contents.
df9.show(truncate=False)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|dob       |gender|salary|
+---------+----------+--------+----------+------+------+
|James    |          |Smith   |1991-04-01|M     |3000  |
|Michael  |Rose      |        |2000-05-19|M     |4000  |
|Robert   |          |Williams|1978-09-05|M     |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F     |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F     |-1    |
+---------+----------+--------+----------+------+------+



In [33]:
## 9. asc - sort() with no explicit direction orders ascending (lowest salary first)
df9.sort(df9.salary).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
+---------+----------+--------+----------+------+------+



In [32]:
## 9. asc - orderBy() with no explicit direction also orders ascending
# Rows with the same salary may appear in a different relative order from run
# to run, as the outputs of the sorting cells in this notebook show.
df9.orderBy(df9.salary).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
|    James|          |   Smith|1991-04-01|     M|  3000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
+---------+----------+--------+----------+------+------+



In [30]:
## 9. asc - ascending order requested explicitly with Column.asc()
df9.sort(
    df9['salary'].asc()
).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
|    James|          |   Smith|1991-04-01|     M|  3000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
+---------+----------+--------+----------+------+------+



In [38]:
## 10. desc - descending order
# Sort by firstname descending; salary descending is the secondary key.
df9.sort(
    df9['firstname'].desc(),
    df9['salary'].desc(),
).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|   Robert|          |Williams|1978-09-05|     M|  4000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
|    James|          |   Smith|1991-04-01|     M|  3000|
+---------+----------+--------+----------+------+------+



In [39]:
## 11. like - SQL-style pattern match
# Keep only the rows whose firstname starts with the letter 'M'
# ('%' matches any sequence of characters).
df9.filter(
    df9['firstname'].like('M%')
).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
+---------+----------+--------+----------+------+------+



In [40]:
# filter - keep only the rows where gender equals 'M'
df9.filter(
    df9['gender'] == 'M'
).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
+---------+----------+--------+----------+------+------+



In [41]:
# where - same condition as the filter cell, and the same rows come back
df9.where(
    df9['gender'] == 'M'
).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
+---------+----------+--------+----------+------+------+

