In [1]:
from pyspark.sql import SparkSession

# Build the session handle used by every later cell. getOrCreate() returns the
# already-active session when there is one, so re-running this cell is harmless.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [2]:
from pyspark.sql.types import StringType, ArrayType, StructType, StructField

# Sample rows, one tuple per person, in schema order:
# (name, languagesAtSchool, languagesAtWork, currentState, previousState).
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"], "OH", "CA"),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"], "NY", "NJ"),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"], "UT", "NV"),
]

# Explicit schema: three plain string columns plus two array-of-string columns,
# every field nullable. Built field by field with StructType.add().
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("languagesAtSchool", ArrayType(StringType()), True)
    .add("languagesAtWork", ArrayType(StringType()), True)
    .add("currentState", StringType(), True)
    .add("previousState", StringType(), True)
)

# Materialise the DataFrame, then print its schema and its rows.
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)

+----------------+------------------+---------------+------------+-------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|
+----------------+------------------+---------------+------------+-------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|
+----------------+------------------+---------------+------------+-------------+



In [4]:
# explode(): turn each element of the languagesAtSchool array into its own row,
# repeating the person's name alongside every element.
from pyspark.sql.functions import explode

df.select(
    df["name"],
    explode(df["languagesAtSchool"]).alias("language"),
).show()

+----------------+--------+
|            name|language|
+----------------+--------+
|    James,,Smith|    Java|
|    James,,Smith|   Scala|
|    James,,Smith|     C++|
|   Michael,Rose,|   Spark|
|   Michael,Rose,|    Java|
|   Michael,Rose,|     C++|
|Robert,,Williams|  CSharp|
|Robert,,Williams|      VB|
+----------------+--------+



In [5]:
# Pick a single element out of an array column by indexing it;
# index 0 selects the first entry of languagesAtSchool for every row.
df.select(
    df["name"],
    df["languagesAtSchool"][0],
).show()

+----------------+--------------------+
|            name|languagesAtSchool[0]|
+----------------+--------------------+
|    James,,Smith|                Java|
|   Michael,Rose,|               Spark|
|Robert,,Williams|              CSharp|
+----------------+--------------------+



### array()

Use the `array()` function to create a new array column by merging the data from multiple columns. All input columns must have the same data type.
The example below combines the data from `currentState` and `previousState` into a new column named `States`.



In [6]:
from pyspark.sql.functions import array

# Pack the two state columns into one array column, aliased "States".
df.select(
    df["name"],
    array(df["currentState"], df["previousState"]).alias("States"),
).show()


+----------------+--------+
|            name|  States|
+----------------+--------+
|    James,,Smith|[OH, CA]|
|   Michael,Rose,|[NY, NJ]|
|Robert,,Williams|[UT, NV]|
+----------------+--------+



In [7]:
# array_contains()
# The array_contains() SQL function checks whether an array column contains a given value.
# It returns null if the array is null, true if the array contains the value, and false otherwise.

In [8]:
from pyspark.sql.functions import array_contains

# Per-row flag: does the languagesAtSchool array include the value "Java"?
df.select(
    df["name"],
    array_contains(df["languagesAtSchool"], "Java").alias("array_contains"),
).show()

+----------------+--------------+
|            name|array_contains|
+----------------+--------------+
|    James,,Smith|          true|
|   Michael,Rose,|          true|
|Robert,,Williams|         false|
+----------------+--------------+

