# array() + array_contains()

In [14]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import array_contains, col, array, array_contains, explode
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, FloatType
from pyspark.sql import functions as F
from functools import reduce

# Obtain the Spark session for this notebook: getOrCreate() hands back an
# already-running session if one exists, otherwise it starts a new one under
# the application name given here.
spark = SparkSession.builder.appName("example-07").getOrCreate()

In [9]:
# Array-of-strings type whose elements may not be null (containsNull=False).
# It is not referenced by the schema below, which declares its own array types.
arrayCol = ArrayType(StringType(), containsNull=False)

# One tuple per person, in schema order:
# (name, languagesAtSchool, languagesAtWork, currentState, previousState, grade)
data = [
    ("James,,Smith", ["Java", "Scala", "C++"], ["Spark", "Java"], "OH", "CA", [1.0, 2.1, 3.2, 4.3, 5.4]),
    ("Michael,Rose,", ["Spark", "Java", "C++"], ["Spark", "Java"], "NY", "NJ", [2.0, 3.1, 4.2, 5.3, 6.4]),
    ("Robert,,Williams", ["CSharp", "VB"], ["Spark", "Python"], "UT", "NV", [3.0, 4.1, 5.2, 6.3, 7.4]),
]

# Explicit schema, built field by field. Every column is nullable, and the
# array columns keep the default containsNull=True for their elements.
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("languagesAtSchool", ArrayType(StringType()), True)
    .add("languagesAtWork", ArrayType(StringType()), True)
    .add("currentState", StringType(), True)
    .add("previousState", StringType(), True)
    .add("grade", ArrayType(FloatType()), True)
)

df = spark.createDataFrame(data, schema)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)
 |-- grade: array (nullable = true)
 |    |-- element: float (containsNull = true)

+----------------+------------------+---------------+------------+-------------+--------------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|               grade|
+----------------+------------------+---------------+------------+-------------+--------------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|[1.0, 2.1, 3.2, 4...|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|[2.0, 3.1, 4.2, 5...|
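|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|[3.0, 4.1, 5.2, 6...|
+----------------+------------------+---------------+------------+-------------+--------------------+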

In [19]:
# Length of the longest `grade` array in the frame; this decides how many
# positional columns (val_1 .. val_n) are produced below.
n = df.select(F.max(F.size("grade")).alias("n")).first()["n"]

# 1-based positions used in the generated column names.
positions = range(1, n + 1)

# Spread the array out: one column per array position, named val_<position>.
grade_cols = [F.col("grade")[pos - 1].alias(f"val_{pos}") for pos in positions]
df1 = df.select("name", *grade_cols)

# Average every positional column per name. `grade` elements are declared as
# FloatType in the schema, which is why the output shows values such as
# 2.0999999046325684 rather than 2.1.
mean_cols = [F.mean(f"val_{pos}").alias(f"average_{pos}") for pos in positions]
df2 = df1.groupby("name").agg(*mean_cols)
df2.show()


+----------------+---------+------------------+-----------------+-----------------+-----------------+
|            name|average_1|         average_2|        average_3|        average_4|        average_5|
+----------------+---------+------------------+-----------------+-----------------+-----------------+
|    James,,Smith|      1.0|2.0999999046325684|3.200000047683716|4.300000190734863|5.400000095367432|
|   Michael,Rose,|      2.0|3.0999999046325684|4.199999809265137|5.300000190734863|6.400000095367432|
|Robert,,Williams|      3.0| 4.099999904632568|5.199999809265137|6.300000190734863|7.400000095367432|
+----------------+---------+------------------+-----------------+-----------------+-----------------+



In [21]:
# Turn the wide average_1 .. average_n columns back into a long layout:
# build one (name, average) frame per position, drop rows whose average is
# null, then stack the frames on top of each other in position order.
per_position = [
    df2.select("name", F.col(f"average_{pos}").alias("average")).na.drop(subset=["average"])
    for pos in range(1, n + 1)
]
df_new = reduce(lambda stacked, nxt: stacked.union(nxt), per_position)
df_new.show()

+----------------+------------------+
|            name|           average|
+----------------+------------------+
|    James,,Smith|               1.0|
|   Michael,Rose,|               2.0|
|Robert,,Williams|               3.0|
|    James,,Smith|2.0999999046325684|
|   Michael,Rose,|3.0999999046325684|
|Robert,,Williams| 4.099999904632568|
|    James,,Smith| 3.200000047683716|
|   Michael,Rose,| 4.199999809265137|
|Robert,,Williams| 5.199999809265137|
|    James,,Smith| 4.300000190734863|
|   Michael,Rose,| 5.300000190734863|
|Robert,,Williams| 6.300000190734863|
|    James,,Smith| 5.400000095367432|
|   Michael,Rose,| 6.400000095367432|
|Robert,,Williams| 7.400000095367432|
+----------------+------------------+



In [3]:
# array(): combine the two state columns into a single array column "States".
states = array("currentState", "previousState").alias("States")
df.select("name", states).show()

+----------------+--------+
|            name|  States|
+----------------+--------+
|    James,,Smith|[OH, CA]|
|   Michael,Rose,|[NY, NJ]|
|Robert,,Williams|[UT, NV]|
+----------------+--------+



In [4]:
# array_contains(): boolean column that is true when the languagesAtSchool
# array holds the value "Java".
knows_java = array_contains("languagesAtSchool", "Java").alias("array_contains")
df.select("name", knows_java).show()

+----------------+--------------+
|            name|array_contains|
+----------------+--------------+
|    James,,Smith|          true|
|   Michael,Rose,|          true|
|Robert,,Williams|         false|
+----------------+--------------+



The code block shown below should return a DataFrame with two columns, itemId and col. 

In this DataFrame, for each element in column attributes of DataFrame itemsDf there should be a separate row in which the column itemId contains the associated itemId from DataFrame itemsDf. 

The new DataFrame should only contain rows derived from those rows of DataFrame itemsDf in which the column attributes contains the element cozy.

A sample of DataFrame itemsDf is below.
+------+-----------------------------+-------------------+
|itemId|attributes                   |supplier           |
+------+-----------------------------+-------------------+
|1     |[blue, winter, cozy]         |Sports Company Inc.|
|2     |[red, summer, fresh, cooling]|YetiX              |
|3     |[green, summer, travel]      |Sports Company Inc.|
+------+-----------------------------+-------------------+

Code block:
itemsDf.__1__(__2__).__3__(__4__, __5__(__6__))

In [5]:
# Sample rows for the exercise above, each as (itemId, attributes, supplier).
data = [
    ("1", ["blue", "winter", "cozy"], "Sports Company Inc."),
    ("2", ["red", "summer", "fresh", "cooling"], "YetiX"),
    ("3", ["green", "summer", "travel"], "Sports Company Inc."),
]

# Only column names are supplied; Spark infers the column types from the data.
itemsDf = spark.createDataFrame(data, ["itemId", "attributes", "supplier"])
itemsDf.printSchema()
itemsDf.show()

root
 |-- itemId: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- supplier: string (nullable = true)

+------+--------------------+-------------------+
|itemId|          attributes|           supplier|
+------+--------------------+-------------------+
|     1|[blue, winter, cozy]|Sports Company Inc.|
|     2|[red, summer, fre...|              YetiX|
|     3|[green, summer, t...|Sports Company Inc.|
+------+--------------------+-------------------+



In [6]:
# Solution to the exercise: keep only items whose attributes include 'cozy',
# then emit one row per attribute next to its itemId. explode() names the
# generated column "col" by default, as the displayed result shows.
(
    itemsDf
    .filter("array_contains(attributes, 'cozy')")
    .select("itemId", explode("attributes"))
)

DataFrame[itemId: string, col: string]

In [7]:
# Deliberately wrong answer to the exercise: `map` is not imported from
# pyspark.sql.functions in this notebook, so the name resolves to Python's
# built-in map(), which rejects a single argument with a TypeError.
# The error is caught and printed so the demonstration stays visible without
# aborting a Restart & Run All of the notebook.
try:
    itemsDf.filter("array_contains(attributes, 'cozy')").select("itemId", map("attributes"))
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: map() must have at least two arguments.