# EXPLODE() + SELECT() + SPLIT()

Question

The code block displayed below contains an error. The code block should create DataFrame itemsAttributesDf which has columns itemId and attribute and lists every attribute from the attributes column in DataFrame itemsDf next to the itemId of the respective row in itemsDf. Find the error.

A sample of DataFrame itemsDf is below.
>

- `+------+-----------------------------+-------------------+`
- `|itemId|attributes                   |supplier           |`
- `+------+-----------------------------+-------------------+`
- `|1     |[blue, winter, cozy]         |Sports Company Inc.|`
- `|2     |[red, summer, fresh, cooling]|YetiX              |`
- `|3     |[green, summer, travel]      |Sports Company Inc.|`
- `+------+-----------------------------+-------------------+`

Code block:

`itemsAttributesDf = itemsDf.explode("attributes").alias("attribute").select("attribute", "itemId")`

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the Spark session (getOrCreate) that the cells in this notebook run against.
spark = SparkSession.builder.appName("example-explode-select-split").getOrCreate()

In [2]:
# Sample rows in (itemId, attributes, supplier) order; itemId is given as a string.
data = [
    ("1", ["blue", "winter", "cozy"], "Sports Company Inc."),
    ("2", ["red", "summer", "fresh", "cooling"], "YetiX"),
    ("3", ["green", "summer", "travel"], "Sports Company Inc."),
]

# Only column names are supplied here; the printed schema shows the resulting types.
itemsDf = spark.createDataFrame(data=data, schema=["itemId", "attributes", "supplier"])
itemsDf.printSchema()
itemsDf.show()

root
 |-- itemId: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- supplier: string (nullable = true)

+------+--------------------+-------------------+
|itemId|          attributes|           supplier|
+------+--------------------+-------------------+
|     1|[blue, winter, cozy]|Sports Company Inc.|
|     2|[red, summer, fre...|              YetiX|
|     3|[green, summer, t...|Sports Company Inc.|
+------+--------------------+-------------------+



In [None]:
# Erroneous code block from the question, kept verbatim to reproduce the failure.
# AttributeError: 'DataFrame' object has no attribute 'explode'
# explode() is a column function (imported from pyspark.sql.functions), not a DataFrame
# method, so it has to be applied to the column inside select().
itemsAttributesDf = itemsDf.explode("attributes").alias("attribute").select("attribute", "itemId")

In [3]:
# Corrected version: explode() is called inside select(), so every element of the
# attributes array becomes its own row (aliased "attribute") next to the row's itemId.
itemsAttributesDf = itemsDf.select(explode("attributes").alias("attribute"),"itemId")
itemsAttributesDf.printSchema()
itemsAttributesDf.show()

root
 |-- attribute: string (nullable = true)
 |-- itemId: string (nullable = true)

+---------+------+
|attribute|itemId|
+---------+------+
|     blue|     1|
|   winter|     1|
|     cozy|     1|
|      red|     2|
|   summer|     2|
|    fresh|     2|
|  cooling|     2|
|    green|     3|
|   summer|     3|
|   travel|     3|
+---------+------+



In [4]:
# Display the source DataFrame again as a reference for the next exercises.
itemsDf.show()

+------+--------------------+-------------------+
|itemId|          attributes|           supplier|
+------+--------------------+-------------------+
|     1|[blue, winter, cozy]|Sports Company Inc.|
|     2|[red, summer, fre...|              YetiX|
|     3|[green, summer, t...|Sports Company Inc.|
+------+--------------------+-------------------+



In [6]:
# Erroneous attempt at excluding columns, kept verbatim to reproduce the failure.
# AnalysisException: cannot resolve '(NOT `attributes`)' due to data type mismatch: argument 1 requires boolean type, however, '`attributes`' is of array<string> type.
# On a column, ~ is a logical NOT (see "NOT attributes" in the error), not a way to
# leave the column out of the selection.
itemsDf.select(~col('attributes'), ~col('supplier'))

AnalysisException: cannot resolve '(NOT `attributes`)' due to data type mismatch: argument 1 requires boolean type, however, '`attributes`' is of array<string> type.;;
'Project [NOT attributes#1 AS (NOT attributes)#53, NOT supplier#2 AS (NOT supplier)#54]
+- LogicalRDD [itemId#0, attributes#1, supplier#2], false


In [7]:
# The code block is intended to return all columns of DataFrame except for columns attributes and supplier
# drop() takes the names of the columns to leave out; only itemId remains in the result.
itemsDf.drop('attributes','supplier').show()

+------+
|itemId|
+------+
|     1|
|     2|
|     3|
+------+



In [5]:
# Explode attributes into one row per element (column "attributes_exploded"),
# then keep only the values that contain the letter "i".
itemsDf.select(explode("attributes").alias("attributes_exploded")).filter(col("attributes_exploded").contains("i")).show()

+-------------------+
|attributes_exploded|
+-------------------+
|             winter|
|            cooling|
+-------------------+



In [None]:
# Incorrect answer options, kept verbatim; each statement fails on its own with the
# error quoted above it.

# NameError: name 'attributes_exploded' is not defined
# (the filter argument is a bare Python name; the column has to be referenced with col("attributes_exploded"))
itemsDf.select(explode("attributes").alias("attributes_exploded")).filter(attributes_exploded.contains("i"))

# AttributeError: 'DataFrame' object has no attribute 'explode'
# (explode() is a column function used inside select(), not a DataFrame method)
itemsDf.explode(attributes).alias("attributes_exploded").filter(col("attributes_exploded").contains("i"))

# AttributeError: 'str' object has no attribute 'contains'
# (contains() is called on a plain Python string here instead of on col("..."))
itemsDf.select(explode("attributes")).filter("attributes_exploded".contains("i"))

# TypeError: 'Column' object is not callable
# (explode is not a method of Column; it has to be called as explode(col("attributes")))
itemsDf.select(col("attributes").explode().alias("attributes_exploded")).filter(col("attributes_exploded").contains("i"))

Which of the following code blocks returns a single-column DataFrame showing the number of words in column supplier of DataFrame itemsDf?
>
- `itemsDf.split("supplier", " ").count()`
- `itemsDf.split("supplier", " ").size()`
- `itemsDf.select(word_count("supplier"))`
- `spark.select(size(split(col(supplier), " ")))`
- `itemsDf.select(size(split("supplier", " ")))`

In [12]:
# split() breaks supplier on spaces into an array of words; size() counts the array
# elements, giving the number of words per row in a single column.
itemsDf.select(size(split("supplier", " "))).show()

+----------------------------+
|size(split(supplier,  , -1))|
+----------------------------+
|                           3|
|                           1|
|                           3|
+----------------------------+



In [None]:
from pyspark.sql.types import *

# Transaction rows listed in schema order; None marks a missing value.
data = [
    (1, 3, 4, 25, 1, None, 1587915332),
    (2, 6, 7, 2, 2, None, 1586815312),
    (3, 3, None, 25, 3, None, 1585824821),
    (4, None, None, 3, 2, None, 1583244275),
    (5, None, None, None, 2, None, 1575285427),
    (6, 3, 2, 25, 2, None, 1572733275),
]

# The first six columns are nullable integers; transactionDate is a nullable long.
schema = StructType(
    [
        StructField(column, IntegerType(), True)
        for column in ("transactionId", "predError", "value", "storeId", "productId", "f")
    ]
    + [StructField("transactionDate", LongType(), True)]
)

transactionsDf = spark.createDataFrame(data=data, schema=schema)

In [None]:
# Select only the storeId and predError columns and collect the resulting rows.
transactionsDf.select('storeId', 'predError').collect()