# EXPLODE() + SELECT() + SPLIT()

Question

The code block displayed below contains an error. The code block should create DataFrame itemsAttributesDf which has columns itemId and attribute and lists every attribute from the attributes column in DataFrame itemsDf next to the itemId of the respective row in itemsDf. Find the error.

A sample of DataFrame itemsDf is below.
>

- `+——————+—————————————————————————————+———————————————————+`
- `|itemId|attributes                   |supplier           |`
- `+——————+—————————————————————————————+———————————————————+`
- `|1     |[blue, winter, cozy]         |Sports Company Inc.|`
- `|2     |[red, summer, fresh, cooling]|YetiX              |`
- `|3     |[green, summer, travel]      |Sports Company Inc.|`
- `+——————+—————————————————————————————+———————————————————+`

Code block:

`itemsAttributesDf = itemsDf.explode("attributes").alias("attribute").select("attribute", "itemId")`

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Start (or reuse) the Spark session for this notebook, configured to use
# the Kryo serializer.
spark = SparkSession.builder \
    .appName("example-explode-select-split") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

In [2]:
# Sample items as (itemId, attributes, supplier) tuples; attributes is a
# list of strings, which Spark turns into an array<string> column.
data = [
    ('1', ['blue', 'winter', 'cozy'], 'Sports Company Inc.'),
    ('2', ['red', 'summer', 'fresh', 'cooling'], 'YetiX'),
    ('3', ['green', 'summer', 'travel'], 'Sports Company Inc.'),
]

# Only column names are supplied, so the column types are inferred from the data.
itemsDf = spark.createDataFrame(
    data=data,
    schema=['itemId', 'attributes', 'supplier'],
)
itemsDf.printSchema()
itemsDf.show()

root
 |-- itemId: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- supplier: string (nullable = true)

+------+--------------------+-------------------+
|itemId|          attributes|           supplier|
+------+--------------------+-------------------+
|     1|[blue, winter, cozy]|Sports Company Inc.|
|     2|[red, summer, fre...|              YetiX|
|     3|[green, summer, t...|Sports Company Inc.|
+------+--------------------+-------------------+



In [3]:
# Add a `now` column holding the current timestamp to every row and display it.
itemsDf.withColumn("now", current_timestamp()).show()

+------+--------------------+-------------------+--------------------+
|itemId|          attributes|           supplier|                 now|
+------+--------------------+-------------------+--------------------+
|     1|[blue, winter, cozy]|Sports Company Inc.|2021-11-24 22:09:...|
|     2|[red, summer, fre...|              YetiX|2021-11-24 22:09:...|
|     3|[green, summer, t...|Sports Company Inc.|2021-11-24 22:09:...|
+------+--------------------+-------------------+--------------------+



In [4]:
# selectExpr() takes SQL expression strings, so explode() can be written inline
# to produce one output row per element of the attributes array.
# The commented-out lines are alternative attempts kept for comparison
# (presumably variants that fail -- run them individually to confirm).
#itemsDf.selectExpr("itemId", "split(attributes) as attributes").show()
itemsDf.selectExpr("itemId", "explode(attributes) as attributes").show()
#itemsDf.select("itemId", "explode(attributes) as attributes").show()
#itemsDf.selectExpr("itemId", "splitArray(attributes) as attributes").show()

+------+----------+
|itemId|attributes|
+------+----------+
|     1|      blue|
|     1|    winter|
|     1|      cozy|
|     2|       red|
|     2|    summer|
|     2|     fresh|
|     2|   cooling|
|     3|     green|
|     3|    summer|
|     3|    travel|
+------+----------+



In [5]:
# The erroneous code block from the question, run as-is to reproduce the failure:
# explode is called as a DataFrame method, which does not exist.
# AttributeError: 'DataFrame' object has no attribute 'explode'
itemsAttributesDf = itemsDf.explode("attributes").alias("attribute").select("attribute", "itemId")

AttributeError: 'DataFrame' object has no attribute 'explode'

In [6]:
# Corrected version: explode() is used as a column function inside select()
# and aliased there; itemId is repeated for every exploded attribute.
itemsAttributesDf = itemsDf.select(
    explode("attributes").alias("attribute"),
    "itemId",
)
itemsAttributesDf.printSchema()
itemsAttributesDf.show()

root
 |-- attribute: string (nullable = true)
 |-- itemId: string (nullable = true)

+---------+------+
|attribute|itemId|
+---------+------+
|     blue|     1|
|   winter|     1|
|     cozy|     1|
|      red|     2|
|   summer|     2|
|    fresh|     2|
|  cooling|     2|
|    green|     3|
|   summer|     3|
|   travel|     3|
+---------+------+



In [7]:
# Show itemsDf again: the select/explode calls above returned new DataFrames
# and left the original one as it was.
itemsDf.show()

+------+--------------------+-------------------+
|itemId|          attributes|           supplier|
+------+--------------------+-------------------+
|     1|[blue, winter, cozy]|Sports Company Inc.|
|     2|[red, summer, fre...|              YetiX|
|     3|[green, summer, t...|Sports Company Inc.|
+------+--------------------+-------------------+



In [8]:
# `~` on a Column is a logical NOT, so it cannot be used to exclude columns
# from a select(); Spark rejects it because `attributes` is not boolean.
# AnalysisException: cannot resolve '(NOT `attributes`)' due to data type mismatch: argument 1 requires boolean type, however, '`attributes`' is of array<string> type.
itemsDf.select(~col('attributes'), ~col('supplier'))

AnalysisException: cannot resolve '(NOT attributes)' due to data type mismatch: argument 1 requires boolean type, however, 'attributes' is of array<string> type.;
'Project [unresolvedalias(NOT attributes#1, Some(org.apache.spark.sql.Column$$Lambda$2628/947079569@1adf055d)), unresolvedalias(NOT supplier#2, Some(org.apache.spark.sql.Column$$Lambda$2628/947079569@1adf055d))]
+- LogicalRDD [itemId#0, attributes#1, supplier#2], false


In [9]:
# The code block is intended to return all columns of DataFrame except for columns attributes and supplier
# drop() takes the column names to remove; only itemId remains in the result.
itemsDf.drop('attributes','supplier').show()

+------+
|itemId|
+------+
|     1|
|     2|
|     3|
+------+



In [10]:
# Flatten the attributes arrays into one row per attribute, then keep only
# the attributes whose text contains the letter "i".
(
    itemsDf
    .select(explode("attributes").alias("attributes_exploded"))
    .filter(col("attributes_exploded").contains("i"))
    .show()
)

+-------------------+
|attributes_exploded|
+-------------------+
|             winter|
|            cooling|
+-------------------+



In [None]:
# Failing variants of the explode-then-filter expression from the cell above.
# Each variant is wrapped in a lambda and run on its own, and its error is
# printed, so that the first failure no longer prevents the remaining
# variants from being demonstrated.
failing_variants = [
    # NameError: name 'attributes_exploded' is not defined
    lambda: itemsDf.select(explode("attributes").alias("attributes_exploded")).filter(attributes_exploded.contains("i")),
    # AttributeError: 'DataFrame' object has no attribute 'explode'
    lambda: itemsDf.explode(attributes).alias("attributes_exploded").filter(col("attributes_exploded").contains("i")),
    # AttributeError: 'str' object has no attribute 'contains'
    lambda: itemsDf.select(explode("attributes")).filter("attributes_exploded".contains("i")),
    # TypeError: 'Column' object is not callable
    lambda: itemsDf.select(col("attributes").explode().alias("attributes_exploded")).filter(col("attributes_exploded").contains("i")),
]

for variant in failing_variants:
    try:
        variant()
    except Exception as err:
        # Display the error type and message instead of aborting the cell.
        print(type(err).__name__, err, sep=": ")
    else:
        print("no error raised")

Which of the following code blocks returns a single-column DataFrame showing the number of words in column supplier of DataFrame itemsDf?
>
- `itemsDf.split("supplier", " ").count()`
- `itemsDf.split("supplier", " ").size()`
- `itemsDf.select(word_count("supplier"))`
- `spark.select(size(split(col(supplier), " ")))`
- `itemsDf.select(size(split("supplier", " ")))`

In [11]:
# split() breaks each supplier string on spaces into an array of words and
# size() counts the elements of that array, giving the number of words.
itemsDf.select(size(split("supplier", " "))).show()

+----------------------------+
|size(split(supplier,  , -1))|
+----------------------------+
|                           3|
|                           1|
|                           3|
+----------------------------+



In [12]:
from pyspark.sql.types import *

# Transactions sample: one tuple per row, in the column order of `schema` below.
data = [
    (1, 3, 4, 25, 1, None, 1587915332),
    (2, 6, 7, 2, 2, None, 1586815312),
    (3, 3, None, 25, 3, None, 1585824821),
    (4, None, None, 3, 2, None, 1583244275),
    (5, None, None, None, 2, None, 1575285427),
    (6, 3, 2, 25, 2, None, 1572733275),
]

# Every column is a nullable integer except transactionDate, which is a long.
schema = StructType(
    [
        StructField(name, IntegerType(), True)
        for name in ('transactionId', 'predError', 'value', 'storeId', 'productId', 'f')
    ]
    + [StructField('transactionDate', LongType(), True)]
)

transactionsDf = spark.createDataFrame(data=data, schema=schema)

In [13]:
# collect() returns the selected columns as a Python list of Row objects;
# missing values come back as None.
transactionsDf.select('storeId', 'predError').collect()

[Row(storeId=25, predError=3),
 Row(storeId=2, predError=6),
 Row(storeId=25, predError=3),
 Row(storeId=3, predError=None),
 Row(storeId=None, predError=None),
 Row(storeId=25, predError=3)]

The code block shown below should return a DataFrame with columns transactionId, predError, value, and f from DataFrame transactionsDf. Choose the answer that correctly fills the blanks in the code block to accomplish this.
>
- `transactionsDf.__1__(__2__)`
>
- `1. filter 2. "transactionId", "predError", "value", "f"`
- `1. select 2. "transactionId, predError, value, f"`
- `1. select 2. ["transactionId", "predError", "value", "f"]`
- `1. where 2. col("transactionId"), col("predError"), col("value"), col("f")`
- `1. select 2. col(["transactionId", "predError", "value", "f"])`

In [14]:
# select() accepts the column names as a single list as well as separate arguments.
transactionsDf.select(["transactionId", "predError", "value", "f"]).show()

+-------------+---------+-----+----+
|transactionId|predError|value|   f|
+-------------+---------+-----+----+
|            1|        3|    4|null|
|            2|        6|    7|null|
|            3|        3| null|null|
|            4|     null| null|null|
|            5|     null| null|null|
|            6|        3|    2|null|
+-------------+---------+-----+----+



The code block shown below should return a single-column DataFrame with a column named consonant_ct that, for each row, shows the number of consonants in column itemName of DataFrame itemsDf. Choose the answer that correctly fills the blanks in the code block to accomplish this.
>
- `itemsDf.select(__1__(__2__(__3__(__4__), "a|e|i|o|u|s", "")).__5__("consonant_ct"))`
>
- `1. length 2. regexp_extract 3. upper 4. col("itemName") 5. as`
- `1. size 2. regexp_replace 3. lower 4. "itemName" 5. alias`
- `1. lower 2. regexp_replace 3. length 4. "itemName" 5. alias`
- `1. length 2. regexp_replace 3. lower 4. col("itemName") 5. alias`
- `1. size 2. regexp_extract 3. lower 4. col("itemName") 5. alias`

In [15]:
# Redefine itemsDf with an additional itemName column for the consonant-count question.
data = [
    (1, 'Thick Coat for Walking in the Snow', ['blue', 'winter', 'cozy'], 'Sports Company Inc.'),
    (2, 'Elegant Outdoors Summer Dress', ['red', 'summer'], 'YetiX'),
    (3, 'Outdoors Backpack', ['green', 'summer'], 'Sports Company Inc.'),
]

columns = ["itemId", "itemName", "attributes", "supplier"]

itemsDf = spark.createDataFrame(data=data, schema=columns)

itemsDf.printSchema()
# truncate=False prints the full itemName values instead of shortening them.
itemsDf.show(truncate=False)

root
 |-- itemId: long (nullable = true)
 |-- itemName: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- supplier: string (nullable = true)

+------+----------------------------------+--------------------+-------------------+
|itemId|itemName                          |attributes          |supplier           |
+------+----------------------------------+--------------------+-------------------+
|1     |Thick Coat for Walking in the Snow|[blue, winter, cozy]|Sports Company Inc.|
|2     |Elegant Outdoors Summer Dress     |[red, summer]       |YetiX              |
|3     |Outdoors Backpack                 |[green, summer]     |Sports Company Inc.|
+------+----------------------------------+--------------------+-------------------+



In [16]:
# Wrong answer option (1. lower 2. regexp_replace 3. length), run to show its result:
# length() is applied first, so the pattern is matched against the length
# rendered as text and removes nothing; consonant_ct ends up being the full
# length of itemName (34, 29, 17) rather than a consonant count.
itemsDf.select(lower(regexp_replace(length("itemName"), "a|e|i|o|u|s", "")) \
       .alias("consonant_ct")) \
       .show()

+------------+
|consonant_ct|
+------------+
|          34|
|          29|
|          17|
+------------+



In [17]:
# Correct option: lowercase the name, delete every character matched by the
# pattern, then measure the length of what is left.
(
    itemsDf
    .select(
        length(regexp_replace(lower(col("itemName")), "a|e|i|o|u|s", ""))
        .alias("consonant_ct")
    )
    .show()
)

+------------+
|consonant_ct|
+------------+
|          24|
|          15|
|          10|
+------------+



In [None]:
# Wrong answer options for the consonant_ct question. Each option is run on
# its own and its error is printed, so that every failure can be observed.

# `.as(...)` is not valid Python because `as` is a reserved keyword; written
# directly, it would stop the whole cell from being parsed. The option is
# therefore kept as source text and handed to compile() to show the SyntaxError.
as_option_source = 'itemsDf.select(length(regexp_extract(upper(col("itemName")), "a|e|i|o|u|s", "")).as("consonant_ct"))'

wrong_options = [
    # Py4JError: An error occurred while calling z:org.apache.spark.sql.functions.regexp_extract
    lambda: itemsDf.select(size(regexp_extract(lower(col("itemName")), "a|e|i|o|u|s", "")).alias("consonant_ct")),
    # SyntaxError: invalid syntax "as"
    lambda: compile(as_option_source, "<wrong option>", "eval"),
    # AnalysisException: cannot resolve 'size(regexp_replace(lower(`itemName`), 'a|e|i|o|u|s', ''))' due to data type mismatch
    lambda: itemsDf.select(size(regexp_replace(lower("itemName"), "a|e|i|o|u|s", "")).alias("consonant_ct")),
]

for option in wrong_options:
    try:
        option()
    except Exception as err:
        # Display the error type and message instead of aborting the cell.
        print(type(err).__name__, err, sep=": ")
    else:
        print("no error raised")

In [18]:
# Default show() shortens long values, as seen in the itemName column.
itemsDf.show()

+------+--------------------+--------------------+-------------------+
|itemId|            itemName|          attributes|           supplier|
+------+--------------------+--------------------+-------------------+
|     1|Thick Coat for Wa...|[blue, winter, cozy]|Sports Company Inc.|
|     2|Elegant Outdoors ...|       [red, summer]|              YetiX|
|     3|   Outdoors Backpack|     [green, summer]|Sports Company Inc.|
+------+--------------------+--------------------+-------------------+



In [19]:
# Project a single column by name.
itemsDf.select("itemId").show()

+------+
|itemId|
+------+
|     1|
|     2|
|     3|
+------+



In [20]:
# SQL CASE expression inside selectExpr(): 20% of itemId for ids below 3, otherwise 0.
(
    itemsDf
    .selectExpr(
        "itemId",
        "case when (itemId < 3) then itemId * 0.20 else 0 end as increment",
    )
    .show()
)

+------+---------+
|itemId|increment|
+------+---------+
|     1|     0.20|
|     2|     0.40|
|     3|     0.00|
+------+---------+



Given the following Data Frame definition.
>
- `df = spark.range(500).toDF("number")`
>
Choose the incorrect expression
>
- `df.select("number" + 10)`
- `df.select(df["number"] + 10)`
- `df.select(expr("number + 10"))`
- `df.select(col("number") + 10)`

In [21]:
# Single-column DataFrame built from spark.range(500), with the column named "number".
df = spark.range(500).toDF('number')

In [22]:
# Valid: df["number"] is a Column, so `+ 10` builds a column expression.
df.select(df["number"] + 10)


DataFrame[(number + 10): bigint]

In [23]:
# Valid: expr() parses the SQL string into the same column expression.
df.select(expr("number + 10"))


DataFrame[(number + 10): bigint]

In [24]:
# Valid: col("number") is a Column, so `+ 10` builds a column expression.
df.select(col("number") + 10)

DataFrame[(number + 10): bigint]

In [25]:
# The incorrect expression: "number" + 10 is evaluated by Python as str + int
# before Spark ever sees it.
# TypeError: can only concatenate str (not "int") to str
df.select("number" + 10)

TypeError: can only concatenate str (not "int") to str

In [26]:
 # SQL if(condition, a, b) inside selectExpr(); show() prints only the first 20 rows.
 df.selectExpr("number", "if(number < 20, number * 0.20 , 0) as increment").show()

+------+---------+
|number|increment|
+------+---------+
|     0|     0.00|
|     1|     0.20|
|     2|     0.40|
|     3|     0.60|
|     4|     0.80|
|     5|     1.00|
|     6|     1.20|
|     7|     1.40|
|     8|     1.60|
|     9|     1.80|
|    10|     2.00|
|    11|     2.20|
|    12|     2.40|
|    13|     2.60|
|    14|     2.80|
|    15|     3.00|
|    16|     3.20|
|    17|     3.40|
|    18|     3.60|
|    19|     3.80|
+------+---------+
only showing top 20 rows



Which of the following code blocks returns a DataFrame with a new column salary_increment and all previously existing columns?
>
- `df.withColumn("salary_increment", "salary * 0.15")`
- `df.select("*", expr("salary * 0.15").alias("salary_increment"))`
- `df.selectExpr("*", expr("salary * 0.15").alias("salary_increment"))`
- `All of the above`

In [27]:
# Employee sample used for the salary_increment question.
data = [
    ('James', '', 'Smith', '1991-04-01', 'M', 3000),
    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1),
]

columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

df = spark.createDataFrame(data=data, schema=columns)

In [28]:
# Correct option: "*" keeps every existing column and the aliased expr() adds salary_increment.
df.select("*", expr("salary * 0.15").alias("salary_increment")).show()


+---------+----------+--------+----------+------+------+----------------+
|firstname|middlename|lastname|       dob|gender|salary|salary_increment|
+---------+----------+--------+----------+------+------+----------------+
|    James|          |   Smith|1991-04-01|     M|  3000|          450.00|
|  Michael|      Rose|        |2000-05-19|     M|  4000|          600.00|
|   Robert|          |Williams|1978-09-05|     M|  4000|          600.00|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|          600.00|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|           -0.15|
+---------+----------+--------+----------+------+------+----------------+



In [38]:
# Without an alias the new column is named after the expression itself.
df.selectExpr(
    "*",
    "salary * 0.15",
).show()

# withColumn() needs a Column, hence the expr() wrapper around the SQL string.
df.withColumn(
    "salary_increment",
    expr("salary * 0.15"),
).show()

# selectExpr() takes the alias as part of the SQL string.
df.selectExpr(
    "*",
    "salary * 0.15 as salary_increment",
).show()

# Passing the bare string to withColumn() fails with "col should be Column":
# df.withColumn("salary_increment", "salary * 0.15").show()

+---------+----------+--------+----------+------+------+---------------+
|firstname|middlename|lastname|       dob|gender|salary|(salary * 0.15)|
+---------+----------+--------+----------+------+------+---------------+
|    James|          |   Smith|1991-04-01|     M|  3000|         450.00|
|  Michael|      Rose|        |2000-05-19|     M|  4000|         600.00|
|   Robert|          |Williams|1978-09-05|     M|  4000|         600.00|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|         600.00|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|          -0.15|
+---------+----------+--------+----------+------+------+---------------+

+---------+----------+--------+----------+------+------+----------------+
|firstname|middlename|lastname|       dob|gender|salary|salary_increment|
+---------+----------+--------+----------+------+------+----------------+
|    James|          |   Smith|1991-04-01|     M|  3000|          450.00|
|  Michael|      Rose|        |2000-05-19|    

In [None]:
# Wrong answer options for the salary_increment question. Each option is run
# on its own and its error is printed, so the second option is demonstrated
# too instead of being skipped when the first one raises.
wrong_options = [
    # AssertionError: col should be Column
    lambda: df.withColumn("salary_increment", "salary * 0.15"),
    # TypeError: Column is not iterable
    lambda: df.selectExpr("*", expr("salary * 0.15").alias("salary_increment")),
]

for option in wrong_options:
    try:
        option()
    except Exception as err:
        # Display the error type and message instead of aborting the cell.
        print(type(err).__name__, err, sep=": ")
    else:
        print("no error raised")

In [None]:

# Add a `now` column with the current timestamp. current_timestamp() is used
# (as in the earlier itemsDf cell) because pyspark.sql.functions only provides
# now() from PySpark 3.5 on; on older versions the bare name raises NameError.
df.withColumn("now", current_timestamp())

In [None]:
# Small people sample; note that age and salary are given as strings.
data_list = [
    ("Ravi", "28", "3200"),
    ("Abdul", "23", "4800"),
    ("John", "32", "6500"),
    ("Rosy", "48", "8200"),
]
# No schema is passed to createDataFrame; toDF() assigns the column names afterwards.
df = spark.createDataFrame(data_list).toDF("name", "age", "salary")

In [None]:
# expr("salary") yields a Column, so the multiplication is a column expression.
# NOTE(review): salary holds strings here; presumably Spark casts them for the
# arithmetic -- confirm by running the cell.
df.select("name", expr("salary") * 0.20).show()

In [None]:
# Same computation with the multiplication written inside the SQL expression string.
df.select("name", expr("salary * 0.20")).show()

In [None]:
# Same computation using col() to obtain the Column.
df.select("name", col("salary") * 0.20).show()

In [None]:
# "salary" is a plain Python string here, so Python itself rejects str * float
# before anything is passed to Spark.
# TypeError: can't multiply sequence by non-int of type 'float'
df.select("name", "salary" * 0.20).show()

In [None]:
# Average salary, with the aggregate written as a SQL expression string.
df.selectExpr("avg(salary)").show()

In [None]:
# Same aggregate, with the SQL string wrapped in expr() and passed to select().
df.select(expr("avg(salary)")).show()

In [None]:
# Same aggregate using the avg() column function.
df.select(avg("salary")).show()

In [None]:
# col() looks up a column by name rather than parsing an expression, and df
# has no column literally called "avg(salary)".
# AnalysisException: cannot resolve '`avg(salary)`' given input columns: [age, name, salary]
df.select(col("avg(salary)")).show()

In [None]:
# Print the column names and types of df.
df.printSchema()