# FILTER() + WHERE() + ISIN() + CONTAINS() + WHEN()

In [48]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Session shared by every cell below; getOrCreate() hands back an already
# running session when there is one instead of starting a second one.
spark = SparkSession.builder.appName("example-filter-isin").getOrCreate()

In [37]:

# Small department table of (dept_name, dept_id) pairs; several rows share id 40.
deptColumns = ["dept_name", "dept_id"]
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
    ("X", 40),
    ("Y", 40),
    ("Z", 40),
]

deptDF = spark.createDataFrame(dept, deptColumns)
deptDF.show(truncate=False)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
|X        |40     |
|Y        |40     |
|Z        |40     |
+---------+-------+



In [38]:
# Deliberate failure: a string passed to where() is parsed as a Spark SQL
# expression, and col() is a Python helper, not a SQL function, hence the
# "Undefined function: 'col'" error below. A plain SQL condition such as
# "dept_id >= 30" is the string form that where() accepts.
deptDF.where("col(dept_id) >= 30")

AnalysisException: Undefined function: 'col'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 0

In [None]:
# Column-expression form of the condition: keep departments with dept_id >= 30.
deptDF.filter(deptDF["dept_id"] >= 30).show()

The code block displayed below contains an error. The code block should return a DataFrame where all entries in column supplier contain the letter combination et in this order. Find the error.
>
Code block:
>
- `itemsDf.filter(column('supplier').isin('et'))`

In [39]:
# Three catalogue items; the supplier column is what the isin()/contains()
# examples below filter on.
columns = ["itemId", "itemName", "supplier"]
data = [
    (1, "Thick Coat for Walking in the Snow", "Sports Company Inc."),
    (2, "Elegant Outdoors Summer Dress", "YetiX"),
    (3, "Outdoors Backpack", "Sports Company Inc."),
]

itemsDf = spark.createDataFrame(data, columns)

itemsDf.printSchema()
itemsDf.show()

root
 |-- itemId: long (nullable = true)
 |-- itemName: string (nullable = true)
 |-- supplier: string (nullable = true)

+------+--------------------+-------------------+
|itemId|            itemName|           supplier|
+------+--------------------+-------------------+
|     1|Thick Coat for Wa...|Sports Company Inc.|
|     2|Elegant Outdoors ...|              YetiX|
|     3|   Outdoors Backpack|Sports Company Inc.|
+------+--------------------+-------------------+



In [40]:
# isin() is an exact-match test: only rows whose supplier equals the given value pass.
itemsDf.filter(itemsDf["supplier"].isin("Sports Company Inc.")).show()

+------+--------------------+-------------------+
|itemId|            itemName|           supplier|
+------+--------------------+-------------------+
|     1|Thick Coat for Wa...|Sports Company Inc.|
|     3|   Outdoors Backpack|Sports Company Inc.|
+------+--------------------+-------------------+



In [41]:
# contains() is a substring test: keep rows whose supplier includes "Company".
itemsDf.where(itemsDf.supplier.contains("Company")).show()

+------+--------------------+-------------------+
|itemId|            itemName|           supplier|
+------+--------------------+-------------------+
|     1|Thick Coat for Wa...|Sports Company Inc.|
|     3|   Outdoors Backpack|Sports Company Inc.|
+------+--------------------+-------------------+



Which of the following code blocks returns a new DataFrame with only columns predError and value of every second row of DataFrame transactionsDf?
>
Entire DataFrame transactionsDf:
>
```
+-------------+---------+-----+-------+---------+----+
|transactionId|predError|value|storeId|productId|   f|
+-------------+---------+-----+-------+---------+----+
|            1|        3|    4|     25|        1|null|
|            2|        6|    7|      2|        2|null|
|            3|        3| null|     25|        3|null|
|            4|     null| null|      3|        2|null|
|            5|     null| null|   null|        2|null|
|            6|        3|    2|     25|        2|null|
+-------------+---------+-----+-------+---------+----+
```
>
- `transactionsDf.filter(col("transactionId").isin([3,4,6])).select([predError, value])`
- `transactionsDf.select(col("transactionId").isin([3,4,6]), "predError", "value")`
- `transactionsDf.filter("transactionId" % 2 == 0).select("predError", "value")`
- `transactionsDf.filter(col("transactionId") % 2 == 0).select("predError", "value")`
- `transactionsDf.createOrReplaceTempView("transactionsDf")`
- `spark.sql("FROM transactionsDf SELECT predError, value WHERE transactionId % 2 = 0")`

In [42]:
from pyspark.sql.types import *

# Six sample transactions with nulls sprinkled in; every column is a nullable
# integer, so the schema is generated from the list of column names.
schema = StructType([
    StructField(field_name, IntegerType(), True)
    for field_name in (
        'transactionId',
        'predError',
        'value',
        'storeId',
        'productId',
        'f',
    )
])

data = [
    (1, 3, 4, 25, 1, None),
    (2, 6, 7, 2, 2, None),
    (3, 3, None, 25, 3, None),
    (4, None, None, 3, 2, None),
    (5, None, None, None, 2, None),
    (6, 3, 2, 25, 2, None),
]

transactionsDf = spark.createDataFrame(data, schema)
transactionsDf.show()

+-------------+---------+-----+-------+---------+----+
|transactionId|predError|value|storeId|productId|   f|
+-------------+---------+-----+-------+---------+----+
|            1|        3|    4|     25|        1|null|
|            2|        6|    7|      2|        2|null|
|            3|        3| null|     25|        3|null|
|            4|     null| null|      3|        2|null|
|            5|     null| null|   null|        2|null|
|            6|        3|    2|     25|        2|null|
+-------------+---------+-----+-------+---------+----+



In [43]:
# Inside select(), isin() does not filter anything: it becomes a boolean
# column shown next to predError and value, and all six rows are kept.
(
    transactionsDf
    .select(col("transactionId").isin(3, 4, 6), "predError", "value")
    .show()
)


+----------------------------+---------+-----+
|(transactionId IN (3, 4, 6))|predError|value|
+----------------------------+---------+-----+
|                       false|        3|    4|
|                       false|        6|    7|
|                        true|        3| null|
|                        true|     null| null|
|                       false|     null| null|
|                        true|        3|    2|
+----------------------------+---------+-----+



In [44]:
# Keep the rows with an even transactionId (2, 4, 6), then project the two columns.
(
    transactionsDf
    .where((col("transactionId") % 2) == 0)
    .select("predError", "value")
    .show()
)


+---------+-----+
|predError|value|
+---------+-----+
|        6|    7|
|     null| null|
|        3|    2|
+---------+-----+



In [45]:
# Spark SQL resolves table names against the session catalog, not against
# Python variables, so the DataFrame has to be registered as a temp view
# before the query can refer to "transactionsDf". Registering it here keeps
# the cell runnable on a fresh kernel; createOrReplaceTempView simply
# overwrites a view of the same name if one already exists.
transactionsDf.createOrReplaceTempView("transactionsDf")

# Same even-transactionId filter as the DataFrame-API cell above, written in SQL.
spark.sql("FROM transactionsDf SELECT predError, value WHERE transactionId % 2 = 0").show()

+---------+-----+
|predError|value|
+---------+-----+
|        6|    7|
|     null| null|
|        3|    2|
+---------+-----+



In [46]:
# Rejected answer options, left commented out so the notebook still runs
# end to end. Each is preceded by the error it raises.

# NameError: name 'predError' is not defined
# (predError and value are bare Python names here, not quoted column names)
# transactionsDf.filter(col("transactionId").isin([3,4,6])).select([predError, value])

# TypeError: not all arguments converted during string formatting
# ("transactionId" is a plain Python str, so % is string formatting rather
# than a modulo on a Column)
# transactionsDf.filter("transactionId" % 2 == 0).select("predError", "value")


In [49]:
# Label every transaction by its productId: 1 -> Grupo1, 2 -> Grupo2,
# 3 -> Grupo3, and anything that matches none of them -> Grupo4.
cond1, cond2, cond3 = (
    col("productId").isin(product_id) for product_id in (1, 2, 3)
)

(
    transactionsDf
    .withColumn(
        "productGroup",
        when(cond1, "Grupo1")
        .when(cond2, "Grupo2")
        .when(cond3, "Grupo3")
        .otherwise("Grupo4"),
    )
    .show()
)

+-------------+---------+-----+-------+---------+----+------------+
|transactionId|predError|value|storeId|productId|   f|productGroup|
+-------------+---------+-----+-------+---------+----+------------+
|            1|        3|    4|     25|        1|null|      Grupo1|
|            2|        6|    7|      2|        2|null|      Grupo2|
|            3|        3| null|     25|        3|null|      Grupo3|
|            4|     null| null|      3|        2|null|      Grupo2|
|            5|     null| null|   null|        2|null|      Grupo2|
|            6|        3|    2|     25|        2|null|      Grupo2|
+-------------+---------+-----+-------+---------+----+------------+



In [50]:
# (id, Idade) pairs used to demonstrate bucketing with isin() + when().
df = spark.createDataFrame(
    [(1, 13), (2, 23), (3, 10), (4, 17), (5, 18), (6, 21), (7, 9), (8, 26), (9, 28)],
    ["id", "Idade"],
)

# Each condition matches one explicit set of ages; ages outside all three
# sets fall through to the otherwise() branch.
cond1 = col("Idade").isin([10, 11, 12])
cond2 = col("Idade").isin([13, 14, 15])
cond3 = col("Idade").isin([16, 17, 18])

(
    df
    .withColumn(
        "Idade_Grupo",
        when(cond1, "Grupo1")
        .when(cond2, "Grupo2")
        .when(cond3, "Grupo3")
        .otherwise("Grupo4"),
    )
    .show()
)

+---+-----+-----------+
| id|Idade|Idade_Grupo|
+---+-----+-----------+
|  1|   13|     Grupo2|
|  2|   23|     Grupo4|
|  3|   10|     Grupo1|
|  4|   17|     Grupo3|
|  5|   18|     Grupo3|
|  6|   21|     Grupo4|
|  7|    9|     Grupo4|
|  8|   26|     Grupo4|
|  9|   28|     Grupo4|
+---+-----+-----------+



Which of the following code blocks returns a one-column DataFrame of all values in column supplier of DataFrame itemsDf that do not contain the letter X? In the DataFrame, every value should only be listed once.
>
Sample of DataFrame itemsDf:
>
```
+------+--------------------+--------------------+-------------------+
|itemId|            itemName|          attributes|           supplier|
+------+--------------------+--------------------+-------------------+
|     1|Thick Coat for Wa...|[blue, winter, cozy]|Sports Company Inc.|
|     2|Elegant Outdoors ...|[red, summer, fre...|              YetiX|
|     3|   Outdoors Backpack|[green, summer, t...|Sports Company Inc.|
+------+--------------------+--------------------+-------------------+
```

In [51]:
# Rebuild itemsDf with an extra array-typed "attributes" column, replacing the
# three-column version created earlier in the notebook.
columns = ["itemId", "itemName", "attributes", "supplier"]
data = [
    (1, "Thick Coat for Walking in the Snow", ["blue", "winter", "cozy"], "Sports Company Inc."),
    (2, "Elegant Outdoors Summer Dress", ["red", "summer"], "YetiX"),
    (3, "Outdoors Backpack", ["green", "summer"], "Sports Company Inc."),
]

itemsDf = spark.createDataFrame(data, columns)

itemsDf.printSchema()
itemsDf.show()

root
 |-- itemId: long (nullable = true)
 |-- itemName: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- supplier: string (nullable = true)

+------+--------------------+--------------------+-------------------+
|itemId|            itemName|          attributes|           supplier|
+------+--------------------+--------------------+-------------------+
|     1|Thick Coat for Wa...|[blue, winter, cozy]|Sports Company Inc.|
|     2|Elegant Outdoors ...|       [red, summer]|              YetiX|
|     3|   Outdoors Backpack|     [green, summer]|Sports Company Inc.|
+------+--------------------+--------------------+-------------------+



In [54]:
# Wrong approach for the question above: select() turns the negated
# contains() into a boolean column, so the result holds true/false values
# rather than supplier names.
(
    itemsDf
    .select(~col("supplier").contains("X"))
    .distinct()
    .show()
)


+---------------------------+
|(NOT contains(supplier, X))|
+---------------------------+
|                       true|
|                      false|
+---------------------------+



In [56]:
# Correct answer: drop the rows whose supplier contains "X", keep only the
# supplier column, and de-duplicate so each supplier is listed once.
(
    itemsDf
    .where(~col("supplier").contains("X"))
    .select("supplier")
    .distinct()
    .show()
)


+-------------------+
|           supplier|
+-------------------+
|Sports Company Inc.|
+-------------------+



In [None]:
# Rejected answer options. Every statement below fails, so they are kept as
# comments, the same way the rejected options for transactionsDf are kept
# earlier in this notebook. Left as live code, the `!` line is a SyntaxError
# that stops the whole cell from being parsed, so none of the other errors
# would ever be shown and a full re-run of the notebook would stop here.

# NameError: name 'supplier' is not defined
# itemsDf.filter(col(supplier).not_contains('X')).select(supplier).distinct().show()

# ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
# itemsDf.filter(not(col('supplier').contains('X'))).select('supplier').unique().show()

# SyntaxError: invalid syntax !
# itemsDf.filter(!col('supplier').contains('X')).select(col('supplier')).unique().show()