# FILTER() + WHERE() + ISIN() + CONTAIN() + WHEN()

In [6]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = (
    SparkSession.builder
    .appName("example-filter-isin")
    .getOrCreate()
)

In [4]:

dept = [("Finance",10), ("Marketing",20), ("Sales",30), ("IT",40), ("X",40), ("Y",40), ("Z",40)]
deptColumns = ["dept_name", "dept_id"]

deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.show(truncate=False)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
|X        |40     |
|Y        |40     |
|Z        |40     |
+---------+-------+



In [5]:
#  The argument to the where method cannot be a string.
deptDF.where("col(dept_id) >= 30")

AnalysisException: Undefined function: 'col'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 0

In [6]:
deptDF.where(col('dept_id') >= 30).show()

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|    Sales|     30|
|       IT|     40|
|        X|     40|
|        Y|     40|
|        Z|     40|
+---------+-------+



The code block displayed below contains an error. The code block should return a DataFrame where all entries in column supplier contain the letter combination et in this order. Find the error.
>
Code block:
>
- `itemsDf.filter(column(‘supplier’).isin(‘et’))`

In [7]:
data = [(1, 'Thick Coat for Walking in the Snow', 'Sports Company Inc.'),
        (2, 'Elegant Outdoors Summer Dress', 'YetiX'),
        (3, 'Outdoors Backpack', 'Sports Company Inc.')]

columns = ["itemId", "itemName", "supplier"]

itemsDf = spark.createDataFrame(data=data, schema=columns)

itemsDf.printSchema()
itemsDf.show()

root
 |-- itemId: long (nullable = true)
 |-- itemName: string (nullable = true)
 |-- supplier: string (nullable = true)

+------+--------------------+-------------------+
|itemId|            itemName|           supplier|
+------+--------------------+-------------------+
|     1|Thick Coat for Wa...|Sports Company Inc.|
|     2|Elegant Outdoors ...|              YetiX|
|     3|   Outdoors Backpack|Sports Company Inc.|
+------+--------------------+-------------------+



In [8]:
itemsDf[itemsDf.supplier.isin("Sports Company Inc.")].show()

+------+--------------------+-------------------+
|itemId|            itemName|           supplier|
+------+--------------------+-------------------+
|     1|Thick Coat for Wa...|Sports Company Inc.|
|     3|   Outdoors Backpack|Sports Company Inc.|
+------+--------------------+-------------------+



In [9]:
itemsDf.filter(col("supplier").contains("Company")).show()

+------+--------------------+-------------------+
|itemId|            itemName|           supplier|
+------+--------------------+-------------------+
|     1|Thick Coat for Wa...|Sports Company Inc.|
|     3|   Outdoors Backpack|Sports Company Inc.|
+------+--------------------+-------------------+



Which of the following code blocks returns a new DataFrame with only columns predError and values of every second row of DataFrame transactionsDf?
>
Entire DataFrame transactionsDf:
>
- `+————-+———+—–+——-+———+—-+`
- `|transactionId|predError|value|storeId|productId| f|`
- `+————-+———+—–+——-+———+—-+`
- `| 1| 3| 4| 25| 1|null|`
- `| 2| 6| 7| 2| 2|null|`
- `| 3| 3| null| 25| 3|null|`
- `| 4| null| null| 3| 2|null|`
- `| 5| null| null| null| 2|null|`
- `| 6| 3| 2| 25| 2|null|`
- `+————-+———+—–+——-+———+—-+`
>
- `transactionsDf.filter(col("transactionId").isin([3,4,6])).select([predError, value])`
- `transactionsDf.select(col("transactionId").isin([3,4,6]), "predError", "value")`
- `transactionsDf.filter("transactionId" % 2 == 0).select("predError", "value")`
- `transactionsDf.filter(col("transactionId") % 2 == 0).select("predError", "value")`
- `transactionsDf.createOrReplaceTempView("transactionsDf")`
- `spark.sql("FROM transactionsDf SELECT predError, value WHERE transactionId % 2 = 0")`

In [2]:
from pyspark.sql.types import *

data = [(1, 3, 4, 25, 1, None),
        (2, 6, 7, 2, 2, None),
        (3, 3, None, 25, 3, None),
        (4, None, None, 3, 2, None),
        (5, None, None, None, 2, None),
        (6, 3, 2, 25, 2, None)]

schema = StructType([StructField('transactionId', IntegerType(), True),
                     StructField('predError', IntegerType(), True),
                     StructField('value', IntegerType(), True),
                     StructField('storeId', IntegerType(), True),
                     StructField('productId', IntegerType(), True),
                     StructField('f', IntegerType(), True)])

transactionsDf = spark.createDataFrame(data=data, schema=schema)
transactionsDf.show()

+-------------+---------+-----+-------+---------+----+
|transactionId|predError|value|storeId|productId|   f|
+-------------+---------+-----+-------+---------+----+
|            1|        3|    4|     25|        1|null|
|            2|        6|    7|      2|        2|null|
|            3|        3| null|     25|        3|null|
|            4|     null| null|      3|        2|null|
|            5|     null| null|   null|        2|null|
|            6|        3|    2|     25|        2|null|
+-------------+---------+-----+-------+---------+----+



In [11]:
transactionsDf.filter((col("productId")==3) | (col("productId")<1)).show()

+-------------+---------+-----+-------+---------+----+
|transactionId|predError|value|storeId|productId|   f|
+-------------+---------+-----+-------+---------+----+
|            3|        3| null|     25|        3|null|
+-------------+---------+-----+-------+---------+----+



In [12]:
# NameError: name 'productId' is not defined
transactionsDf.filter(productId==3 or productId<1)

# ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not'
transactionsDf.filter((col("productId")==3) or (col("productId")<1))

# Py4JError: An error occurred while calling o43.or. 
# Trace: py4j.Py4JException: Method or([class java.lang.Integer]) does not exist
transactionsDf.filter(col("productId")==3 | col("productId")<1)

# SyntaxError: invalid syntax "or"
transactionsDf.where("productId"=3).or("productId"<1))



SyntaxError: invalid syntax (<ipython-input-12-fd3de26aeb47>, line 12)

In [13]:
transactionsDf.select(col("transactionId").isin([3,4,6]), "predError", "value").show()


+----------------------------+---------+-----+
|(transactionId IN (3, 4, 6))|predError|value|
+----------------------------+---------+-----+
|                       false|        3|    4|
|                       false|        6|    7|
|                        true|        3| null|
|                        true|     null| null|
|                       false|     null| null|
|                        true|        3|    2|
+----------------------------+---------+-----+



In [14]:
transactionsDf.filter(col("transactionId") % 2 == 0).select("predError", "value").show()


+---------+-----+
|predError|value|
+---------+-----+
|        6|    7|
|     null| null|
|        3|    2|
+---------+-----+



In [15]:
spark.sql("FROM transactionsDf SELECT predError, value WHERE transactionId % 2 = 0").show()

AnalysisException: Table or view not found: transactionsDf; line 1 pos 5;
'Project ['predError, 'value]
+- 'Filter (('transactionId % 2) = 0)
   +- 'UnresolvedRelation [transactionsDf]


In [16]:
# NameError: name 'predError' is not defined
# transactionsDf.filter(col("transactionId").isin([3,4,6])).select([predError, value])

# TypeError: not all arguments converted during string formatting
# transactionsDf.filter("transactionId" % 2 == 0).select("predError", "value")


In [17]:
cond1 = col("productId").isin(1)
cond2 = col("productId").isin(2)
cond3 = col("productId").isin(3)

transactionsDf.withColumn("productGroup", when(cond1, "Grupo1")
								         .when(cond2, "Grupo2")
								         .when(cond3, "Grupo3")
								         .otherwise("Grupo4")) \
								  .show()

+-------------+---------+-----+-------+---------+----+------------+
|transactionId|predError|value|storeId|productId|   f|productGroup|
+-------------+---------+-----+-------+---------+----+------------+
|            1|        3|    4|     25|        1|null|      Grupo1|
|            2|        6|    7|      2|        2|null|      Grupo2|
|            3|        3| null|     25|        3|null|      Grupo3|
|            4|     null| null|      3|        2|null|      Grupo2|
|            5|     null| null|   null|        2|null|      Grupo2|
|            6|        3|    2|     25|        2|null|      Grupo2|
+-------------+---------+-----+-------+---------+----+------------+



In [18]:
df = spark.createDataFrame(
    [
        (1, 13), 
        (2, 23),
		(3, 10),
		(4, 17),
		(5, 18),
		(6, 21),
		(7, 9),
		(8, 26),
		(9, 28),
		
    ], ["id", "Idade"])

cond1 = col("Idade").isin(10, 11, 12)
cond2 = col("Idade").isin(13, 14, 15)
cond3 = col("Idade").isin(16, 17, 18)

df.withColumn("Idade_Grupo", when(cond1, "Grupo1")
                            .when(cond2, "Grupo2")
                            .when(cond3, "Grupo3")
                            .otherwise("Grupo4")) \
			 .show()    

+---+-----+-----------+
| id|Idade|Idade_Grupo|
+---+-----+-----------+
|  1|   13|     Grupo2|
|  2|   23|     Grupo4|
|  3|   10|     Grupo1|
|  4|   17|     Grupo3|
|  5|   18|     Grupo3|
|  6|   21|     Grupo4|
|  7|    9|     Grupo4|
|  8|   26|     Grupo4|
|  9|   28|     Grupo4|
+---+-----+-----------+



Which of the following code blocks returns a one-column DataFrame of all values in column supplier of DataFrame itemsDf that do not contain the letter X? In the DataFrame, every value should only be listed once.
>
Sample of DataFrame itemsDf:
>
- `+——+——————–+——————–+——————-+`
- `|itemId| itemName| attributes| supplier|`
- `+——+——————–+——————–+——————-+`
- `| 1|Thick Coat for Wa…|[blue, winter, cozy]|Sports Company Inc.|`
- `| 2|Elegant Outdoors …|[red, summer, fre…| YetiX|`
- `| 3| Outdoors Backpack|[green, summer, t…|Sports Company Inc.|`
- `+——+——————–+——————–+——————-+`

In [19]:
data = [(1, 'Thick Coat for Walking in the Snow', ['blue', 'winter', 'cozy'], 'Sports Company Inc.'),
        (2, 'Elegant Outdoors Summer Dress', ['red', 'summer'], 'YetiX'),
        (3, 'Outdoors Backpack', ['green', 'summer'], 'Sports Company Inc.')]

columns = ["itemId", "itemName", "attributes", "supplier"]

itemsDf = spark.createDataFrame(data=data, schema=columns)

itemsDf.printSchema()
itemsDf.show()

root
 |-- itemId: long (nullable = true)
 |-- itemName: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- supplier: string (nullable = true)

+------+--------------------+--------------------+-------------------+
|itemId|            itemName|          attributes|           supplier|
+------+--------------------+--------------------+-------------------+
|     1|Thick Coat for Wa...|[blue, winter, cozy]|Sports Company Inc.|
|     2|Elegant Outdoors ...|       [red, summer]|              YetiX|
|     3|   Outdoors Backpack|     [green, summer]|Sports Company Inc.|
+------+--------------------+--------------------+-------------------+



In [20]:
itemsDf.select(~col('supplier').contains('X')).distinct().show()


+---------------------------+
|(NOT contains(supplier, X))|
+---------------------------+
|                       true|
|                      false|
+---------------------------+



In [21]:
itemsDf.filter(~col('supplier').contains('X')).select('supplier').distinct().show()


+-------------------+
|           supplier|
+-------------------+
|Sports Company Inc.|
+-------------------+



In [22]:
# NameError: name 'supplier' is not defined
itemsDf.filter(col(supplier).not_contains('X')).select(supplier).distinct().show()

# ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
itemsDf.filter(not(col('supplier').contains('X'))).select('supplier').unique().show()

# SyntaxError: invalid syntax !
itemsDf.filter(!col('supplier').contains('X')).select(col('supplier')).unique().show()

SyntaxError: invalid syntax (<ipython-input-22-a02107b1abe3>, line 8)

In [23]:
transactionsDf.select(col("transactionId").isin([3,4,6]), "predError", "value").show()


+----------------------------+---------+-----+
|(transactionId IN (3, 4, 6))|predError|value|
+----------------------------+---------+-----+
|                       false|        3|    4|
|                       false|        6|    7|
|                        true|        3| null|
|                        true|     null| null|
|                       false|     null| null|
|                        true|        3|    2|
+----------------------------+---------+-----+



In [24]:
transactionsDf.filter(col("transactionId") % 2 == 0).select("predError", "value").show()


+---------+-----+
|predError|value|
+---------+-----+
|        6|    7|
|     null| null|
|        3|    2|
+---------+-----+



In [25]:
spark.sql("FROM transactionsDf SELECT predError, value WHERE transactionId % 2 = 0").show()

AnalysisException: Table or view not found: transactionsDf; line 1 pos 5;
'Project ['predError, 'value]
+- 'Filter (('transactionId % 2) = 0)
   +- 'UnresolvedRelation [transactionsDf]


In [26]:
# NameError: name 'predError' is not defined
# transactionsDf.filter(col("transactionId").isin([3,4,6])).select([predError, value])

# TypeError: not all arguments converted during string formatting
# transactionsDf.filter("transactionId" % 2 == 0).select("predError", "value")


In [27]:
cond1 = col("productId").isin(1)
cond2 = col("productId").isin(2)
cond3 = col("productId").isin(3)

transactionsDf.withColumn("productGroup", when(cond1, "Grupo1")
								         .when(cond2, "Grupo2")
								         .when(cond3, "Grupo3")
								         .otherwise("Grupo4")) \
								  .show()

+-------------+---------+-----+-------+---------+----+------------+
|transactionId|predError|value|storeId|productId|   f|productGroup|
+-------------+---------+-----+-------+---------+----+------------+
|            1|        3|    4|     25|        1|null|      Grupo1|
|            2|        6|    7|      2|        2|null|      Grupo2|
|            3|        3| null|     25|        3|null|      Grupo3|
|            4|     null| null|      3|        2|null|      Grupo2|
|            5|     null| null|   null|        2|null|      Grupo2|
|            6|        3|    2|     25|        2|null|      Grupo2|
+-------------+---------+-----+-------+---------+----+------------+



In [28]:
df = spark.createDataFrame(
    [
        (1, 13), 
        (2, 23),
		(3, 10),
		(4, 17),
		(5, 18),
		(6, 21),
		(7, 9),
		(8, 26),
		(9, 28),
		
    ], ["id", "Idade"])

cond1 = col("Idade").isin(10, 11, 12)
cond2 = col("Idade").isin(13, 14, 15)
cond3 = col("Idade").isin(16, 17, 18)

df.withColumn("Idade_Grupo", when(cond1, "Grupo1")
                            .when(cond2, "Grupo2")
                            .when(cond3, "Grupo3")
                            .otherwise("Grupo4")) \
			 .show()    

+---+-----+-----------+
| id|Idade|Idade_Grupo|
+---+-----+-----------+
|  1|   13|     Grupo2|
|  2|   23|     Grupo4|
|  3|   10|     Grupo1|
|  4|   17|     Grupo3|
|  5|   18|     Grupo3|
|  6|   21|     Grupo4|
|  7|    9|     Grupo4|
|  8|   26|     Grupo4|
|  9|   28|     Grupo4|
+---+-----+-----------+



Which of the following code blocks returns a one-column DataFrame of all values in column supplier of DataFrame itemsDf that do not contain the letter X? In the DataFrame, every value should only be listed once.
>
Sample of DataFrame itemsDf:
>
- `+——+——————–+——————–+——————-+`
- `|itemId| itemName| attributes| supplier|`
- `+——+——————–+——————–+——————-+`
- `| 1|Thick Coat for Wa…|[blue, winter, cozy]|Sports Company Inc.|`
- `| 2|Elegant Outdoors …|[red, summer, fre…| YetiX|`
- `| 3| Outdoors Backpack|[green, summer, t…|Sports Company Inc.|`
- `+——+——————–+——————–+——————-+`

In [29]:
data = [(1, 'Thick Coat for Walking in the Snow', ['blue', 'winter', 'cozy'], 'Sports Company Inc.'),
        (2, 'Elegant Outdoors Summer Dress', ['red', 'summer'], 'YetiX'),
        (3, 'Outdoors Backpack', ['green', 'summer'], 'Sports Company Inc.')]

columns = ["itemId", "itemName", "attributes", "supplier"]

itemsDf = spark.createDataFrame(data=data, schema=columns)

itemsDf.printSchema()
itemsDf.show()

root
 |-- itemId: long (nullable = true)
 |-- itemName: string (nullable = true)
 |-- attributes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- supplier: string (nullable = true)

+------+--------------------+--------------------+-------------------+
|itemId|            itemName|          attributes|           supplier|
+------+--------------------+--------------------+-------------------+
|     1|Thick Coat for Wa...|[blue, winter, cozy]|Sports Company Inc.|
|     2|Elegant Outdoors ...|       [red, summer]|              YetiX|
|     3|   Outdoors Backpack|     [green, summer]|Sports Company Inc.|
+------+--------------------+--------------------+-------------------+



In [30]:
itemsDf.select(~col('supplier').contains('X')).distinct().show()


+---------------------------+
|(NOT contains(supplier, X))|
+---------------------------+
|                       true|
|                      false|
+---------------------------+



In [31]:
itemsDf.filter(~col('supplier').contains('X')).select('supplier').distinct().show()


+-------------------+
|           supplier|
+-------------------+
|Sports Company Inc.|
+-------------------+



In [32]:
# NameError: name 'supplier' is not defined
itemsDf.filter(col(supplier).not_contains('X')).select(supplier).distinct().show()

# ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.
itemsDf.filter(not(col('supplier').contains('X'))).select('supplier').unique().show()

# SyntaxError: invalid syntax !
itemsDf.filter(!col('supplier').contains('X')).select(col('supplier')).unique().show()

SyntaxError: invalid syntax (<ipython-input-32-a02107b1abe3>, line 8)

Which of the following code blocks returns a single row from DataFrame transactionsDf?
>
- `transactionsDf.where(col("storeId").between(3,25))`
- `transactionsDf.filter((col("storeId")!=25) | (col("productId")==2))`
- `transactionsDf.filter(col("storeId")==25).select("predError","storeId").distinct()`
- `transactionsDf.select("productId", "storeId").where("storeId == 2 OR storeId != 25")`
- `transactionsDf.where(col("value").isNull()).select("productId", "storeId").distinct()`

In [39]:
transactionsDf.filter(col("storeId")==25).select("predError","storeId").distinct().show()

+---------+-------+
|predError|storeId|
+---------+-------+
|        3|     25|
+---------+-------+



In [34]:
transactionsDf.where(col("storeId").between(3,25)).show()


+-------------+---------+-----+-------+---------+----+
|transactionId|predError|value|storeId|productId|   f|
+-------------+---------+-----+-------+---------+----+
|            1|        3|    4|     25|        1|null|
|            3|        3| null|     25|        3|null|
|            4|     null| null|      3|        2|null|
|            6|        3|    2|     25|        2|null|
+-------------+---------+-----+-------+---------+----+



In [35]:
transactionsDf.filter((col("storeId")!=25) | (col("productId")==2)).show()


+-------------+---------+-----+-------+---------+----+
|transactionId|predError|value|storeId|productId|   f|
+-------------+---------+-----+-------+---------+----+
|            2|        6|    7|      2|        2|null|
|            4|     null| null|      3|        2|null|
|            5|     null| null|   null|        2|null|
|            6|        3|    2|     25|        2|null|
+-------------+---------+-----+-------+---------+----+



In [37]:
transactionsDf.select("productId", "storeId").where("storeId == 2 OR storeId != 25").show()


+---------+-------+
|productId|storeId|
+---------+-------+
|        2|      2|
|        2|      3|
+---------+-------+



In [38]:
transactionsDf.where(col("value").isNull()).select("productId", "storeId").distinct().show()

+---------+-------+
|productId|storeId|
+---------+-------+
|        2|      3|
|        3|     25|
|        2|   null|
+---------+-------+



The code block displayed below contains an error. The code block should count the number of rows that have a predError of either 3 or 6. Find the error.
>
Code block:
>
- `transactionsDf.filter(col(‘predError’).in([3, 6])).count()`

In [5]:
transactionsDf.filter(col('predError').isin([3, 6])).count()

4

Which expression is equivalent to the below expression.
>
- `df.where("salary > 5000")`
- `df.where(expr("salary > 5000"))`
- `df.filter("salary" > 5000)`
- `df.filter(col("salary") > 5000)`
- `df.filter(expr("salary > 5000"))`

In [7]:
simpleData = [("James","Sales","NY",90000,34,10000),
              ("Michael","Sales","NY",86000,56,20000),
              ("Robert","Sales","CA",81000,30,23000),
              ("Maria","Finance","CA",90000,24,23000),
              ("Raman","Finance","CA",99000,40,24000),
              ("Scott","Finance","NY",83000,36,19000),
              ("Jen","Finance","NY",79000,53,15000),
              ("Jeff","Marketing","CA",80000,25,18000),
              ("Kumar","Marketing","NY",91000,50,21000)]

schema = ["employee_name", "department", "state", "salary", "age", "bonus"]

df = spark.createDataFrame(data=simpleData, schema = schema)

df.printSchema()
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [9]:
df.where("salary > 5000").show()


+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [10]:
df.where(expr("salary > 5000")).show()


+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [12]:
df.filter(col("salary") > 5000).show()


+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [13]:
df.filter(expr("salary > 5000")).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [None]:
# TypeError: '>' not supported between instances of 'str' and 'int'
df.filter("salary" > 5000).show()


In [20]:
df.limit(3).where("salary > 4000").show()


+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
+-------------+----------+-----+------+---+-----+



In [21]:
df.where("salary > 4000").limit(3).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
+-------------+----------+-----+------+---+-----+



You are given a code as shown below to count the number of unique invoices.
df.select(“InvoiceNo”).distinct().agg(count(“InvoiceNo”))
>
Choose the logically equivalent code from the options.
>
- `df.select("count(distinct InvoiceNo)")`
- `df.selectExpr("count(distinct InvoiceNo)")`
- `df.select(countDistinct("InvoiceNo"))`
- `df.selectExpr("countDistinct(InvoiceNo)")`

In [23]:
df.select("department").distinct().agg(count("department")).show()

+-----------------+
|count(department)|
+-----------------+
|                3|
+-----------------+



In [25]:
df.selectExpr("count(distinct department)").show()


+--------------------------+
|count(DISTINCT department)|
+--------------------------+
|                         3|
+--------------------------+



In [26]:
df.select(countDistinct("department")).show()


+--------------------------+
|count(DISTINCT department)|
+--------------------------+
|                         3|
+--------------------------+



In [None]:
# AnalysisException: cannot resolve '`count(distinct department)`' given input columns: [age, bonus, department, employee_name, salary, state]
df.select("count(distinct department)").show()

# AnalysisException: Undefined function: 'countDistinct'. 
# This function is neither a registered temporary function nor a permanent function registered in the database 'default'.
df.selectExpr("countDistinct(department)").show()