# BETWEEN()

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Configure the application name first, then obtain the session from the
# builder (getOrCreate hands back an already-running session if one exists).
session_builder = SparkSession.builder.appName("example-between")
spark = session_builder.getOrCreate()

In [2]:
# Sample rows, one tuple per transaction, in the column order declared by
# `schema` below. None is stored as a SQL null.
data = [
    (1, 3, 4, 25, 1, None, 1587915332),
    (2, 6, 7, 2, 2, None, 1586815312),
    (3, 3, None, 25, 3, None, 1585824821),
    (4, None, None, 3, 2, None, 1583244275),
    (5, None, None, None, 2, None, 1575285427),
    (6, 3, 2, 25, 2, None, 1572733275),
]

# All columns are nullable. Every column is an integer except
# `transactionDate`, which is declared as a long.
integer_columns = ['transactionId', 'predError', 'value', 'storeId', 'productId', 'f']
schema = StructType(
    [StructField(column_name, IntegerType(), True) for column_name in integer_columns]
    + [StructField('transactionDate', LongType(), True)]
)

transactionsDf = spark.createDataFrame(data=data, schema=schema)

In [3]:
# Inspect the DataFrame: first the column names, types and nullability,
# then the rows themselves as a text table.
transactionsDf.printSchema()
transactionsDf.show()

root
 |-- transactionId: integer (nullable = true)
 |-- predError: integer (nullable = true)
 |-- value: integer (nullable = true)
 |-- storeId: integer (nullable = true)
 |-- productId: integer (nullable = true)
 |-- f: integer (nullable = true)
 |-- transactionDate: long (nullable = true)

+-------------+---------+-----+-------+---------+----+---------------+
|transactionId|predError|value|storeId|productId|   f|transactionDate|
+-------------+---------+-----+-------+---------+----+---------------+
|            1|        3|    4|     25|        1|null|     1587915332|
|            2|        6|    7|      2|        2|null|     1586815312|
|            3|        3| null|     25|        3|null|     1585824821|
|            4|     null| null|      3|        2|null|     1583244275|
|            5|     null| null|   null|        2|null|     1575285427|
|            6|        3|    2|     25|        2|null|     1572733275|
+-------------+---------+-----+-------+---------+----+---------------+

The code block shown below should return a column of boolean values that indicates, for each row in DataFrame `transactionsDf`, whether the value in column `storeId` is greater than or equal to 20 and smaller than or equal to 30 and the value in column `productId` is 2. Choose the answer that correctly fills the blanks in the code block to accomplish this.
> `transactionsDf.__1__((__2__.__3__) __4__ (__5__))`


- `1. select 2. col("storeId") 3. between(20, 30) 4. and 5. col("productId")==2`
- `1. where  2. col("storeId") 3. geq(20).leq(30) 4. &   5. col("productId")==2`
- `1. select 2. "storeId"      3. between(20, 30) 4. &&  5. col("productId")==2`
- `1. select 2. col("storeId") 3. between(20, 30) 4. &&  5. col("productId")=2`
- `1. select 2. col("storeId") 3. between(20, 30) 4. &   5. col("productId")==2`

In [11]:
# Wrong answer, kept as a demonstration: Python's `and` needs a plain bool
# from its left operand, and a Column cannot be converted to one, so a
# ValueError is raised. Column conditions have to be combined with `&`.
# The error is caught and printed so that this intentional failure does not
# halt a Restart & Run All of the notebook.
try:
    transactionsDf.select((col("storeId").between(20, 30)) and (col("productId")==2)).show()
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Cannot convert column into bool: please use '&' for 'and', '|' for 'or', '~' for 'not' when building DataFrame boolean expressions.

In [10]:
# Wrong answer, kept as a demonstration: `geq` is not a Column comparison
# method. Judging by the error message, the attribute lookup itself yields a
# Column, and calling that object raises TypeError. Use `between(20, 30)`
# (or `>=` / `<=`) for the range check instead.
# The error is caught and printed so that this intentional failure does not
# halt a Restart & Run All of the notebook.
try:
    transactionsDf.where((col("storeId").geq(20).leq(30)) & (col("productId")==2)).show()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: 'Column' object is not callable

In [9]:
# Correct answer: build each condition as its own Column expression and join
# them with `&`, the element-wise AND for Columns.
store_in_range = col("storeId").between(20, 30)  # 20 <= storeId <= 30, both bounds inclusive
is_product_two = col("productId") == 2

# A null storeId makes the range check null; the combined flag then shows as
# null unless the productId check is false (see the fifth row of the output).
transactionsDf.select(store_in_range & is_product_two).show()

+-----------------------------------------------------------+
|(((storeId >= 20) AND (storeId <= 30)) AND (productId = 2))|
+-----------------------------------------------------------+
|                                                      false|
|                                                      false|
|                                                      false|
|                                                      false|
|                                                       null|
|                                                       true|
+-----------------------------------------------------------+

