In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, IntegerType, ArrayType

data = [
    (("James","","Smith"),["Java","Scala","C++"],"OH","M"),
    (("Anna","Rose",""),["Spark","Java","C++"],"NY","F"),
    (("Julia","","Williams"),["CSharp","VB"],"OH","F"),
    (("Maria","Anne","Jones"),["CSharp","VB"],"NY","M"),
    (("Jen","Mary","Brown"),["CSharp","VB"],"NY","M"),
    (("Mike","Mary","Williams"),["Python","VB"],"OH","M")
 ]

schema = StructType([
    StructField("Name",StructType([
        StructField("Firstname",StringType()),
        StructField("Middlename", StringType()),
        StructField("Lastname", StringType())
        ])),
    StructField("Languages", ArrayType(StringType())),
    StructField("Country", StringType()),
    StructField("Gender", StringType())
                ])
                
spark = SparkSession.builder.appName("Spark filter").getOrCreate()

df = spark.createDataFrame(data = data, schema = schema)
df.printSchema()
df.show()

root
 |-- Name: struct (nullable = true)
 |    |-- Firstname: string (nullable = true)
 |    |-- Middlename: string (nullable = true)
 |    |-- Lastname: string (nullable = true)
 |-- Languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Country: string (nullable = true)
 |-- Gender: string (nullable = true)

+--------------------+------------------+-------+------+
|                Name|         Languages|Country|Gender|
+--------------------+------------------+-------+------+
|    {James, , Smith}|[Java, Scala, C++]|     OH|     M|
|      {Anna, Rose, }|[Spark, Java, C++]|     NY|     F|
| {Julia, , Williams}|      [CSharp, VB]|     OH|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|     NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|     NY|     M|
|{Mike, Mary, Will...|      [Python, VB]|     OH|     M|
+--------------------+------------------+-------+------+



In [2]:
df.show(truncate =False)

+----------------------+------------------+-------+------+
|Name                  |Languages         |Country|Gender|
+----------------------+------------------+-------+------+
|{James, , Smith}      |[Java, Scala, C++]|OH     |M     |
|{Anna, Rose, }        |[Spark, Java, C++]|NY     |F     |
|{Julia, , Williams}   |[CSharp, VB]      |OH     |F     |
|{Maria, Anne, Jones}  |[CSharp, VB]      |NY     |M     |
|{Jen, Mary, Brown}    |[CSharp, VB]      |NY     |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH     |M     |
+----------------------+------------------+-------+------+



## 1. DataFrame filter() with Column Condition

In [3]:
# returns the if gender is male
df.filter(df.Gender == "M").show(truncate=False)

+----------------------+------------------+-------+------+
|Name                  |Languages         |Country|Gender|
+----------------------+------------------+-------+------+
|{James, , Smith}      |[Java, Scala, C++]|OH     |M     |
|{Maria, Anne, Jones}  |[CSharp, VB]      |NY     |M     |
|{Jen, Mary, Brown}    |[CSharp, VB]      |NY     |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH     |M     |
+----------------------+------------------+-------+------+



In [4]:
# returns the if country is "OH"
df.filter(df.Country == "OH").show(truncate=False)

+----------------------+------------------+-------+------+
|Name                  |Languages         |Country|Gender|
+----------------------+------------------+-------+------+
|{James, , Smith}      |[Java, Scala, C++]|OH     |M     |
|{Julia, , Williams}   |[CSharp, VB]      |OH     |F     |
|{Mike, Mary, Williams}|[Python, VB]      |OH     |M     |
+----------------------+------------------+-------+------+



In [32]:
# not equal operator
df.filter(~(df.Country == "OH")).show()
# df.filter(df.Country != "OH").show()

+--------------------+------------------+-------+------+
|                Name|         Languages|Country|Gender|
+--------------------+------------------+-------+------+
|      {Anna, Rose, }|[Spark, Java, C++]|     NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|     NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|     NY|     M|
+--------------------+------------------+-------+------+



In [33]:
# using the col function 
from pyspark.sql.functions import col
df.filter(col("Gender") == "M").show()

+--------------------+------------------+-------+------+
|                Name|         Languages|Country|Gender|
+--------------------+------------------+-------+------+
|    {James, , Smith}|[Java, Scala, C++]|     OH|     M|
|{Maria, Anne, Jones}|      [CSharp, VB]|     NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|     NY|     M|
|{Mike, Mary, Will...|      [Python, VB]|     OH|     M|
+--------------------+------------------+-------+------+



## 3. Filtering with SQL Expression

In [37]:
df.filter("Gender == 'F'").show()

+-------------------+------------------+-------+------+
|               Name|         Languages|Country|Gender|
+-------------------+------------------+-------+------+
|     {Anna, Rose, }|[Spark, Java, C++]|     NY|     F|
|{Julia, , Williams}|      [CSharp, VB]|     OH|     F|
+-------------------+------------------+-------+------+



In [39]:
# For not equal
df.filter("Gender != 'M'").show()
df.filter("Gender <> 'M'").show()

+-------------------+------------------+-------+------+
|               Name|         Languages|Country|Gender|
+-------------------+------------------+-------+------+
|     {Anna, Rose, }|[Spark, Java, C++]|     NY|     F|
|{Julia, , Williams}|      [CSharp, VB]|     OH|     F|
+-------------------+------------------+-------+------+

+-------------------+------------------+-------+------+
|               Name|         Languages|Country|Gender|
+-------------------+------------------+-------+------+
|     {Anna, Rose, }|[Spark, Java, C++]|     NY|     F|
|{Julia, , Williams}|      [CSharp, VB]|     OH|     F|
+-------------------+------------------+-------+------+



## 4. PySpark Filter with Multiple Conditions

In [47]:
df.filter((df.Country  == "OH") & (df.Gender  == "M")).show(truncate=False)

+----------------------+------------------+-------+------+
|Name                  |Languages         |Country|Gender|
+----------------------+------------------+-------+------+
|{James, , Smith}      |[Java, Scala, C++]|OH     |M     |
|{Mike, Mary, Williams}|[Python, VB]      |OH     |M     |
+----------------------+------------------+-------+------+



In [44]:
from pyspark.sql.functions import array_contains
df.filter(("Gender == 'F'") or (array_contains(df.Languages,"Python"))).show()

+-------------------+------------------+-------+------+
|               Name|         Languages|Country|Gender|
+-------------------+------------------+-------+------+
|     {Anna, Rose, }|[Spark, Java, C++]|     NY|     F|
|{Julia, , Williams}|      [CSharp, VB]|     OH|     F|
+-------------------+------------------+-------+------+



## 5. Filter Based on List Values

In [8]:
lst = ["OH","CA"]
df.filter(df.Country.isin(lst)).show()

+--------------------+------------------+-------+------+
|                Name|         Languages|Country|Gender|
+--------------------+------------------+-------+------+
|    {James, , Smith}|[Java, Scala, C++]|     OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|     OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|     OH|     M|
+--------------------+------------------+-------+------+



## 6. Filter Based on Starts With, Ends With, Contains

In [12]:
# startswith method
df.filter(df.Country.startswith('O')).show()

+--------------------+------------------+-------+------+
|                Name|         Languages|Country|Gender|
+--------------------+------------------+-------+------+
|    {James, , Smith}|[Java, Scala, C++]|     OH|     M|
| {Julia, , Williams}|      [CSharp, VB]|     OH|     F|
|{Mike, Mary, Will...|      [Python, VB]|     OH|     M|
+--------------------+------------------+-------+------+



In [13]:
# endswith method
df.filter(df.Country.endswith('Y')).show()

+--------------------+------------------+-------+------+
|                Name|         Languages|Country|Gender|
+--------------------+------------------+-------+------+
|      {Anna, Rose, }|[Spark, Java, C++]|     NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|     NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|     NY|     M|
+--------------------+------------------+-------+------+



In [16]:
# startswith method
df.filter(df.Country.contains("N")).show()

+--------------------+------------------+-------+------+
|                Name|         Languages|Country|Gender|
+--------------------+------------------+-------+------+
|      {Anna, Rose, }|[Spark, Java, C++]|     NY|     F|
|{Maria, Anne, Jones}|      [CSharp, VB]|     NY|     M|
|  {Jen, Mary, Brown}|      [CSharp, VB]|     NY|     M|
+--------------------+------------------+-------+------+



## 7. Filtering with Regular Expression

In [19]:
data2 = [(2,"Michael Rose"),(3,"Robert Williams"),
     (4,"Rames Rose"),(5,"Rames rose")
  ]
df2 = spark.createDataFrame(data = data2, schema = ["id","name"])

# like - SQL LIKE pattern
df2.filter(df2.name.like("%rose%")).show()


# rlike - SQL RLIKE pattern (LIKE with Regex)
# This check case insensitive
df2.filter(df2.name.rlike("(?i)^*rose$")).show()


+---+----------+
| id|      name|
+---+----------+
|  5|Rames rose|
+---+----------+

+---+------------+
| id|        name|
+---+------------+
|  2|Michael Rose|
|  4|  Rames Rose|
|  5|  Rames rose|
+---+------------+



## 8. Filtering Array column

In [20]:
from pyspark.sql.functions import array_contains
df.filter(array_contains(df.Languages,"Python")).show()

+--------------------+------------+-------+------+
|                Name|   Languages|Country|Gender|
+--------------------+------------+-------+------+
|{Mike, Mary, Will...|[Python, VB]|     OH|     M|
+--------------------+------------+-------+------+



In [25]:
from pyspark.sql.functions import array_contains
df.filter(array_contains(df.Languages,"Java")).show()

+----------------+------------------+-------+------+
|            Name|         Languages|Country|Gender|
+----------------+------------------+-------+------+
|{James, , Smith}|[Java, Scala, C++]|     OH|     M|
|  {Anna, Rose, }|[Spark, Java, C++]|     NY|     F|
+----------------+------------------+-------+------+



## 9. Filtering on Nested Struct columns

In [29]:
df.printSchema()

root
 |-- Name: struct (nullable = true)
 |    |-- Firstname: string (nullable = true)
 |    |-- Middlename: string (nullable = true)
 |    |-- Lastname: string (nullable = true)
 |-- Languages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Country: string (nullable = true)
 |-- Gender: string (nullable = true)



In [30]:
# Struct condition
df.filter(df.Name.Lastname == "Williams") \
    .show(truncate=False) 


+----------------------+------------+-------+------+
|Name                  |Languages   |Country|Gender|
+----------------------+------------+-------+------+
|{Julia, , Williams}   |[CSharp, VB]|OH     |F     |
|{Mike, Mary, Williams}|[Python, VB]|OH     |M     |
+----------------------+------------+-------+------+

