In [1]:
#Basic imports

import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Start (or reuse) the Spark session used throughout this notebook

spark = (
    SparkSession.builder
    .appName('Basics_pyspark_df')
    .getOrCreate()
)

# Sample rows, one tuple per student:
# (name as (first, middle, last), gender, subjects, status, country)

data = [
    (('Ram', '', 'Singh'), 'M', ['Maths', 'English', 'Science'], 'PASS', 'IN'),
    (('Shyamo', 'Kumari', ''), 'F', ['Maths', 'Science'], 'FAIL', 'AUS'),
    (('Rahul', 'Kumar', 'Varshney'), 'M', ['English', 'Science'], 'FAIL', 'AUS'),
    (('Axeli', '', 'Patel'), 'F', ['Maths', 'English'], 'PASS', 'PAK'),
    (('Virat', 'Kohli', ''), 'M', ['Maths', 'English'], 'FAIL', 'IN'),
    (('Vinai', 'Raj', 'Singh'), 'F', ['English', 'Science'], 'PASS', 'PAK'),
]

# Explicit schema: `name` is a nested struct of three strings,
# `subject` is an array of strings, everything else is a plain string.

name_struct = StructType([
    StructField('firstname', StringType(), True),
    StructField('middlename', StringType(), True),
    StructField('lastname', StringType(), True),
])

schema = StructType([
    StructField('name', name_struct),
    StructField('gender', StringType(), True),
    StructField('subject', ArrayType(StringType()), True),
    StructField('status', StringType(), True),
    StructField('country', StringType(), True),
])

# Build the DataFrame, then print its schema and all rows without truncation
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- subject: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- status: string (nullable = true)
 |-- country: string (nullable = true)

+------------------------+------+-------------------------+------+-------+
|name                    |gender|subject                  |status|country|
+------------------------+------+-------------------------+------+-------+
|{Ram, , Singh}          |M     |[Maths, English, Science]|PASS  |IN     |
|{Shyamo, Kumari, }      |F     |[Maths, Science]         |FAIL  |AUS    |
|{Rahul, Kumar, Varshney}|M     |[English, Science]       |FAIL  |AUS    |
|{Axeli, , Patel}        |F     |[Maths, English]         |PASS  |PAK    |
|{Virat, Kohli, }        |M     |[Maths, English]         |FAIL  |IN     |
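|{Vinai, Raj, Singh}     |F     |[English, Science]       |PASS  |PAK    |
+------------------------+------+-------------------------+------+-------+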

In [2]:
# Option 1: condition built from an attribute-style column reference
is_male = df.gender == "M"
df.filter(is_male).show()

# Option 2: condition built with col()
is_female = col('gender') == "F"
df.filter(is_female).show()

# Nested struct field: reach name.firstname with dot access
is_virat = df.name.firstname == 'Virat'
df.filter(is_virat).show()

+--------------------+------+--------------------+------+-------+
|                name|gender|             subject|status|country|
+--------------------+------+--------------------+------+-------+
|      {Ram, , Singh}|     M|[Maths, English, ...|  PASS|     IN|
|{Rahul, Kumar, Va...|     M|  [English, Science]|  FAIL|    AUS|
|    {Virat, Kohli, }|     M|    [Maths, English]|  FAIL|     IN|
+--------------------+------+--------------------+------+-------+

+-------------------+------+------------------+------+-------+
|               name|gender|           subject|status|country|
+-------------------+------+------------------+------+-------+
| {Shyamo, Kumari, }|     F|  [Maths, Science]|  FAIL|    AUS|
|   {Axeli, , Patel}|     F|  [Maths, English]|  PASS|    PAK|
|{Vinai, Raj, Singh}|     F|[English, Science]|  PASS|    PAK|
+-------------------+------+------------------+------+-------+

+----------------+------+----------------+------+-------+
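|            name|gender|         subject|status|country|
+----------------+------+----------------+------+-------+
|{Virat, Kohli, }|     M|[Maths, English]|  FAIL|     IN|
+----------------+------+----------------+------+-------+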

In [3]:
# Filter with a SQL expression string: rows whose status equals 'PASS'
passed_expr = "status == 'PASS'"
df.filter(passed_expr).show()

# Same column with the inequality operator: rows whose status is not 'PASS'
not_passed_expr = "status != 'PASS'"
df.filter(not_passed_expr).show()

+-------------------+------+--------------------+------+-------+
|               name|gender|             subject|status|country|
+-------------------+------+--------------------+------+-------+
|     {Ram, , Singh}|     M|[Maths, English, ...|  PASS|     IN|
|   {Axeli, , Patel}|     F|    [Maths, English]|  PASS|    PAK|
|{Vinai, Raj, Singh}|     F|  [English, Science]|  PASS|    PAK|
+-------------------+------+--------------------+------+-------+

+--------------------+------+------------------+------+-------+
|                name|gender|           subject|status|country|
+--------------------+------+------------------+------+-------+
|  {Shyamo, Kumari, }|     F|  [Maths, Science]|  FAIL|    AUS|
|{Rahul, Kumar, Va...|     M|[English, Science]|  FAIL|    AUS|
|    {Virat, Kohli, }|     M|  [Maths, English]|  FAIL|     IN|
+--------------------+------+------------------+------+-------+



In [4]:
# Combining several conditions in one filter

# Option 1: Column expressions joined with & (each side must be parenthesised)
male_and_pass = (df.gender == 'M') & (df.status == 'PASS')
df.filter(male_and_pass).show()

# Option 2: SQL expression string joined with `and`
# (note: this one selects status FAIL, whereas option 1 selects PASS)
male_and_fail_expr = "gender == 'M' and status == 'FAIL'"
df.filter(male_and_fail_expr).show()

+--------------+------+--------------------+------+-------+
|          name|gender|             subject|status|country|
+--------------+------+--------------------+------+-------+
|{Ram, , Singh}|     M|[Maths, English, ...|  PASS|     IN|
+--------------+------+--------------------+------+-------+

+--------------------+------+------------------+------+-------+
|                name|gender|           subject|status|country|
+--------------------+------+------------------+------+-------+
|{Rahul, Kumar, Va...|     M|[English, Science]|  FAIL|    AUS|
|    {Virat, Kohli, }|     M|  [Maths, English]|  FAIL|     IN|
+--------------------+------+------------------+------+-------+



In [5]:
# Keep rows whose gender appears in a list of candidate values.
# Note: the sample data encodes gender as 'M' / 'F', so 'Female' matches no row
# and only the 'M' rows are returned.

sub = ['M', 'Female']
gender_in_list = df.gender.isin(sub)
df.filter(gender_in_list).show()

+--------------------+------+--------------------+------+-------+
|                name|gender|             subject|status|country|
+--------------------+------+--------------------+------+-------+
|      {Ram, , Singh}|     M|[Maths, English, ...|  PASS|     IN|
|{Rahul, Kumar, Va...|     M|  [English, Science]|  FAIL|    AUS|
|    {Virat, Kohli, }|     M|    [Maths, English]|  FAIL|     IN|
+--------------------+------+--------------------+------+-------+



In [6]:
# Keep rows whose gender is NOT in the list `sub` (defined in an earlier cell)

# Option 1: negate the isin() condition with ~
gender_not_in_list = ~df.gender.isin(sub)
df.filter(gender_not_in_list).show()

# Option 2: compare the isin() condition to False
gender_not_in_list_alt = df.gender.isin(sub) == False
df.filter(gender_not_in_list_alt).show()

+-------------------+------+------------------+------+-------+
|               name|gender|           subject|status|country|
+-------------------+------+------------------+------+-------+
| {Shyamo, Kumari, }|     F|  [Maths, Science]|  FAIL|    AUS|
|   {Axeli, , Patel}|     F|  [Maths, English]|  PASS|    PAK|
|{Vinai, Raj, Singh}|     F|[English, Science]|  PASS|    PAK|
+-------------------+------+------------------+------+-------+

+-------------------+------+------------------+------+-------+
|               name|gender|           subject|status|country|
+-------------------+------+------------------+------+-------+
| {Shyamo, Kumari, }|     F|  [Maths, Science]|  FAIL|    AUS|
|   {Axeli, , Patel}|     F|  [Maths, English]|  PASS|    PAK|
|{Vinai, Raj, Singh}|     F|[English, Science]|  PASS|    PAK|
+-------------------+------+------------------+------+-------+



In [7]:
# Prefix match: country codes beginning with "I"
starts_with_i = df.country.startswith("I")
df.filter(starts_with_i).show()

# Suffix match: country codes ending with "S"
ends_with_s = df.country.endswith("S")
df.filter(ends_with_s).show()

# Substring match: country codes containing "A" anywhere
contains_a = df.country.contains("A")
df.filter(contains_a).show()

+----------------+------+--------------------+------+-------+
|            name|gender|             subject|status|country|
+----------------+------+--------------------+------+-------+
|  {Ram, , Singh}|     M|[Maths, English, ...|  PASS|     IN|
|{Virat, Kohli, }|     M|    [Maths, English]|  FAIL|     IN|
+----------------+------+--------------------+------+-------+

+--------------------+------+------------------+------+-------+
|                name|gender|           subject|status|country|
+--------------------+------+------------------+------+-------+
|  {Shyamo, Kumari, }|     F|  [Maths, Science]|  FAIL|    AUS|
|{Rahul, Kumar, Va...|     M|[English, Science]|  FAIL|    AUS|
+--------------------+------+------------------+------+-------+

+--------------------+------+------------------+------+-------+
|                name|gender|           subject|status|country|
+--------------------+------+------------------+------+-------+
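|  {Shyamo, Kumari, }|     F|  [Maths, Science]|  FAIL|    AUS|
|{Rahul, Kumar, Va...|     M|[English, Science]|  FAIL|    AUS|
|    {Axeli, , Patel}|     F|  [Maths, English]|  PASS|    PAK|
| {Vinai, Raj, Singh}|     F|[English, Science]|  PASS|    PAK|
+--------------------+------+------------------+------+-------+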

In [8]:
# Pattern matching on string columns (both forms are case-sensitive)

# like(): SQL LIKE pattern, where % stands for any run of characters.
# Selects rows whose first name contains "Vi".
df.filter(df.name.firstname.like('%Vi%')).show()

# rlike(): regular-expression match.
# Selects rows whose middle name ends in "i".
# The previous pattern '^*i$' applied the '*' quantifier to the '^' anchor,
# which is malformed and only happened to be accepted by the regex engine;
# 'i$' states the intent directly and selects the same rows for this data.
df.filter(df.name.middlename.rlike('i$')).show()

+-------------------+------+------------------+------+-------+
|               name|gender|           subject|status|country|
+-------------------+------+------------------+------+-------+
|   {Virat, Kohli, }|     M|  [Maths, English]|  FAIL|     IN|
|{Vinai, Raj, Singh}|     F|[English, Science]|  PASS|    PAK|
+-------------------+------+------------------+------+-------+

+------------------+------+----------------+------+-------+
|              name|gender|         subject|status|country|
+------------------+------+----------------+------+-------+
|{Shyamo, Kumari, }|     F|[Maths, Science]|  FAIL|    AUS|
|  {Virat, Kohli, }|     M|[Maths, English]|  FAIL|     IN|
+------------------+------+----------------+------+-------+



In [9]:
# Filter on an array column: keep rows whose subject list includes "Science"

has_science = array_contains(df.subject, "Science")
df.filter(has_science).show(truncate=False)

+------------------------+------+-------------------------+------+-------+
|name                    |gender|subject                  |status|country|
+------------------------+------+-------------------------+------+-------+
|{Ram, , Singh}          |M     |[Maths, English, Science]|PASS  |IN     |
|{Shyamo, Kumari, }      |F     |[Maths, Science]         |FAIL  |AUS    |
|{Rahul, Kumar, Varshney}|M     |[English, Science]       |FAIL  |AUS    |
|{Vinai, Raj, Singh}     |F     |[English, Science]       |PASS  |PAK    |
+------------------------+------+-------------------------+------+-------+



# NOTE: All of these operations can be done with "where" in place of "filter".


In [11]:
# Array-membership filter written with where() instead of filter()
has_science = array_contains(df.subject, "Science")
df.where(has_science).show(truncate=False)

+------------------------+------+-------------------------+------+-------+
|name                    |gender|subject                  |status|country|
+------------------------+------+-------------------------+------+-------+
|{Ram, , Singh}          |M     |[Maths, English, Science]|PASS  |IN     |
|{Shyamo, Kumari, }      |F     |[Maths, Science]         |FAIL  |AUS    |
|{Rahul, Kumar, Varshney}|M     |[English, Science]       |FAIL  |AUS    |
|{Vinai, Raj, Singh}     |F     |[English, Science]       |PASS  |PAK    |
+------------------------+------+-------------------------+------+-------+

