In [19]:
'''

@Author: Vighnesh Harish Bilgi
@Date: 2022-12-14
@Last Modified by: Vighnesh Harish Bilgi
@Last Modified time: 2022-12-14
@Title : 3 - Pyspark dataframe filter operations

'''

'\n\n@Author: Vighnesh Harish Bilgi\n@Date: 2022-12-14\n@Last Modified by: Vighnesh Harish Bilgi\n@Last Modified time: 2022-12-14\n@Title : 3 - Pyspark dataframe filter operations\n\n'

In [20]:
import findspark
# Run before importing pyspark: findspark.init() locates the local Spark
# installation and makes the pyspark package importable from this kernel.
findspark.init()

In [21]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer

## Filter operations on a PySpark DataFrame

### Creating spark session

In [22]:
# Create the SparkSession for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Dataframe - filter')
    .getOrCreate()
)

In [23]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

### Reading a .csv file as a Spark DataFrame with header and inferSchema set to True

In [24]:
# Load test3.csv: the first row supplies the column names and Spark infers
# each column's type from the data instead of reading everything as strings.
df_pyspark = spark.read.csv('test3.csv', header=True, inferSchema=True)
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [25]:
# Print each column's name, inferred type and nullability to check the result of inferSchema.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



### Filtering dataframe to fetch records where salary <= 20000

In [26]:
# SQL-expression form: the condition is passed as a string that Spark parses.
df_pyspark.filter("Salary<=20000").show()

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



### Filtering dataframe to fetch records where salary <= 20000 and only selecting columns 'Name' and 'age'

In [27]:
# Filter first, then project: only Name and age are kept for the matching rows.
(
    df_pyspark
    .filter("Salary<=20000")
    .select('Name', 'age')
    .show()
)

+-------+---+
|   Name|age|
+-------+---+
|  Sunny| 29|
|   Paul| 24|
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [28]:
# Column-expression form of the same salary <= 20000 filter: the condition is
# built from a Column object rather than a SQL string.
(
    df_pyspark
    .filter(df_pyspark['Salary'] <= 20000)
    .show()
)

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



### Filtering dataframe to fetch records where salary <= 20000 AND salary >= 15000 (& operator)

In [29]:
# & combines two Column conditions. Each comparison needs its own parentheses
# because & binds more tightly than <= and >= in Python.
(
    df_pyspark
    .filter(
        (df_pyspark['Salary'] <= 20000)
        & (df_pyspark['Salary'] >= 15000)
    )
    .show()
)

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



### Filtering dataframe to fetch records where salary <= 20000 OR salary >= 15000 (| operator) — every salary satisfies at least one of these conditions, so all rows are returned

In [30]:
# | combines two Column conditions with OR; parenthesise each comparison.
# Note: every non-null salary is <= 20000 or >= 15000, so this keeps all rows.
(
    df_pyspark
    .filter(
        (df_pyspark['Salary'] <= 20000)
        | (df_pyspark['Salary'] >= 15000)
    )
    .show()
)

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



### Filtering dataframe to fetch records where NOT (salary <= 20000), i.e. salary > 20000 (~ operator)

In [31]:
# ~ negates a Column condition; the comparison must be parenthesised so that
# ~ applies to the whole condition rather than to the column alone.
(
    df_pyspark
    .filter(~(df_pyspark['Salary'] <= 20000))
    .show()
)

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
+---------+---+----------+------+

