<a href="https://colab.research.google.com/github/Rajaanthonysamy/pyspark/blob/main/04_pysaprk_filtering_operation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Number of rows for the dataset
num_rows = 20

# Fixed seed so that re-running this cell on a fresh kernel regenerates
# exactly the same dataset instead of a new random one each time.
SEED = 42
rng = np.random.default_rng(SEED)

# Placeholder names: Person_1 ... Person_<num_rows>
names = [f'Person_{i+1}' for i in range(num_rows)]

# Random ages between 20 and 60 inclusive (the upper bound 61 is exclusive)
ages = rng.integers(20, 61, num_rows)

# Random experience between 0 and min(age - 20, 30), inclusive.
# The upper bound is an array, so each person is drawn against their own cap.
experiences = rng.integers(0, np.minimum(ages - 20, 30) + 1)

# Salary = 5000 per year of experience plus a random base in [30000, 70000]
salaries = (experiences * 5000 + rng.integers(30000, 70001, num_rows)).astype(int)

# Collect the generated columns into one mapping keyed by column name
data = dict(name=names, age=ages, experience=experiences, salary=salaries)

# Turn the mapping into a pandas DataFrame (one column per key)
df_large = pd.DataFrame(data)

# Write the frame to disk as CSV; index=False keeps the row index out of the file
csv_file_path_large = 'employees_20.csv'
df_large.to_csv(csv_file_path_large, index=False)

# Confirm the write and preview the first rows
print(f"CSV file '{csv_file_path_large}' created successfully with {num_rows} rows. Here's the head of the DataFrame:")
display(df_large.head())

CSV file 'employees_20.csv' created successfully with 20 rows. Here's the head of the DataFrame:


Unnamed: 0,name,age,experience,salary
0,Person_1,41,10,100522
1,Person_2,40,15,121058
2,Person_3,48,8,89818
3,Person_4,48,11,110422
4,Person_5,36,7,102605


In [2]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise start a new one
# registered under the application name "Practice".
spark_session = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)

In [3]:
# Load the generated CSV into a Spark DataFrame.
# header=True takes column names from the first line; inferSchema=True asks
# Spark to infer column types from the data.
df = spark_session.read.csv(
    'employees_20.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print the DataFrame as a text table. The values passed here are show()'s
# defaults, written out explicitly: at most 20 rows, long cell values truncated.
df.show(n=20, truncate=True)

+---------+---+----------+------+
|     name|age|experience|salary|
+---------+---+----------+------+
| Person_1| 41|        10|100522|
| Person_2| 40|        15|121058|
| Person_3| 48|         8| 89818|
| Person_4| 48|        11|110422|
| Person_5| 36|         7|102605|
| Person_6| 40|         9| 75352|
| Person_7| 23|         3| 60597|
| Person_8| 51|        19|136026|
| Person_9| 21|         1| 62001|
|Person_10| 34|        11|122076|
|Person_11| 56|        26|175114|
|Person_12| 36|         5| 58241|
|Person_13| 24|         0| 49201|
|Person_14| 44|         8| 99991|
|Person_15| 31|        11|108423|
|Person_16| 24|         0| 38068|
|Person_17| 59|        18|157644|
|Person_18| 35|         8|104652|
|Person_19| 51|        10| 92730|
|Person_20| 22|         2| 58457|
+---------+---+----------+------+



In [8]:
# Keep only the rows with salary below 50000 (SQL-expression form of the
# predicate), then display them with the "name" column relabelled "name_is".
# Neither step modifies `df` itself; each returns a new DataFrame.
(
    df.where("salary<50000")
    .withColumnRenamed("name", "name_is")
    .show()
)

+---------+---+----------+------+
|  name_is|age|experience|salary|
+---------+---+----------+------+
|Person_13| 24|         0| 49201|
|Person_16| 24|         0| 38068|
+---------+---+----------+------+



In [6]:
# Same salary filter as a SQL-expression string, but project only the
# "name" and "age" columns before displaying.
(
    df.where("salary<50000")
    .select('name', 'age')
    .show()
)

+---------+---+
|     name|age|
+---------+---+
|Person_13| 24|
|Person_16| 24|
+---------+---+



In [9]:
# Column-expression form of a filter: both conditions must hold (`&` is AND),
# i.e. 30000 < salary < 50000. Each comparison needs its own parentheses
# because `&` binds more tightly than `<` / `>` in Python.
df.filter(
    (df.salary < 50000) & (df.salary > 30000)
).show()

+---------+---+----------+------+
|     name|age|experience|salary|
+---------+---+----------+------+
|Person_13| 24|         0| 49201|
|Person_16| 24|         0| 38068|
+---------+---+----------+------+



In [10]:
# Column-expression filter where either condition may hold (`|` is OR).
# NOTE: any number is below 50000 or above 30000 (or both), so this particular
# pair of thresholds excludes nothing -- all 20 rows are returned.
df.filter(
    (df.salary < 50000) | (df.salary > 30000)
).show()

+---------+---+----------+------+
|     name|age|experience|salary|
+---------+---+----------+------+
| Person_1| 41|        10|100522|
| Person_2| 40|        15|121058|
| Person_3| 48|         8| 89818|
| Person_4| 48|        11|110422|
| Person_5| 36|         7|102605|
| Person_6| 40|         9| 75352|
| Person_7| 23|         3| 60597|
| Person_8| 51|        19|136026|
| Person_9| 21|         1| 62001|
|Person_10| 34|        11|122076|
|Person_11| 56|        26|175114|
|Person_12| 36|         5| 58241|
|Person_13| 24|         0| 49201|
|Person_14| 44|         8| 99991|
|Person_15| 31|        11|108423|
|Person_16| 24|         0| 38068|
|Person_17| 59|        18|157644|
|Person_18| 35|         8|104652|
|Person_19| 51|        10| 92730|
|Person_20| 22|         2| 58457|
+---------+---+----------+------+

