In [1]:
from pyspark.sql import SparkSession

# Build the Spark session (getOrCreate reuses an existing one if present).
# The bare `spark` expression at the end renders the session as cell output.
spark = (
    SparkSession.builder
    .appName("Colab PySpark Setup")
    .getOrCreate()
)

spark

In [2]:
import pandas as pd
from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Location of the employee CSV on the mounted drive (update path if needed)
file_path = '/content/drive/MyDrive/ColabData/large_employee_dataset.csv'

# Read the CSV and convert JoiningDate to datetime in a single chain
df = pd.read_csv(file_path).assign(
    JoiningDate=lambda frame: pd.to_datetime(frame['JoiningDate'])
)


Mounted at /content/drive


In [7]:
# 1. Show the top 10 rows
# Positional slice of the first ten rows of the frame.
df.iloc[:10]


Unnamed: 0,EmployeeID,Name,Age,Department,Salary,JoiningDate,Status,City
0,4128,Charles Johnson,52,HR,64039,2018-07-07,Resigned,Allentown
1,6094,Dylan Camacho,57,Marketing,34686,2015-08-25,Active,Anthonyfort
2,5883,Mr. Ryan Bowman Jr.,29,Finance,64541,2025-03-11,On Leave,Gilesstad
3,9146,Brian Ball,24,Sales,87831,2015-10-01,Resigned,Jenniferfurt
4,1918,Angela Hooper,26,Finance,108773,2019-08-14,On Leave,Lake Amystad
5,4600,Alexander Johnson PhD,45,Sales,75671,2016-04-21,On Leave,Russohaven
6,6253,Steven Lane,47,Finance,64982,2021-07-25,Active,East Robert
7,8278,Riley Johnson,49,HR,43449,2015-08-03,Resigned,New Thomas
8,8520,Emily Washington,43,Finance,33851,2021-11-30,Resigned,West Ashley
9,1298,Valerie Fleming,42,Marketing,70137,2019-12-08,Resigned,Caseborough


In [8]:
# 2. Count the total number of employees
# One row per employee record, so the row count is the head count.
len(df)


500

In [9]:
# 3. Display unique departments
# Distinct Department values in order of first appearance.
pd.unique(df['Department'])


array(['HR', 'Marketing', 'Finance', 'Sales', 'IT'], dtype=object)

In [10]:
# 4. Filter all employees in the "IT" department
# Boolean row selection on an exact match of the Department column.
df.loc[df['Department'].eq('IT')]


Unnamed: 0,EmployeeID,Name,Age,Department,Salary,JoiningDate,Status,City
15,6598,Mary Henson,58,IT,63951,2021-08-25,Active,Port Tricia
23,8518,Elizabeth Abbott,22,IT,91732,2022-11-05,Active,Douglasside
24,9506,Thomas Dunn,45,IT,90340,2020-07-12,On Leave,Lindseychester
32,9663,Glenn Mason,43,IT,109189,2020-03-27,On Leave,Katelynburgh
36,2106,Richard Bailey,45,IT,30950,2021-06-29,Resigned,North John
...,...,...,...,...,...,...,...,...
484,5252,Mary Martinez,22,IT,94629,2019-09-09,Resigned,West Christopher
486,5755,Erika Ortega,42,IT,117517,2023-04-16,On Leave,Kellyfort
488,2581,Alexandria Jones,34,IT,119009,2017-01-22,On Leave,Toniside
493,5876,Nicole Baker,25,IT,64237,2023-12-28,Active,North Nicole


In [11]:
# 5. Show employees aged between 30 and 40
# Series.between includes both endpoints by default, so 30 and 40 are kept.
df[df['Age'].between(30, 40)]


Unnamed: 0,EmployeeID,Name,Age,Department,Salary,JoiningDate,Status,City
14,4676,Christopher Fuller,30,HR,63042,2021-04-30,Resigned,South Donnaville
16,4136,Jerome Torres,30,Finance,68213,2024-11-30,Active,North Justinborough
22,1588,Edwin Burns,34,Sales,108208,2015-09-14,Resigned,South David
28,8074,Fred Brewer,30,HR,100736,2021-06-06,On Leave,Port Wendyville
31,3841,April Allen,36,HR,98845,2020-05-20,Active,Rachelchester
...,...,...,...,...,...,...,...,...
489,6329,Heidi Shaffer,36,HR,119165,2020-01-14,Resigned,New Alexa
490,5998,Bruce Serrano,36,HR,118641,2019-02-02,Active,South Elizabeth
491,2621,Aaron Duncan,36,Finance,76393,2017-07-24,Resigned,Michaelport
492,7189,Spencer Frazier,32,Sales,86055,2025-03-23,Resigned,East Ericborough


In [12]:
# 6. Sort employees by Salary in descending order
# Returns a sorted copy; df itself is left in its original order.
df.sort_values('Salary', ascending=False)


Unnamed: 0,EmployeeID,Name,Age,Department,Salary,JoiningDate,Status,City
445,8860,Cody Williams,30,IT,119978,2019-03-16,Resigned,Susanville
248,4585,Sandra Webster,30,HR,119940,2022-10-21,On Leave,Thompsonport
413,4667,Ronald Hunt,58,Sales,119677,2019-08-29,Resigned,Griffithchester
121,1602,Deborah Williams,25,HR,119397,2023-09-26,On Leave,Port Terrimouth
45,3374,Amanda Green,41,HR,119316,2021-04-08,Resigned,West Shelleyton
...,...,...,...,...,...,...,...,...
320,5332,Nicole Figueroa,42,Finance,30570,2023-11-14,On Leave,Davishaven
437,6408,Vernon Miller,28,Marketing,30547,2018-07-05,Active,New Michelle
239,1320,Michele Lawrence,22,Finance,30110,2024-05-14,Resigned,Natalieborough
99,3087,Mark Padilla,33,HR,30080,2017-10-14,On Leave,North Josephhaven


In [13]:
# 7. Get the average salary by department
# Group the Salary series by the Department series and take the mean per group.
df['Salary'].groupby(df['Department']).mean()


Unnamed: 0_level_0,Salary
Department,Unnamed: 1_level_1
Finance,72834.756303
HR,76091.27451
IT,73116.255556
Marketing,71958.188889
Sales,77488.545455


In [14]:
# 8. Count of employees by Status
# Frequency of each Status value, most common first.
df.loc[:, 'Status'].value_counts()


Unnamed: 0_level_0,count
Status,Unnamed: 1_level_1
Active,172
On Leave,169
Resigned,159


In [15]:
# 9. Highest salary in each city
# Group the Salary series by the City series and keep the maximum per group.
df['Salary'].groupby(df['City']).max()


Unnamed: 0_level_0,Salary
City,Unnamed: 1_level_1
Aaronberg,76528
Adkinsbury,60857
Aguilarchester,37472
Aimeeport,101791
Allenberg,96201
...,...
Whiteport,31057
Whiteview,115326
Williamsborough,70552
Williamsland,49167


In [16]:
# 10. Total number of employees who joined each year
# Note: this adds a YearJoined column to df, which later cells also display.
df['YearJoined'] = df['JoiningDate'].dt.year
# Count per year; the frequency ordering is skipped because the result is
# re-ordered chronologically by sort_index anyway.
df['YearJoined'].value_counts(sort=False).sort_index()


Unnamed: 0_level_0,count
YearJoined,Unnamed: 1_level_1
2015,37
2016,49
2017,44
2018,52
2019,52
2020,56
2021,49
2022,49
2023,47
2024,38


In [17]:
# 11. Department-wise count of employees who are currently "Active"
# Keep only rows whose Status is exactly 'Active', then count rows per department.
df.loc[df['Status'].eq('Active')].groupby('Department').size()


Unnamed: 0_level_0,0
Department,Unnamed: 1_level_1
Finance,45
HR,37
IT,26
Marketing,32
Sales,32


In [18]:
# 12. Average age of employees per department
# Mean of the Age column within each Department group.
df.groupby('Department')['Age'].agg('mean')


Unnamed: 0_level_0,Age
Department,Unnamed: 1_level_1
Finance,39.210084
HR,41.460784
IT,38.688889
Marketing,41.822222
Sales,40.535354


In [19]:
# 13. Create another dataset with City and Region, and join it

# Hand-written lookup of city -> region; only these four cities are mapped.
city_region_map = {
    'Allentown': 'East',
    'Anthonyfort': 'South',
    'Gilesstad': 'West',
    'Jenniferfurt': 'North',
}

# Two-column lookup frame built from the mapping's keys and values
city_region_df = pd.DataFrame({
    'City': list(city_region_map.keys()),
    'Region': list(city_region_map.values()),
})

# Left join keeps every employee row; cities absent from the lookup get a
# missing Region.
df_joined = df.merge(city_region_df, on='City', how='left')
df_joined.head()


Unnamed: 0,EmployeeID,Name,Age,Department,Salary,JoiningDate,Status,City,YearJoined,Region
0,4128,Charles Johnson,52,HR,64039,2018-07-07,Resigned,Allentown,2018,East
1,6094,Dylan Camacho,57,Marketing,34686,2015-08-25,Active,Anthonyfort,2015,South
2,5883,Mr. Ryan Bowman Jr.,29,Finance,64541,2025-03-11,On Leave,Gilesstad,2025,West
3,9146,Brian Ball,24,Sales,87831,2015-10-01,Resigned,Jenniferfurt,2015,North
4,1918,Angela Hooper,26,Finance,108773,2019-08-14,On Leave,Lake Amystad,2019,


In [20]:
# 14. Group salaries by Region after the join
# Total salary per Region. Rows whose Region is missing (cities not present in
# the lookup table) are excluded, because groupby drops missing keys by default.
df_joined.groupby('Region')['Salary'].agg('sum')


Unnamed: 0_level_0,Salary
Region,Unnamed: 1_level_1
East,64039
North,87831
South,34686
West,64541


In [21]:
# 15. Calculate years of experience (completed years between JoiningDate and now)
# Dividing the elapsed day count by 365 ignores leap days, so an employee could
# be credited with a full year a day or two before the actual anniversary.
# Instead, take the calendar-year difference and subtract one when this year's
# anniversary has not been reached yet.
# Note: this adds an ExperienceYears column to df.
_now = pd.Timestamp.now()
_joined = df['JoiningDate']
# True where the joining month/day falls later in the year than today.
_anniversary_pending = (_joined.dt.month > _now.month) | (
    (_joined.dt.month == _now.month) & (_joined.dt.day > _now.day)
)
df['ExperienceYears'] = _now.year - _joined.dt.year - _anniversary_pending.astype(int)
df[['EmployeeID', 'Name', 'JoiningDate', 'ExperienceYears']].head()


Unnamed: 0,EmployeeID,Name,JoiningDate,ExperienceYears
0,4128,Charles Johnson,2018-07-07,6
1,6094,Dylan Camacho,2015-08-25,9
2,5883,Mr. Ryan Bowman Jr.,2025-03-11,0
3,9146,Brian Ball,2015-10-01,9
4,1918,Angela Hooper,2019-08-14,5


In [22]:
# 16. List all employees with more than 5 years of experience
# Strictly greater than 5, so employees with exactly 5 years are not listed.
df.query('ExperienceYears > 5')


Unnamed: 0,EmployeeID,Name,Age,Department,Salary,JoiningDate,Status,City,YearJoined,ExperienceYears
0,4128,Charles Johnson,52,HR,64039,2018-07-07,Resigned,Allentown,2018,6
1,6094,Dylan Camacho,57,Marketing,34686,2015-08-25,Active,Anthonyfort,2015,9
3,9146,Brian Ball,24,Sales,87831,2015-10-01,Resigned,Jenniferfurt,2015,9
5,4600,Alexander Johnson PhD,45,Sales,75671,2016-04-21,On Leave,Russohaven,2016,9
7,8278,Riley Johnson,49,HR,43449,2015-08-03,Resigned,New Thomas,2015,9
...,...,...,...,...,...,...,...,...,...,...
490,5998,Bruce Serrano,36,HR,118641,2019-02-02,Active,South Elizabeth,2019,6
491,2621,Aaron Duncan,36,Finance,76393,2017-07-24,Resigned,Michaelport,2017,7
494,4793,Tara Sanchez,36,Finance,78360,2015-10-31,Resigned,Kimberlyfurt,2015,9
496,6429,Timothy Farrell,52,Finance,101208,2017-03-22,Resigned,Port Jameston,2017,8
