In [1]:
import pandas as pd
df = pd.read_csv('/content/employee_records.csv')
df.head()

Unnamed: 0,Employee_ID,Name,Age,Department,Salary ($),Join_Date,Performance_Score,Promoted,Work_Location,Employment_Type
0,10001,Liam,48,Sales,193637,2005-12-31,9,No,Hybrid,Full-time
1,10002,David,51,IT,168107,2022-01-13,8,No,Remote,Contract
2,10003,Mason,51,Marketing,172319,2021-03-18,7,No,Remote,Full-time
3,10004,Emma,24,IT,32579,2024-01-15,9,No,Office,Part-time
4,10005,Liam,64,Sales,192220,2007-12-02,4,No,Office,Contract


# 1️⃣ Basic Data Exploration

In [2]:
df.info()
df.shape
df.describe()
df.dtypes
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Employee_ID        10000 non-null  int64 
 1   Name               10000 non-null  object
 2   Age                10000 non-null  int64 
 3   Department         10000 non-null  object
 4   Salary ($)         10000 non-null  int64 
 5   Join_Date          10000 non-null  object
 6   Performance_Score  10000 non-null  int64 
 7   Promoted           10000 non-null  object
 8   Work_Location      10000 non-null  object
 9   Employment_Type    10000 non-null  object
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


Index(['Employee_ID', 'Name', 'Age', 'Department', 'Salary ($)', 'Join_Date',
       'Performance_Score', 'Promoted', 'Work_Location', 'Employment_Type'],
      dtype='object')

In [4]:
# converting column names to snale case
df.columns = df.columns.str.lower().str.replace(' ($)', '').str.replace(' ', '_')
df.columns

Index(['employee_id', 'name', 'age', 'department', 'salary', 'join_date',
       'performance_score', 'promoted', 'work_location', 'employment_type'],
      dtype='object')

In [15]:
df['join_date'] = pd.to_datetime(df['join_date'])
df['join_month'] = df['join_date'].dt.month_name()
df['join_year'] = df['join_date'].dt.year

In [16]:
# Count the number of employees in each department.
print(df['department'].value_counts())
# Find the average salary of all employees.
print(df['salary'].mean())
# Find the highest and lowest salaries.
print(df['salary'].max(), df['salary'].min())

department
Marketing      1759
Finance        1699
HR             1645
IT             1644
Engineering    1638
Sales          1615
Name: count, dtype: int64
114816.647
500000 15000


In [31]:
for column in ['department', 'work_location', 'employment_type']:
    df[column] = df[column].str.lower()

# 2️⃣ Filtering Based on Conditions

In [33]:
# Select all employees in the IT department.
it_dept_employees = df[df['department'] == 'it']
print(f'There are {it_dept_employees.shape[0]} IT employees')

# Extract employees older than 50 years
employees_over_50 = df[df['age'] > 50]
print(f'There are {employees_over_50.shape[0]} employees older the 50 years' )

# Find employees earning more than $100,000.
over_100k_earners = df[df['salary'] > 100000]
print(f'{over_100k_earners.shape[0]} employees earn over $100,000')

# Get all employees who joined before 2015.
employed_before_2015 = df[df['join_year'] < 2015]
print(f'{employed_before_2015.shape[0]} employees were employed before 2015')

performance_over_8 = df[df['performance_score'] > 8]
print(f'{performance_over_8.shape[0]} employees have a performance score over 8')


There are 1644 IT employees
There are 3125 employees older the 50 years
5843 employees earn over $100,000
5650 employees were employed before 2015
2021 employees have a performance score over 8


# 3️⃣ Advanced Filtering

In [34]:
# Find all HR employees working remotely
remote_hr = df[(df['work_location'] == 'remote') & (df['department'] == 'hr')]
print(f'{remote_hr.shape[0]} HR employees work remotely')

# Get Engineering employees who are Full-time and earn more than $120,000.
fulltime_engineers_over_120k = df[
    (df['department'] == 'engineering') &
    (df['employment_type'] == 'full-time') &
    (df['salary'] > 120000)]
print(f'{fulltime_engineers_over_120k.shape[0]} Full-time engineers earn over $120,000')

555 HR employees work remotely
246 Full-time engineers earn over $120,000


In [26]:
df['department'].unique()

array(['Sales', 'IT', 'Marketing', 'HR', 'Engineering', 'Finance'],
      dtype=object)