In [1]:
import pandas as pd

In [2]:
# Load the cleaned HR dataset from CSV into the DataFrame `df`.
# The path is relative, so it is resolved against the kernel's current
# working directory.
file_path = '../data/cleaned/hr_dataset_cleaned.csv'
df = pd.read_csv(file_path)

In [3]:
# Preview the top of the frame (first 5 rows) as the cell's rich output
df.head(n=5)

Unnamed: 0,Age,Attrition,Business_Travel,Daily_Rate,Department,Distance_From_Home,Education,Education_Field,Environment_Satisfaction,Gender,...,Performance_Rating,Relationship_Satisfaction,Stock_Option_Level,Total_Working_Years,Training_Times_Last_Year,Work_Life_Balance,Years_At_Company,Years_In_Current_Role,Years_Since_Last_Promotion,Years_With_Current_Manager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2


In [4]:
# Print a structural summary of df: index range, column names,
# non-null count per column, and each column's dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Age                         1470 non-null   int64 
 1   Attrition                   1470 non-null   object
 2   Business_Travel             1470 non-null   object
 3   Daily_Rate                  1470 non-null   int64 
 4   Department                  1470 non-null   object
 5   Distance_From_Home          1470 non-null   int64 
 6   Education                   1470 non-null   int64 
 7   Education_Field             1470 non-null   object
 8   Environment_Satisfaction    1470 non-null   int64 
 9   Gender                      1470 non-null   object
 10  Hourly_Rate                 1470 non-null   int64 
 11  Job_Involvement             1470 non-null   int64 
 12  Job_Level                   1470 non-null   int64 
 13  Job_Role                    1470 non-null   obje

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
# Called with default arguments on this mixed-dtype frame, describe()
# reports the numeric columns only.
df.describe()

Unnamed: 0,Age,Daily_Rate,Distance_From_Home,Education,Environment_Satisfaction,Hourly_Rate,Job_Involvement,Job_Level,Job_Satisfaction,Monthly_Income,...,Performance_Rating,Relationship_Satisfaction,Stock_Option_Level,Total_Working_Years,Training_Times_Last_Year,Work_Life_Balance,Years_At_Company,Years_In_Current_Role,Years_Since_Last_Promotion,Years_With_Current_Manager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,2.721769,65.891156,2.729932,2.063946,2.728571,6502.931293,...,3.153741,2.712245,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,1.093082,20.329428,0.711561,1.10694,1.102846,4707.956783,...,0.360824,1.081209,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,1009.0,...,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,2.0,48.0,2.0,1.0,2.0,2911.0,...,3.0,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,3.0,66.0,3.0,2.0,3.0,4919.0,...,3.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,4.0,83.75,3.0,3.0,4.0,8379.0,...,3.0,4.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,19999.0,...,4.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [7]:
# Headcount: the notebook treats every row of df as one employee
total_employees = len(df)

# Leavers: number of rows whose Attrition value is exactly 'Yes'
attrition_count = int((df['Attrition'] == 'Yes').sum())

# Leavers as a share of the whole headcount, in percent
attrition_rate = (attrition_count / total_employees) * 100

print("Total Employees:", total_employees)
print("Employees who left:", attrition_count)
print("Attrition Rate: {:.2f}%".format(attrition_rate))

Total Employees: 1470
Employees who left: 237
Attrition Rate: 16.12%


In [8]:
# Keep only the rows with Attrition == 'Yes', then tally them per Gender
attrition_by_gender = df.loc[df['Attrition'].eq('Yes')].groupby('Gender').size()

# Heading first, then the per-gender counts on the following lines
print("Attrition Count by Gender:", attrition_by_gender, sep="\n")

Attrition Count by Gender:
Gender
Female     87
Male      150
dtype: int64


In [9]:
# Keep only the rows with Attrition == 'Yes', then tally them per Department
attrition_by_dept = df.loc[df['Attrition'].eq('Yes')].groupby('Department').size()

# Heading first, then the per-department counts on the following lines
print("Attrition Count by Department:", attrition_by_dept, sep="\n")

Attrition Count by Department:
Department
Human Resources            12
Research & Development    133
Sales                      92
dtype: int64


In [11]:
# Number of leavers (Attrition == 'Yes') per Job_Role, largest count first
attrition_by_job = (
    df.loc[df['Attrition'].eq('Yes')]
    .groupby('Job_Role')
    .size()
    .sort_values(ascending=False)
)

# Heading first, then the per-role counts on the following lines
print("Attrition Count by Job Role:", attrition_by_job, sep="\n")

Attrition Count by Job Role:
Job_Role
Laboratory Technician        62
Sales Executive              57
Research Scientist           47
Sales Representative         33
Human Resources              12
Manufacturing Director       10
Healthcare Representative     9
Manager                       5
Research Director             2
dtype: int64


In [12]:
# Mean Monthly_Income within each Attrition group (one value per 'No' / 'Yes')
income_vs_attrition = df['Monthly_Income'].groupby(df['Attrition']).mean()

# Heading first, then the per-group means on the following lines
print("Average Monthly Income by Attrition Status:", income_vs_attrition, sep="\n")

Average Monthly Income by Attrition Status:
Attrition
No     6832.739659
Yes    4787.092827
Name: Monthly_Income, dtype: float64


In [13]:
# Mean Years_At_Company within each Attrition group (one value per 'No' / 'Yes')
years_vs_attrition = df['Years_At_Company'].groupby(df['Attrition']).mean()

# Heading first, then the per-group means on the following lines
print("Average Years at Company by Attrition Status:", years_vs_attrition, sep="\n")

Average Years at Company by Attrition Status:
Attrition
No     7.369019
Yes    5.130802
Name: Years_At_Company, dtype: float64
