In [None]:
# Importing Required Libraries
import pandas as pd
import numpy as np

# Modeling Libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder

import warnings
# Install a global "ignore" filter so no warnings are shown in later cell output.
# NOTE(review): this also hides deprecation/future warnings raised by pandas — confirm that is intended.
warnings.filterwarnings("ignore")


**Loading the dataset**

In [None]:
# Read the HR employee-attrition CSV (resolved relative to the working directory) into `df`.
df = pd.read_csv(
    filepath_or_buffer='WA_Fn-UseC_-HR-Employee-Attrition.csv',
)

In [None]:
# Preview the top of the frame: the first five rows, shown as the cell's output.
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [None]:
# Preview the end of the frame: the last five rows, shown as the cell's output.
df.tail(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8
1469,34,No,Travel_Rarely,628,Research & Development,8,3,Medical,1,2068,...,1,80,0,6,3,4,4,3,1,2


**Data Wrangling**

In [None]:
## Reporting the dimensions of the dataset
# df.shape is a (rows, columns) tuple; both counts are read from it directly.
print("The shape of data frame:", df.shape)
print("Number of Rows in the dataframe:", df.shape[0])
print("Number of Columns in the dataframe:", df.shape[1])

The shape of data frame: (1470, 35)
Number of Rows in the dataframe: 1470
Number of Columns in the dataframe: 35


In [None]:
## Listing every column label, one per line, in frame order
print("Column labels in the dataset in column order:")
for label in df.columns.tolist():
    print(label)

Column labels in the dataset in column order:
Age
Attrition
BusinessTravel
DailyRate
Department
DistanceFromHome
Education
EducationField
EmployeeCount
EmployeeNumber
EnvironmentSatisfaction
Gender
HourlyRate
JobInvolvement
JobLevel
JobRole
JobSatisfaction
MaritalStatus
MonthlyIncome
MonthlyRate
NumCompaniesWorked
Over18
OverTime
PercentSalaryHike
PerformanceRating
RelationshipSatisfaction
StandardHours
StockOptionLevel
TotalWorkingYears
TrainingTimesLastYear
WorkLifeBalance
YearsAtCompany
YearsInCurrentRole
YearsSinceLastPromotion
YearsWithCurrManager


In [None]:
#  Displaying the data type of each column
print("Data types of each column in the dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

Data types of each column in the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-n

The dataset contains 1470 entries (rows) and 35 columns (features).

No missing values are present in the dataset — each column has exactly 1470 non-null entries. This is a strong indicator of data completeness and reduces the need for imputation.

There are two types of columns:

1. int64: Numerical features such as Age, DistanceFromHome, MonthlyIncome,
etc. These can be used directly for modeling after scaling.

2. object: Categorical features such as BusinessTravel, Department, JobRole, Gender, etc. These will need to be encoded (e.g., via one-hot encoding or label encoding) before they can be used in machine learning algorithms.

Specifically, there are 26 numeric (int64) columns and 9 categorical (object) columns.



In [None]:
# Swap the numeric codes of the ordinal features for readable labels.

# Low-to-Very-High scale shared by the three satisfaction ratings and JobInvolvement.
four_point_scale = {1: "Low", 2: "Medium", 3: "High", 4: "Very High"}

# Column name -> {numeric code: label}; columns are processed in the order listed.
ordinal_labels = {
    "Education": {
        1: "Below College",
        2: "College",
        3: "Bachelor",
        4: "Master",
        5: "Doctor",
    },
    "EnvironmentSatisfaction": four_point_scale,
    "JobInvolvement": four_point_scale,
    "JobLevel": {
        1: "Entry Level",
        2: "Junior Level",
        3: "Mid Level",
        4: "Senior Level",
        5: "Executive Level",
    },
    "JobSatisfaction": four_point_scale,
    "PerformanceRating": {
        1: "Low",
        2: "Good",
        3: "Excellent",
        4: "Outstanding",
    },
    "RelationshipSatisfaction": four_point_scale,
    "WorkLifeBalance": {
        1: "Bad",
        2: "Good",
        3: "Better",
        4: "Best",
    },
}

# Values that are not keys of a mapping are left untouched by replace().
for ordinal_column, code_to_label in ordinal_labels.items():
    df[ordinal_column] = df[ordinal_column].replace(code_to_label)


In [None]:
# Displaying a random sample of 5 rows from categorical (object-type) features.
# A fixed random_state makes the sampled rows identical on every run, so the
# displayed output is reproducible after Restart & Run All.
df.select_dtypes(include="object").sample(5, random_state=42)

Unnamed: 0,Attrition,BusinessTravel,Department,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,Over18,OverTime,PerformanceRating,RelationshipSatisfaction,WorkLifeBalance
1149,No,Travel_Rarely,Research & Development,Bachelor,Other,Very High,Male,Medium,Entry Level,Laboratory Technician,Low,Divorced,Y,No,Excellent,Low,Better
714,No,Travel_Rarely,Research & Development,College,Medical,Very High,Male,High,Senior Level,Research Director,Very High,Divorced,Y,No,Outstanding,High,Good
1219,No,Travel_Rarely,Research & Development,Master,Medical,Very High,Female,High,Entry Level,Laboratory Technician,High,Married,Y,No,Excellent,High,Better
1410,No,Travel_Rarely,Sales,College,Marketing,Medium,Female,High,Junior Level,Sales Executive,Medium,Married,Y,No,Excellent,High,Better
268,No,Travel_Rarely,Research & Development,College,Medical,High,Male,High,Senior Level,Healthcare Representative,Very High,Married,Y,Yes,Excellent,Medium,Better


In [None]:
# Count the null entries in each column and print the per-column totals.
missing_per_column = df.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)


Missing Values:
Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole     

As shown in the output, there are no missing values in any of the 35 columns of the dataset; each feature is fully populated with 1,470 non-null entries. This ensures a smooth preprocessing phase without the need for imputation or data cleaning related to null values, which is ideal for machine learning workflows.

In [None]:
# Summary statistics from describe(), printed as plain text.
numeric_summary = df.describe()
print("\nsummary statistics:")
print(numeric_summary)


summary statistics:
               Age    DailyRate  DistanceFromHome  EmployeeCount  \
count  1470.000000  1470.000000       1470.000000         1470.0   
mean     36.923810   802.485714          9.192517            1.0   
std       9.135373   403.509100          8.106864            0.0   
min      18.000000   102.000000          1.000000            1.0   
25%      30.000000   465.000000          2.000000            1.0   
50%      36.000000   802.000000          7.000000            1.0   
75%      43.000000  1157.000000         14.000000            1.0   
max      60.000000  1499.000000         29.000000            1.0   

       EmployeeNumber   HourlyRate  MonthlyIncome   MonthlyRate  \
count     1470.000000  1470.000000    1470.000000   1470.000000   
mean      1024.865306    65.891156    6502.931293  14313.103401   
std        602.024335    20.329428    4707.956783   7117.786044   
min          1.000000    30.000000    1009.000000   2094.000000   
25%        491.250000    48.000

The summary statistics provide a comprehensive overview of the numerical features in the dataset, highlighting key insights into the workforce's structure and variability. Because the eight ordinal features relabelled above are now object-typed, `describe()` covers only the 18 columns that remain numeric, each described by standard metrics such as count, mean, standard deviation, minimum, quartiles, and maximum. Several features, like MonthlyIncome, TotalWorkingYears, and DistanceFromHome, exhibit wide ranges and high standard deviations, indicating significant variability across employees in terms of salary, experience, and commuting distance. Conversely, features such as StandardHours and EmployeeCount hold constant values across all entries, rendering them non-informative and suitable for removal during data preparation. Other features, like YearsAtCompany, YearsInCurrentRole, and YearsSinceLastPromotion, reveal insights into employee tenure and career progression—while some employees have remained with the same manager or company for over a decade, the median years since last promotion is just two, pointing to a fairly active promotion cycle. Additionally, the average employee has worked for about three different companies and received close to three training sessions in the last year, suggesting a workforce with a mix of external experience and ongoing development. These statistical summaries not only enhance our understanding of employee attributes but also guide critical decisions regarding feature scaling, normalization, and selection in preparation for modeling.

In [None]:
# Report the cardinality (count of distinct non-null values) of every column.
print(" Number of Unique Values per Column:\n")

# DataFrame.nunique() computes all per-column counts in one call, in column order.
for column, unique_count in df.nunique().items():
    print(f"{column:<30} →  {unique_count} unique value(s)")


 Number of Unique Values per Column:

Age                            →  43 unique value(s)
Attrition                      →  2 unique value(s)
BusinessTravel                 →  3 unique value(s)
DailyRate                      →  886 unique value(s)
Department                     →  3 unique value(s)
DistanceFromHome               →  29 unique value(s)
Education                      →  5 unique value(s)
EducationField                 →  6 unique value(s)
EmployeeCount                  →  1 unique value(s)
EmployeeNumber                 →  1470 unique value(s)
EnvironmentSatisfaction        →  4 unique value(s)
Gender                         →  2 unique value(s)
HourlyRate                     →  71 unique value(s)
JobInvolvement                 →  4 unique value(s)
JobLevel                       →  5 unique value(s)
JobRole                        →  9 unique value(s)
JobSatisfaction                →  4 unique value(s)
MaritalStatus                  →  3 unique value(s)
MonthlyIncome     

The output from this code provides a clear overview of the cardinality—or the number of unique values—present in each column of the dataset. This information is essential for distinguishing between categorical and continuous variables, as well as for identifying potential features that may not contribute meaningfully to predictive modeling. For instance, features like EmployeeCount, Over18, and StandardHours each contain only one unique value, indicating no variance and therefore no predictive power; these columns are ideal candidates for removal. Conversely, columns such as EmployeeNumber, MonthlyIncome, and MonthlyRate have very high cardinality. EmployeeNumber has a unique value for every row, signifying that it is likely an identifier and not suitable for modeling; MonthlyIncome and MonthlyRate, by contrast, are continuous measures and are retained as features.

In [None]:
# Dropping irrelevant or constant columns
# EmployeeCount, Over18 and StandardHours hold a single value each; EmployeeNumber is
# unique per row (an identifier).
# Reassigning instead of inplace=True, together with errors="ignore", keeps this cell
# safe to re-run: a second execution finds the columns already gone and leaves the
# frame unchanged instead of raising KeyError.
df = df.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'],
    errors="ignore",
)
print("Remaining columns:", df.columns.tolist())

Remaining columns: ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']


In [None]:
# Report the frame's dimensions at this point in the notebook.

n_rows, n_cols = df.shape  # shape is a (rows, columns) tuple
print(f"Shape of the DataFrame           : {df.shape}")
print(f"Total Number of Rows (Observations): {n_rows}")
print(f"Total Number of Columns (Features) : {n_cols}")


Shape of the DataFrame           : (1470, 31)
Total Number of Rows (Observations): 1470
Total Number of Columns (Features) : 31


**Saving Dataframe to CSV File**

In [None]:
# Write the frame to CSV; index=False leaves the row index out of the file.
df.to_csv(
    path_or_buf='HR_Employee_Attrition_Cleaned.csv',
    index=False,
)
print("Dataframe is exported to CSV File successfully")

Dataframe is exported to CSV File successfully
