In [1]:
import pandas as pd

In [2]:
# Read the raw HR export into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='HR Data.csv')

In [3]:
# Preview the first rows of the frame before any cleaning is applied.
# A single print with a newline separator emits the same text as two prints.
print("Before Data Cleansing:", data.head(), sep="\n")

Before Data Cleansing:
   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  Sto

In [4]:
# Columns removed from the working frame before further cleaning.
columns_to_remove = ['EmployeeCount', 'StandardHours']
# `labels=` with `axis='columns'` is the long-hand spelling of `columns=`;
# the drop still happens in place on `data`.
data.drop(labels=columns_to_remove, axis='columns', inplace=True)


In [5]:
# Preview the frame after the column drop (first five rows).
print(data.head(n=5))

   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeNumber  \
0                 1          2  Life Sciences               1   
1                 8          1  Life Sciences               2   
2                 2          2          Other               4   
3                 3          4  Life Sciences               5   
4                 2          1        Medical               7   

   EnvironmentSatisfaction  ... PerformanceRating  RelationshipSatisfaction  \
0                        2  ...                 3                         1   
1     

In [6]:
# Mapping from the original CSV headers to human-readable column labels.
# Headers that are absent from the frame are simply ignored by `rename`.
columns_to_rename = {
    'Age': 'Employee Age',
    'Attrition': 'Employee Attrition',
    'BusinessTravel': 'Travel Frequency',
    'DailyRate': 'Daily Pay Rate',
    'Department': 'Work Department',
    'DistanceFromHome': 'Home Distance',
    'Education': 'Education Level',
    'EducationField': 'Field of Education',
    'EmployeeNumber': 'Employee ID',
    'EnvironmentSatisfaction': 'Job Environment Satisfaction',
    'Gender': 'Gender',  # identity entry: the label is unchanged
    'HourlyRate': 'Hourly Pay Rate',
    'JobInvolvement': 'Job Involvement Level',
    'JobLevel': 'Job Level',
    'JobRole': 'Role in Company',
    'JobSatisfaction': 'Job Satisfaction Level',
    'MaritalStatus': 'Marital Status',
    'MonthlyIncome': 'Monthly Income',
    'MonthlyRate': 'Monthly Pay Rate',
    'NumCompaniesWorked': 'Number of Companies Worked',
    'Over18': 'Over 18',
    'OverTime': 'Overtime',
    'PercentSalaryHike': 'Salary Hike Percent',
    'PerformanceRating': 'Performance Rating',
    'RelationshipSatisfaction': 'Relationship Satisfaction Level',
    'StockOptionLevel': 'Stock Option Level',
    'TotalWorkingYears': 'Total Working Years',
    'TrainingTimesLastYear': 'Training Times Last Year',
    'WorkLifeBalance': 'Work-Life Balance Level',
    'YearsAtCompany': 'Years at Company',
    'YearsInCurrentRole': 'Years in Current Role',
    'YearsSinceLastPromotion': 'Years Since Last Promotion',
    'YearsWithCurrManager': 'Years with Current Manager',
}

# Apply the mapping to the column axis, modifying `data` in place.
data.rename(mapper=columns_to_rename, axis='columns', inplace=True)

# Remove fully duplicated rows, keeping the first occurrence of each.
data.drop_duplicates(keep='first', inplace=True)


In [7]:
# Preview the frame with its renamed columns (first five rows).
print(data.head(n=5))

   Employee Age Employee Attrition   Travel Frequency  Daily Pay Rate  \
0            41                Yes      Travel_Rarely            1102   
1            49                 No  Travel_Frequently             279   
2            37                Yes      Travel_Rarely            1373   
3            33                 No  Travel_Frequently            1392   
4            27                 No      Travel_Rarely             591   

          Work Department  Home Distance  Education Level Field of Education  \
0                   Sales              1                2      Life Sciences   
1  Research & Development              8                1      Life Sciences   
2  Research & Development              2                2              Other   
3  Research & Development              3                4      Life Sciences   
4  Research & Development              2                1            Medical   

   Employee ID  Job Environment Satisfaction  ... Performance Rating  \
0       

In [8]:
# Expand single-letter gender codes to full words; values other than
# 'M' or 'F' pass through `replace` unchanged.
data['Gender'] = data['Gender'].replace(
    to_replace={'M': 'Male', 'F': 'Female'},
)

In [9]:
# Text columns whose values get surrounding whitespace trimmed and are then
# title-cased.
columns_to_standardize = [
    'Travel Frequency',
    'Work Department',
    'Field of Education',
    'Role in Company',
    'Marital Status',
]
for column in columns_to_standardize:
    data[column] = (
        data[column]
        .str.strip()
        .str.title()
    )

In [12]:
# Discard, in place, every row that has a missing value in any column.
data.dropna(axis='index', inplace=True)
# Spell out the 'Y'/'N' flags; any other value is left as it is.
data['Over 18'] = data['Over 18'].replace(to_replace={'Y': 'Yes', 'N': 'No'})

In [13]:
# Preview the first rows of the frame after the cleaning steps so far.
# A single print with a newline separator emits the same text as two prints.
print("After Data Cleansing:", data.head(), sep="\n")


After Data Cleansing:
   Employee Age Employee Attrition   Travel Frequency  Daily Pay Rate  \
0            41                Yes      Travel_Rarely            1102   
1            49                 No  Travel_Frequently             279   
2            37                Yes      Travel_Rarely            1373   
3            33                 No  Travel_Frequently            1392   
4            27                 No      Travel_Rarely             591   

          Work Department  Home Distance  Education Level Field of Education  \
0                   Sales              1                2      Life Sciences   
1  Research & Development              8                1      Life Sciences   
2  Research & Development              2                2              Other   
3  Research & Development              3                4      Life Sciences   
4  Research & Development              2                1            Medical   

   Employee ID  Job Environment Satisfaction  ... Performa

In [14]:
# Coerce 'Job Level' to a numeric dtype. Values that cannot be parsed become
# NaN (errors='coerce') and are then replaced with 0. The membership guard
# keeps the cell runnable when the column is absent.
if 'Job Level' in data.columns:
    # Assign the filled result back to the column. Calling
    # `fillna(..., inplace=True)` on `data['Job Level']` is chained assignment:
    # it operates on an intermediate Series, raises a FutureWarning on
    # pandas 2.x, and does not update `data` under Copy-on-Write.
    data['Job Level'] = pd.to_numeric(data['Job Level'], errors='coerce').fillna(0)

# Drop any rows that still contain a NaN in any column.
data.dropna(inplace=True)

In [15]:
# Coerce 'Salary Hike Percent' to a numeric dtype. Values that cannot be
# parsed become NaN (errors='coerce') and are then replaced with 0.
# NOTE: no range validation is performed here; only the dtype is normalised.
if 'Salary Hike Percent' in data.columns:
    # Assign the filled result back to the column. Calling
    # `fillna(..., inplace=True)` on `data['Salary Hike Percent']` is chained
    # assignment: it operates on an intermediate Series, raises a
    # FutureWarning on pandas 2.x, and does not update `data` under
    # Copy-on-Write.
    data['Salary Hike Percent'] = pd.to_numeric(
        data['Salary Hike Percent'], errors='coerce'
    ).fillna(0)


In [16]:
# Show the head of the cleaned frame.
# A single print with a newline separator emits the same text as two prints.
print("After Data Cleansing:", data.head(), sep="\n")

# Persist the cleaned frame as CSV, leaving the row index out of the file.
data.to_csv(path_or_buf='hr_data_cleansed.csv', index=False)

After Data Cleansing:
   Employee Age Employee Attrition   Travel Frequency  Daily Pay Rate  \
0            41                Yes      Travel_Rarely            1102   
1            49                 No  Travel_Frequently             279   
2            37                Yes      Travel_Rarely            1373   
3            33                 No  Travel_Frequently            1392   
4            27                 No      Travel_Rarely             591   

          Work Department  Home Distance  Education Level Field of Education  \
0                   Sales              1                2      Life Sciences   
1  Research & Development              8                1      Life Sciences   
2  Research & Development              2                2              Other   
3  Research & Development              3                4      Life Sciences   
4  Research & Development              2                1            Medical   

   Employee ID  Job Environment Satisfaction  ... Performa