In [51]:
import pandas as pd

In [37]:
# Read the HR employee-attrition CSV into the working DataFrame
filepath = 'filepath/WA_Fn-UseC_-HR-Employee-Attrition.csv'
att_df = pd.read_csv(filepath_or_buffer=filepath)

In [38]:
# Overview of the raw data: row count, column names, non-null counts and dtypes
att_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [43]:
# Check for missing values
# The result is bound to a name and printed: a bare expression is only rendered
# when it is the last statement of a cell, so a check placed mid-cell would be
# evaluated and silently discarded.
columns_with_missing = att_df.columns[att_df.isnull().any()]
print(f"Columns with missing values: {list(columns_with_missing)}")

# Dropping Duplicates (rows that are identical across every column)
att_df = att_df.drop_duplicates()

In [44]:
# Remove the columns judged not relevant to the analysis
att_df.drop(labels=['EmployeeCount', 'Over18', 'StandardHours'], axis='columns', inplace=True)

In [28]:
# Function to encode non-numeric columns (for modelling)
def encode_column(df, column_name, ordered_values):
    """Add an ordinal-encoded copy of a categorical column to ``df``.

    Each entry of ``ordered_values`` is mapped to its position in that
    sequence (first entry -> 0, second -> 1, ...). The codes are stored in a
    new column named ``f"{column_name}_encoded"``; the original column is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column_name``. It is modified in place.
    column_name : str
        Name of the column to encode.
    ordered_values : sequence
        Category values, in the order that defines their integer codes.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the encoded column added.

    Warns
    -----
    UserWarning
        If the column holds non-null values that are absent from
        ``ordered_values``. Those rows are still encoded as NaN, but the
        problem is no longer silent.
    """
    # Local import so the function stays self-contained within this cell
    import warnings

    # Create a mapping dictionary based on provided order
    value_map = {value: idx for idx, value in enumerate(ordered_values)}

    source = df[column_name]
    encoded = source.map(value_map)

    # A NaN in `encoded` where the source was not missing means the category
    # was not listed in `ordered_values` (e.g. a typo or an unexpected level).
    unmapped = source[encoded.isna() & source.notna()].unique()
    if len(unmapped) > 0:
        warnings.warn(
            f"Column '{column_name}' contains values missing from "
            f"ordered_values; they are encoded as NaN: {list(unmapped)}",
            UserWarning,
            stacklevel=2,
        )

    # Create new column for the encoded values
    df[f"{column_name}_encoded"] = encoded

    return df

# Ordered category levels for each column to encode; a level's position in its
# list becomes its integer code (BusinessTravel, Attrition, OverTime, Gender)
BusinessTravel_values = ['Non-Travel', 'Travel_Rarely', 'Travel_Frequently']
Attrition_values = ['No', 'Yes']
OverTime_values = ['No', 'Yes']
Gender_values = ['Male', 'Female']

# Encode every column with its ordering; each call adds a "<column>_encoded" column
for _column, _levels in (
    ('BusinessTravel', BusinessTravel_values),
    ('Attrition', Attrition_values),
    ('OverTime', OverTime_values),
    ('Gender', Gender_values),
):
    encode_column(att_df, _column, _levels)

# Show the frame with the newly added encoded columns
att_df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,BusinessTravel_encoded,Attrition_encoded,OverTime_encoded,Gender_encoded
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,...,0,1,6,4,0,5,1,1,1,1
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,2,3,...,3,3,10,7,1,7,2,0,0,0
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,4,...,3,3,0,0,0,0,1,1,1,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,5,4,...,3,3,8,7,3,0,2,0,1,1
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,7,1,...,3,3,2,2,2,2,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,2061,3,...,3,3,5,2,0,3,2,0,0,0
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,2062,4,...,5,3,7,7,1,7,1,0,0,0
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2064,2,...,0,3,6,2,0,3,1,0,1,0
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,2065,4,...,3,2,9,6,0,8,2,0,0,0


In [35]:
# Creating Dummy Variables (Department, EducationField, MaritalStatus, JobRole)
# A single get_dummies call encodes all four nominal columns: each source
# column is replaced by indicator columns prefixed with its own name, appended
# in the order listed here. The result is built directly as a DataFrame, with
# no placeholder assignment beforehand.
att_df_with_dummies = pd.get_dummies(
    att_df,
    columns=['Department', 'EducationField', 'MaritalStatus', 'JobRole'],
)

In [36]:
# Checking results: column list, non-null counts and dtypes after dummy encoding
att_df_with_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 53 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Age                                1470 non-null   int64 
 1   Attrition                          1470 non-null   object
 2   BusinessTravel                     1470 non-null   object
 3   DailyRate                          1470 non-null   int64 
 4   DistanceFromHome                   1470 non-null   int64 
 5   Education                          1470 non-null   int64 
 6   EmployeeNumber                     1470 non-null   int64 
 7   EnvironmentSatisfaction            1470 non-null   int64 
 8   Gender                             1470 non-null   object
 9   HourlyRate                         1470 non-null   int64 
 10  JobInvolvement                     1470 non-null   int64 
 11  JobLevel                           1470 non-null   int64 
 12  JobSat

In [55]:
# Write the cleaned, encoded frame to CSV; index=False keeps the row index out of the file
export_filepath = 'filepath/Cleaned_HR-Employee-Attrition.csv'
att_df_with_dummies.to_csv(path_or_buf=export_filepath, index=False)