In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the employee-attrition records; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="Employee Attrition.csv")

In [3]:
# Preview the first five rows to eyeball the columns and their raw values.
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
# Count the missing entries in every column and print the per-column totals.
null_counts = df.isna().sum()
print(null_counts)

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [5]:
# Find the columns that hold a single distinct value across all rows.
# one_uniq maps each such column name to a one-element list with that value.
distinct_by_column = {name: df[name].unique() for name in df.columns}
one_uniq = {
    name: distinct.tolist()
    for name, distinct in distinct_by_column.items()
    if len(distinct) == 1
}

print('The name of column(s) with single value: %s' % one_uniq)

The name of column(s) with single value: {'EmployeeCount': [1], 'Over18': ['Y'], 'StandardHours': [80]}


In [6]:
# Remove, in place, every single-valued column collected in one_uniq.
df.drop(columns=list(one_uniq), inplace=True)

In [7]:
# Also remove EmployeeNumber in place (presumably a per-row identifier with no
# analytical value -- confirm against the dataset description).
df.drop(columns=['EmployeeNumber'], inplace=True)

In [8]:
# Lookup tables that translate the integer codes of the ordinal columns into
# readable labels. Each table keeps its own name because later cells refer to
# them individually.
education_map = {
    1: 'Below College',
    2: 'College',
    3: 'Bachelor',
    4: 'Master',
    5: 'Phd',
}
envir_sat_map = {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
job_inv_map = {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
job_sat_map = {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
perf_rate_map = {1: 'Low', 2: 'Good', 3: 'Excellent', 4: 'Outstanding'}
relation_sat_map = {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
work_life_map = {1: 'Bad', 2: 'Good', 3: 'Better', 4: 'Best'}

# Column name -> lookup table, listed in the order the columns are converted.
label_tables = {
    'Education': education_map,
    'EnvironmentSatisfaction': envir_sat_map,
    'JobInvolvement': job_inv_map,
    'JobSatisfaction': job_sat_map,
    'PerformanceRating': perf_rate_map,
    'RelationshipSatisfaction': relation_sat_map,
    'WorkLifeBalance': work_life_map,
}

# The table is indexed directly (not via .get), so a code missing from a table
# raises KeyError. NOTE: that also happens if this cell is executed a second
# time, because the columns then already contain labels instead of codes.
for column_name, table in label_tables.items():
    df[column_name] = df[column_name].apply(lambda code: table[code])

display(df.head(5))

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,College,Life Sciences,Medium,Female,...,Excellent,Low,0,8,0,Bad,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,Below College,Life Sciences,High,Male,...,Outstanding,Very High,1,10,3,Better,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,College,Other,Very High,Male,...,Excellent,Medium,0,7,3,Better,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,Master,Life Sciences,Very High,Female,...,Excellent,High,0,8,3,Better,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,Below College,Medical,Low,Male,...,Excellent,Very High,1,6,3,Better,2,2,2,2


In [9]:
# Function to augment data by resampling rows and re-drawing categorical labels
def augment_data(df, num_desired_records, categorical_choices=None, seed=None):
    """Create a synthetic DataFrame by resampling rows of ``df`` with replacement.

    Every output row is a copy of a randomly chosen input row in which the
    columns named in ``categorical_choices`` are overwritten with a label
    drawn uniformly at random from the corresponding list. The new label is
    drawn independently of the row's original value and of every other column.

    The work is done with one vectorised draw of row positions and one
    vectorised draw per perturbed column, instead of copying one row Series
    per output record inside a Python loop.

    Parameters
    ----------
    df : pandas.DataFrame
        Source records. It is not modified.
    num_desired_records : int
        Number of rows to generate. Values <= 0 yield an empty frame that
        keeps the columns of ``df``.
    categorical_choices : dict, optional
        Maps a column name to the sequence of labels to draw from. When
        omitted, the seven label tables defined earlier in this notebook
        (``education_map``, ``envir_sat_map``, ``job_inv_map``,
        ``job_sat_map``, ``perf_rate_map``, ``relation_sat_map`` and
        ``work_life_map``) are used, so they must already be defined.
    seed : int, optional
        Seed for a private ``numpy.random.RandomState``. When omitted, NumPy's
        global random state is used, so ``np.random.seed`` still controls the
        result.

    Returns
    -------
    pandas.DataFrame
        ``num_desired_records`` rows with the columns of ``df`` (plus any
        column of ``categorical_choices`` that ``df`` lacks). The index holds
        the labels of the sampled source rows, so it may contain duplicates.

    Raises
    ------
    ValueError
        If ``df`` has no rows while ``num_desired_records`` is positive.
    """
    if categorical_choices is None:
        # Resolved lazily so the function can be defined before the tables.
        categorical_choices = {
            'Education': list(education_map.values()),
            'EnvironmentSatisfaction': list(envir_sat_map.values()),
            'JobInvolvement': list(job_inv_map.values()),
            'JobSatisfaction': list(job_sat_map.values()),
            'PerformanceRating': list(perf_rate_map.values()),
            'RelationshipSatisfaction': list(relation_sat_map.values()),
            'WorkLifeBalance': list(work_life_map.values()),
        }

    if num_desired_records <= 0:
        return df.iloc[0:0].copy()

    num_existing_records = len(df)
    if num_existing_records == 0:
        raise ValueError("Cannot augment an empty DataFrame: there are no rows to sample from")

    # np.random (module) and RandomState expose the same randint/choice API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Randomly select existing records (with replacement) in a single draw.
    row_positions = rng.randint(num_existing_records, size=num_desired_records)
    # .copy() detaches the result so the column assignments below cannot
    # trigger chained-assignment warnings or touch the caller's frame.
    augmented_df = df.iloc[row_positions].copy()

    # Perturb the categorical columns; an object array keeps plain Python
    # strings in the column instead of a fixed-width NumPy string dtype.
    for column, labels in categorical_choices.items():
        label_pool = np.asarray(list(labels), dtype=object)
        augmented_df[column] = rng.choice(label_pool, size=num_desired_records)

    return augmented_df
 
# Seed NumPy's global generator so the sampled rows and labels are the same on
# every run. NOTE: output recorded before this seed was added came from an
# unseeded run; re-execute the cell to refresh it.
np.random.seed(42)

# Example usage to generate 300,000 records
augmented_df = augment_data(df, num_desired_records=300000)

# Display the first few rows of the augmented DataFrame
print(augmented_df.head())

      Age Attrition BusinessTravel  DailyRate              Department  \
996    27        No  Travel_Rarely        205                   Sales   
1150   35        No  Travel_Rarely        819  Research & Development   
859    29        No  Travel_Rarely        942  Research & Development   
40     35        No  Travel_Rarely        464  Research & Development   
302    28        No  Travel_Rarely       1476  Research & Development   

      DistanceFromHome      Education EducationField EnvironmentSatisfaction  \
996                 10  Below College      Marketing                    High   
1150                18         Master  Life Sciences                    High   
859                 15         Master  Life Sciences                    High   
40                   4  Below College          Other               Very High   
302                 16       Bachelor        Medical               Very High   

      Gender  ...  PerformanceRating RelationshipSatisfaction  \
996   Female  .

In [10]:
# Shape check: the row count should equal the num_desired_records passed to
# augment_data above.
augmented_df.shape

(300000, 31)

In [11]:
# Write the augmented dataset to disk. index=False leaves the sampled row
# labels out of the file. The path is defined once so the confirmation message
# below always names the file that was actually written.
output_path = 'HR Employee Attrition2.csv'
augmented_df.to_csv(output_path, index=False)

print(f"Augmented dataset saved successfully at: {output_path}")