In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [5]:
# Load the dataset
file_path = 'C:/Users/hp/OneDrive/Desktop/VCU/Technocolabs Softwares/Task 1/WA_Fn-UseC_-HR-Employee-Attrition.csv'


In [6]:
df = pd.read_csv(file_path)

In [9]:
# Impute missing values for numerical columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())


In [13]:
# 1. Data Exploration
print(df.info())
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
Index: 699 entries, 2 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       699 non-null    int64 
 1   Attrition                 699 non-null    object
 2   BusinessTravel            699 non-null    object
 3   DailyRate                 699 non-null    int64 
 4   Department                699 non-null    object
 5   DistanceFromHome          699 non-null    int64 
 6   Education                 699 non-null    int64 
 7   EducationField            699 non-null    object
 8   EmployeeCount             699 non-null    int64 
 9   EmployeeNumber            699 non-null    int64 
 10  EnvironmentSatisfaction   699 non-null    int64 
 11  Gender                    699 non-null    object
 12  HourlyRate                699 non-null    int64 
 13  JobInvolvement            699 non-null    int64 
 14  JobLevel                  699 

In [15]:
# Impute missing values for categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

In [16]:
print(df.isnull().sum())
for column in df.select_dtypes(include=['object']).columns:
    print(f"Value counts for {column}:\n{df[column].value_counts()}\n")

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [24]:
# Handling outliers (example using IQR)
for column in numeric_cols:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    df = df[~((df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR)))]

In [26]:
# Ensure consistency in categorical data
for column in categorical_cols:
    df[column] = df[column].astype(str).str.strip().str.lower()


In [27]:
#Data encoding
# Identify binary columns
binary_columns = [col for col in df.columns if df[col].nunique() == 2]
print("Binary columns:", binary_columns)

Binary columns: ['Attrition', 'Gender', 'OverTime']


In [28]:
le = LabelEncoder()
for column in binary_columns:
    df[column] = le.fit_transform(df[column])


In [29]:
# One-Hot Encoding for multi-class categorical features
multi_class_columns = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus']
# Ensure these columns are present in the DataFrame before applying One-Hot Encoding
multi_class_columns = [col for col in multi_class_columns if col in df.columns]
df = pd.get_dummies(df, columns=multi_class_columns, drop_first=True)

In [30]:
# 4. Data Labeling
# Ensure target variable is labeled if not already handled in binary columns
if 'Attrition' not in binary_columns:
    df['Attrition'] = le.fit_transform(df['Attrition'])

In [31]:
# Save the cleaned and encoded dataset
df.to_csv('cleaned_employee_data.csv', index=False)

In [32]:
print("Data preprocessing and preparation completed successfully.")

Data preprocessing and preparation completed successfully.
