In [14]:
import pandas as pd
import numpy as np

# Load the raw dataset from the local directory.
# Standard professional practice: keep raw data immutable (this cell only reads it).
RAW_DATA_PATH = '../data/raw/WA_Fn-UseC_-HR-Employee-Attrition.csv'

try:
    df = pd.read_csv(RAW_DATA_PATH)
except FileNotFoundError:
    print("❌ Error: The file was not found. Please check the 'data/raw' folder.")
    # Re-raise so execution stops here: without `df` every later cell would
    # fail with a confusing NameError instead of pointing at the missing file.
    raise
else:
    # Only report success and preview the frame once the load really worked.
    print("✅ Success: Dataset loaded successfully.")
    display(df.head())

✅ Success: Dataset loaded successfully.


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [15]:
# Initial data exploration to understand structure and quality
print("--- Dataset Information ---")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line to the output).
df.info()

# Checking for missing values (critical for data integrity):
# per-column count of null entries.
print("\n--- Missing Values Check ---")
print(df.isnull().sum())

# Checking for duplicate entries: number of rows that exactly repeat an earlier row.
print(f"\nTotal Duplicate Rows: {df.duplicated().sum()}")

--- Dataset Information ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 

In [16]:
# Columns removed because they add no analytical signal.
# Rationale for dropping (as recorded by the original analysis):
# - Over18: all employees are 'Y', no variance.
# - EmployeeCount: all values are 1, provides no statistical value.
# - StandardHours: constant value (80), doesn't differentiate workload.
# - EmployeeNumber: unique identifier (not a constant), not useful for
#   trend correlation.
cols_to_drop = ['Over18', 'EmployeeCount', 'StandardHours', 'EmployeeNumber']

# Drop only the columns that are still present so that re-running this cell
# does not raise KeyError, and so the count reported below is accurate.
dropped_cols = [col for col in cols_to_drop if col in df.columns]
df = df.drop(columns=dropped_cols)

print(f"✅ Success: Dropped {len(dropped_cols)} redundant columns.")
print(f"Remaining Columns: {df.shape[1]}")

✅ Success: Dropped 4 redundant columns.
Remaining Columns: 31


In [17]:
# Mapping categorical 'Yes/No' values to binary integers (1/0)
# This enables easier calculation of Attrition Rates in Power BI/Models
binary_map = {'Yes': 1, 'No': 0}

for col in ('Attrition', 'OverTime'):
    present = df[col].dropna()

    # Series.map() turns every value missing from the dict into NaN, so
    # mapping an already-encoded column would wipe it out. Skip columns that
    # already hold only 1/0 to keep this cell safe to re-run.
    if present.isin(list(binary_map.values())).all():
        continue

    # Fail loudly on unexpected labels rather than silently producing NaN.
    unexpected = present[~present.isin(list(binary_map))]
    if not unexpected.empty:
        raise ValueError(
            f"Column '{col}' contains values outside {list(binary_map)}: "
            f"{unexpected.unique().tolist()}"
        )

    df[col] = df[col].map(binary_map)

print("✅ Success: 'Attrition' and 'OverTime' successfully mapped to 1/0.")

✅ Success: 'Attrition' and 'OverTime' successfully mapped to 1/0.


In [18]:
# Binning Age into logical life stages for better HR insights.
# The labels describe right-inclusive ranges (18-25, 26-35, ...), so the bins
# must be right-closed: [18, 25], (25, 35], (35, 45], (45, 55], (55, 100].
# With right=False an employee aged exactly 25/35/45/55 would be placed in the
# next bucket, whose label does not contain that age.
# include_lowest=True keeps age 18 inside the first bucket.
age_bins = [18, 25, 35, 45, 55, 100]
age_labels = ['18-25', '26-35', '36-45', '46-55', '55+']
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

# Binning Distance from Home into categorized travel segments
# Helps identify if long commutes contribute to higher attrition
# Left-closed bins: Near = [0, 5), Far = [5, 15), Very Far = [15, 100).
dist_bins = [0, 5, 15, 100]
dist_labels = ['Near', 'Far', 'Very Far']
df['DistanceGroup'] = pd.cut(df['DistanceFromHome'], bins=dist_bins, labels=dist_labels, right=False)

print("✅ Success: 'AgeGroup' and 'DistanceGroup' created.")
display(df[['Age', 'AgeGroup', 'DistanceFromHome', 'DistanceGroup']].head())

✅ Success: 'AgeGroup' and 'DistanceGroup' created.


Unnamed: 0,Age,AgeGroup,DistanceFromHome,DistanceGroup
0,41,36-45,1,Near
1,49,46-55,8,Far
2,37,36-45,2,Near
3,33,26-35,3,Near
4,27,26-35,2,Near


In [19]:
# NOTE(review): this cell repeats the preceding binning cell verbatim;
# consider deleting one of the two copies.

# Binning Age into logical life stages for better HR insights.
# The labels name right-inclusive ranges, so each bin is closed on the right:
# [18, 25], (25, 35], (35, 45], (45, 55], (55, 100]. Using right=False would
# push ages 25/35/45/55 into the following bucket, contradicting its label.
# include_lowest=True makes the first bin include age 18 itself.
age_bins = [18, 25, 35, 45, 55, 100]
age_labels = ['18-25', '26-35', '36-45', '46-55', '55+']
df['AgeGroup'] = pd.cut(
    df['Age'],
    bins=age_bins,
    labels=age_labels,
    right=True,
    include_lowest=True,
)

# Binning Distance from Home into categorized travel segments
# Helps identify if long commutes contribute to higher attrition
# Bins are closed on the left: Near = [0, 5), Far = [5, 15), Very Far = [15, 100).
dist_bins = [0, 5, 15, 100]
dist_labels = ['Near', 'Far', 'Very Far']
df['DistanceGroup'] = pd.cut(df['DistanceFromHome'], bins=dist_bins, labels=dist_labels, right=False)

print("✅ Success: 'AgeGroup' and 'DistanceGroup' created.")
display(df[['Age', 'AgeGroup', 'DistanceFromHome', 'DistanceGroup']].head())

✅ Success: 'AgeGroup' and 'DistanceGroup' created.


Unnamed: 0,Age,AgeGroup,DistanceFromHome,DistanceGroup
0,41,36-45,1,Near
1,49,46-55,8,Far
2,37,36-45,2,Near
3,33,26-35,3,Near
4,27,26-35,2,Near
