Data Understanding and Exploration

1. Load and explore the dataset, checking for data completeness and structure.

In [1]:
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df=pd.read_csv("../data/raw/employee_data.csv")

In [None]:
df.columns


In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()


In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.dtypes

In [32]:
a = df['Attrition'].value_counts() *100 / len(df)


In [None]:
a

2. Perform univariate and bivariate analysis.

In [None]:
# Bar plot for categorical variables
categorical_vars = ['Department', 'JobRole', 'MaritalStatus', 'BusinessTravel']
for var in categorical_vars:
    plt.figure(figsize=(10, 5))
    sns.countplot(data=df, x=var, hue='Attrition', palette='Set2')
    plt.title(f'{var} vs Attrition')
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# Distribution plots for numerical features
numerical_vars = ['Age', 'MonthlyIncome', 'DistanceFromHome', 'TotalWorkingYears']

for var in numerical_vars:
    plt.figure(figsize=(8, 4))
    sns.histplot(df[var], kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {var}')
    plt.xlabel(var)
    plt.ylabel('Frequency')
    plt.show()


In [None]:
# Boxplots to analyze patterns
for var in numerical_vars:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=df, x='Attrition', y=var, palette='Set3')
    plt.title(f'{var} vs Attrition')
    plt.show()


In [None]:
# Stacked bar chart for categorical variables
for var in categorical_vars:
    attrition_counts = df.groupby([var, 'Attrition']).size().unstack()
    attrition_counts.plot(kind='bar', stacked=True, figsize=(10, 5), colormap='viridis')
    plt.title(f'Stacked Bar Chart for {var} and Attrition')
    plt.ylabel('Count')
    plt.show()


In [3]:
# Check for constant columns
constant_columns = [col for col in df.columns if df[col].nunique() == 1]
print("Constant Columns:", constant_columns)

# Identify columns missing from numerical or heatmap analysis
numerical_columns = df.select_dtypes(include=['number']).columns
missing_columns = [col for col in df.columns if col not in numerical_columns]
print("Missing Columns from Analysis:", missing_columns)


Constant Columns: ['EmployeeCount', 'Over18', 'StandardHours']
Missing Columns from Analysis: ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']


In [7]:
df['Attrition']

0       Yes
1        No
2       Yes
3        No
4        No
       ... 
1465     No
1466     No
1467     No
1468     No
1469     No
Name: Attrition, Length: 1470, dtype: object

In [4]:
# Drop constant columns
fdf = df.drop(columns=constant_columns)

In [5]:
binary_columns = ['Attrition', 'OverTime', 'Gender' ]
for col in binary_columns:
    fdf[col] = fdf[col].map({'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0})  # Adjust mappings as per column values


multi_class_columns = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus']

# for col in multi_class_columns:
#     fdf[col] = fdf[col].map({'Travel_Rarely':0, 'Travel_Frequently':1, 'Non-Travel':2, 'Sales':0, 'Research & Development':1, 'Human Resources':2,'Life Sciences':0, 'Other':1, 'Medical':2, 'Marketing':3, 'Technical Degree':4, 'Human Resources':2,})
# Apply One-Hot Encoding
# fdf = pd.get_dummies(fdf, columns=multi_class_columns, drop_first=True)
# encoded_columns = ['BusinessTravel_Travel_Frequently',
#        'BusinessTravel_Travel_Rarely', 'Department_Research & Development',
#        'Department_Sales', 'EducationField_Life Sciences',
#        'EducationField_Marketing', 'EducationField_Medical',
#        'EducationField_Other', 'EducationField_Technical Degree',
#        'JobRole_Human Resources', 'JobRole_Laboratory Technician',
#        'JobRole_Manager', 'JobRole_Manufacturing Director',
#        'JobRole_Research Director', 'JobRole_Research Scientist',
#        'JobRole_Sales Executive', 'JobRole_Sales Representative',
#        'MaritalStatus_Married', 'MaritalStatus_Single']
# for col in encoded_columns:
#     fdf[col] = fdf[col].map({'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0, 'True': 1, 'False': 0})  # Adjust mappings as per column values


In [9]:
# Recompute numerical columns
numerical_columns = fdf.select_dtypes(include=['number']).columns


In [10]:
fdf['Attrition']

0       1
1       0
2       1
3       0
4       0
       ..
1465    0
1466    0
1467    0
1468    0
1469    0
Name: Attrition, Length: 1470, dtype: int64

In [None]:
# Divide numerical columns into two groups
half = len(numerical_columns) // 2
group1 = numerical_columns[:half]
group2 = numerical_columns[half:]

# Heatmap for Group 1
plt.figure(figsize=(10, 8))
sns.heatmap(fdf[group1].corr(), annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Correlation Heatmap - Group 1')
plt.show()

# Heatmap for Group 2
plt.figure(figsize=(10, 8))
sns.heatmap(fdf[group2].corr(), annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Correlation Heatmap - Group 2')
plt.show()


Data Cleaning and Preprocessing:

Handle Missing Value:

1. Numerical Features
(Use mean/median imputation for missing numerical features):


In [11]:
# Fill missing values for numerical columns with mean or median
numerical_columns = fdf.select_dtypes(include=['number']).columns
fdf[numerical_columns] = fdf[numerical_columns].fillna(fdf[numerical_columns].median())

2. Categorical Features
(Use mode (most frequent category) for missing categorical features):

In [12]:
# Fill missing values for categorical columns with mode
categorical_columns = fdf.select_dtypes(include=['object']).columns
for col in categorical_columns:
    fdf[col] = fdf[col].fillna(fdf[col].mode()[0])


In [13]:
fdf['Attrition']

0       1
1       0
2       1
3       0
4       0
       ..
1465    0
1466    0
1467    0
1468    0
1469    0
Name: Attrition, Length: 1470, dtype: int64

Encode Categorical Variables

1. Binary Categorical Columns
            (Use Label Encoding for binary columns):

In [46]:
binary_columns = ['Attrition', 'OverTime', 'Gender']  # Adjust this list as needed
for col in binary_columns:
    fdf[col] = fdf[col].map({'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0})


2. Multi-class Categorical Columns
(Use One-Hot Encoding for multi-class columns):


In [47]:
multi_class_columns = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus']
fdf = pd.get_dummies(fdf, columns=multi_class_columns, drop_first=True)


Feature Scaling:

1. Apply Min-Max Scaling or Standard Scaling
(Use Min-Max Scaling for numerical features to normalize data between 0 and 1):



In [14]:
fdf.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Scale numerical columns
scaler = MinMaxScaler()
numerical_columns = fdf.select_dtypes(include=['number']).columns
fdf[numerical_columns] = scaler.fit_transform(fdf[numerical_columns])


In [16]:
fdf['Attrition']

0       1.0
1       0.0
2       1.0
3       0.0
4       0.0
       ... 
1465    0.0
1466    0.0
1467    0.0
1468    0.0
1469    0.0
Name: Attrition, Length: 1470, dtype: float64

Address Class Imbalance in the Target Variable
1. Check Class Distribution

In [None]:
# Check class balance
print(fdf['Attrition'].value_counts(normalize=True))


2. Use SMOTE for Oversampling

In [17]:
fdf = pd.get_dummies(fdf, drop_first=True)

In [18]:
fdf['Attrition']

0       1.0
1       0.0
2       1.0
3       0.0
4       0.0
       ... 
1465    0.0
1466    0.0
1467    0.0
1468    0.0
1469    0.0
Name: Attrition, Length: 1470, dtype: float64

In [20]:
from collections import Counter
print(Counter(y))



Counter({0.0: 1233, 1.0: 237})


In [19]:
from imblearn.over_sampling import SMOTE

# Separate features and target
X = fdf.drop(columns=['Attrition'])  # Features
y = fdf['Attrition']  # Target

# Apply SMOTE
smote = SMOTE(random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check new class distribution
from collections import Counter
print("Resampled class distribution:", Counter(y_resampled))


Resampled class distribution: Counter({1.0: 1233, 0.0: 1233})


In [23]:
fdf.to_csv('C:/Users\muzam\OneDrive\Desktop\Attrition-Analysis\data\interim\cleaned_data.csv', index=False)

  fdf.to_csv('C:/Users\muzam\OneDrive\Desktop\Attrition-Analysis\data\interim\cleaned_data.csv', index=False)


In [24]:
a = pd.read_csv('C:/Users\muzam\OneDrive\Desktop\Attrition-Analysis\data\interim\cleaned_data.csv')

  a = pd.read_csv('C:/Users\muzam\OneDrive\Desktop\Attrition-Analysis\data\interim\cleaned_data.csv')


In [25]:
a

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,...,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single
0,0.547619,1.0,0.715820,0.000000,0.25,0.000000,0.333333,0.0,0.914286,0.666667,...,False,False,False,False,False,False,True,False,False,True
1,0.738095,0.0,0.126700,0.250000,0.00,0.000484,0.666667,1.0,0.442857,0.333333,...,False,False,False,False,False,True,False,False,True,False
2,0.452381,1.0,0.909807,0.035714,0.25,0.001451,1.000000,1.0,0.885714,0.333333,...,False,True,False,False,False,False,False,False,False,True
3,0.357143,0.0,0.923407,0.071429,0.75,0.001935,1.000000,0.0,0.371429,0.666667,...,False,False,False,False,False,True,False,False,True,False
4,0.214286,0.0,0.350036,0.035714,0.00,0.002903,0.000000,1.0,0.142857,0.666667,...,False,True,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,0.428571,0.0,0.559771,0.785714,0.25,0.996613,0.666667,1.0,0.157143,1.000000,...,False,True,False,False,False,False,False,False,True,False
1466,0.500000,0.0,0.365784,0.178571,0.00,0.997097,1.000000,1.0,0.171429,0.333333,...,False,False,False,False,False,False,False,False,True,False
1467,0.214286,0.0,0.037938,0.107143,0.50,0.998065,0.333333,1.0,0.814286,1.000000,...,False,False,False,True,False,False,False,False,True,False
1468,0.738095,0.0,0.659270,0.035714,0.50,0.998549,1.000000,1.0,0.471429,0.333333,...,False,False,False,False,False,False,True,False,True,False


In [26]:
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8
