### XGBoost - Employee Attrition Dataset

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load dataset
df = pd.read_csv(r"E:\Datasets\employee_attrition.csv")

In [3]:
# rows and cols
df.shape

(1470, 35)

In [4]:
# printing first 5 rows
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [6]:
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [7]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

#### Dropping Unwanted Features

In [8]:
columns_to_drop = ["EmployeeCount", "EmployeeNumber", "Over18", "StandardHours"]
df = df.drop(columns = columns_to_drop)

#### Handling outliers

In [9]:
# 1. Finding Inter Quartile Range
numerical_features = df.select_dtypes(include = [np.number]).columns
Q1 = df[numerical_features].quantile(0.25)
Q3 = df[numerical_features].quantile(0.75)
IQR = Q3 - Q1

In [10]:
# 2. Removing Outliers by Checking values beyond the limit
# Calcuate lower and upper limits for detecting outliers
lower_limit = Q1 - 1.5*IQR
upper_limit = Q3 + 1.5*IQR

# Create a condition to detect outliers
outliers_condition = ((df[numerical_features] < lower_limit) |
                      df[numerical_features] > upper_limit)

# Filter the dataframe to remove rows with outliers
df = df[~outliers_condition.any(axis=1)]

#### Encoding Categorical Features

In [11]:
categorical_features = df.select_dtypes(include=["object"]).columns
label_encoder = LabelEncoder()
for col in categorical_features:
    df[col] = label_encoder.fit_transform(df[col])

#### Splitting features (X) and target (y)

In [12]:
X = df.drop(columns = "Attrition")
y = df['Attrition']

#### Standardize the data

In [13]:
# We standardize the data which is import for XGBoost
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [14]:
# splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2,
                                                   random_state=42)

#### Model Training

In [15]:
model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=4,
                     random_state=42)
model.fit(X_train, y_train)

In [16]:
# Predictions
y_pred = model.predict(X_test)

#### Evaluate the model performance

In [17]:
# 1. Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.8843537414965986


- The model is good, showing decent predictive performance

In [18]:
# 2. Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: ")
conf_matrix

Confusion Matrix: 


array([[250,   5],
       [ 29,  10]], dtype=int64)

- 250 (TN) - Correctly predicted employees who stayed in company as stayed
- 10 (TP) - Correctly predicted employee who left company as left

- 5 (FP) - Wrongly predicted employees who stayed in company as left
- 29 (FN) - Wrongly predicted employees who left company as stayed

In [19]:
# classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report: \n", class_report)

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.98      0.94       255
           1       0.67      0.26      0.37        39

    accuracy                           0.88       294
   macro avg       0.78      0.62      0.65       294
weighted avg       0.87      0.88      0.86       294



0 --> Employee stayed
- Precision - 0.90 : Out of all model predictions that emp stayed, 90% is correct
- Recall - 0.98 : Out of all emp actually stayed, model predicted 87% is correct
- F1 - 0.94 : Balance between Precision and Recall, overall good performance

1 --> Employee Left
- Precision - 0.67 : Out of all model predictions that emp left, 67% are correct
- Recall 0.26 : Out of all emp actually left, model predicted 26% is correct
- F1 - 0.37 : Balance between precision and recall, overall just a decent performance

Conclusion: 
- Model has good Precision and Recall for 0 and decent for 1
- Model improve Precision for left (1), using SMOTE

In [20]:
# Sample Prediction
new_employee = np.random.rand(1, X.shape[1])

# Convert into DataFrame with same column names as X
new_employee_df = pd.DataFrame(new_employee, columns=X.columns)

# Scale it using the trained scaler - to ensure all features have same range
new_employee_scaled = scaler.transform(new_employee_df)
prediction = model.predict(new_employee_scaled)

print("Predictions for New Employee: ", prediction)

Predictions for New Employee:  [1]
