# Implementing a Supervised Learning Model for Prediction

In [167]:
#import the libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt


# 1. Data Loading and Exploration

In [168]:
#load the data
file_path = (r"/Users/chino/Downloads/employee_data.csv")
# store the data
df = pd.read_csv(file_path)
#print the first 7 rows of data
df.head(7)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
5,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,...,3,80,0,8,2,2,7,7,3,6
6,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,...,1,80,3,12,3,2,1,0,0,0


In [169]:
# Check the shape of the data
df.shape

(1470, 35)

## 2. Data Processing

In [170]:
#get a count of empty values for each column
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [171]:
# check for any missing or null values
df.isnull().values.any()

False

In [172]:
#remove irrelevant colums
df = df.drop("Over18", axis = 1)
df = df.drop("EmployeeNumber", axis = 1)
df = df.drop("StandardHours", axis = 1)
df = df.drop("EmployeeCount", axis = 1)

In [173]:
# Perform categorical encoding for the categorical features 
from sklearn.preprocessing import LabelEncoder

for column in df.columns:
    if not np.issubdtype(df[column].dtype, np.number):
        df[column] = LabelEncoder().fit_transform(df[column])
df.head(7)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,1102,2,1,2,1,2,0,...,3,1,0,8,0,1,6,4,0,5
1,49,0,1,279,1,8,1,1,3,1,...,4,4,1,10,3,3,10,7,1,7
2,37,1,2,1373,1,2,2,4,4,1,...,3,2,0,7,3,3,0,0,0,0
3,33,0,1,1392,1,3,4,1,4,0,...,3,3,0,8,3,3,8,7,3,0
4,27,0,2,591,1,2,1,3,1,1,...,3,4,1,6,3,3,2,2,2,2
5,32,0,1,1005,1,2,2,1,4,1,...,3,3,0,8,2,2,7,7,3,6
6,59,0,2,1324,1,3,3,3,3,0,...,4,1,3,12,3,2,1,0,0,0


## Feature Engineering

In [174]:
#create a new column
df["Age_Years"] = df["Age"]


In [175]:
#drop the age column
df = df.drop("Age", axis=1)

In [176]:
#show data frame
df

Unnamed: 0,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Age_Years
0,1,2,1102,2,1,2,1,2,0,94,...,1,0,8,0,1,6,4,0,5,41
1,0,1,279,1,8,1,1,3,1,61,...,4,1,10,3,3,10,7,1,7,49
2,1,2,1373,1,2,2,4,4,1,92,...,2,0,7,3,3,0,0,0,0,37
3,0,1,1392,1,3,4,1,4,0,56,...,3,0,8,3,3,8,7,3,0,33
4,0,2,591,1,2,1,3,1,1,40,...,4,1,6,3,3,2,2,2,2,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,0,1,884,1,23,2,3,3,1,41,...,3,1,17,3,3,5,2,0,3,36
1466,0,2,613,1,6,1,3,4,1,42,...,1,1,9,5,3,7,7,1,7,39
1467,0,2,155,1,4,3,1,2,1,87,...,2,1,6,0,3,6,2,0,3,27
1468,0,1,1023,2,2,3,3,4,1,63,...,4,0,17,3,2,9,6,0,8,49


# Model Training and Evaluation

In [177]:
#Identify targe and feature variables
X = df.iloc[:, 1:df.shape[1]].values
Y = df.iloc[:, 0].values

In [178]:
#split the data into 80% training and 20% testing
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 0)

In [179]:
# use the Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0)
forest.fit(X_train, Y_train)

In [184]:
# Hyperparameter tuning 
from sklearn.model_selection import GridSearchCV

parameters = {'n_estimators': [10, 50, 100], 'criterion': ['gini', 'entropy']}
grid_search = GridSearchCV(estimator=forest, param_grid=parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, Y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print(f"#Best Accuracy: {best_accuracy}")
print(f"Best Parameters: {best_parameters}")

Best Accuracy: 0.8613847818247387
Best Parameters: {'criterion': 'entropy', 'n_estimators': 50}


In [185]:
#get the accuracy on the training data set

forest.score(X_train, Y_train)

0.983843537414966

In [186]:
#show the confusion matrix and accuracy score for the model on the test data
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, forest.predict(X_test))

TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]

print(cm)
print("Model Testing Accuracy = {}".format((TP + TN) / (TP+TN+FN+FP)))


[[241   4]
 [ 43   6]]
Model Testing Accuracy = 0.8401360544217688


In [200]:
from sklearn.model_selection import train_test_split

# Extract feature importance
feature_importances = forest.feature_importances_
features = X.columns 
# Create a DataFrame for feature importances
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})

# Sort the dataframe by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
14,MonthlyIncome,0.086987
17,OverTime,0.061704
29,Age_Years,0.059497
1,DailyRate,0.052885
15,MonthlyRate,0.05193
8,HourlyRate,0.049775
22,TotalWorkingYears,0.048284
3,DistanceFromHome,0.042205
25,YearsAtCompany,0.041534
16,NumCompaniesWorked,0.036779


In [192]:
# Additional evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
y_pred = forest.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.8401360544217688
Precision: 0.6
Recall: 0.12244897959183673
F1 Score: 0.20338983050847456
