In [100]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [101]:
# Load hr_data.csv (path is relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv('hr_data.csv')

In [102]:
# Quick look at the raw frame: first rows, schema, summary statistics and
# per-column missing-value counts.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print(data.describe())
print(data.isna().sum())

   Age output     BusinessTravel  DailyRate              Department  \
0   41    Yes      Travel_Rarely       1102                   Sales   
1   49     No  Travel_Frequently        279  Research & Development   
2   37    Yes      Travel_Rarely       1373  Research & Development   
3   33     No  Travel_Frequently       1392  Research & Development   
4   27     No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...                  

In [103]:
# Remove the four listed columns from the frame before modelling,
# then show the resulting frame as the cell output.
data = data.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']
)
data

Unnamed: 0,Age,output,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,...,3,4,0,17,3,2,9,6,0,8


In [104]:
# Columns holding string categories (including the target, 'output') that are
# converted to integer codes below.
cat_columns = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime','output']

# Fit a separate LabelEncoder per column and keep each one, so every integer
# code can later be mapped back to its original label through
# label_encoders[col].classes_ / .inverse_transform(). Refitting one shared
# encoder would keep only the mapping of the last column.
label_encoders = {}
for col in cat_columns:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder
# After the loop `label_encoder` is the encoder fitted on the last column
# ('output'), which matches the state the shared encoder used to end in.

# Display the first few rows of the updated dataset
data.head()

Unnamed: 0,Age,output,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,1102,2,1,2,1,2,0,...,3,1,0,8,0,1,6,4,0,5
1,49,0,1,279,1,8,1,1,3,1,...,4,4,1,10,3,3,10,7,1,7
2,37,1,2,1373,1,2,2,4,4,1,...,3,2,0,7,3,3,0,0,0,0
3,33,0,1,1392,1,3,4,1,4,0,...,3,3,0,8,3,3,8,7,3,0
4,27,0,2,591,1,2,1,3,1,1,...,3,4,1,6,3,3,2,2,2,2


In [105]:
# Separate the target vector from the feature matrix.
# The target is selected by its column name rather than by position, so the
# split stays correct even if the column order of the frame changes.
y = data['output'].values           # NumPy array of encoded target labels
x = data.drop(columns='output')     # DataFrame with every remaining column
print(x)
print(y)

      Age  BusinessTravel  DailyRate  Department  DistanceFromHome  Education  \
0      41               2       1102           2                 1          2   
1      49               1        279           1                 8          1   
2      37               2       1373           1                 2          2   
3      33               1       1392           1                 3          4   
4      27               2        591           1                 2          1   
...   ...             ...        ...         ...               ...        ...   
1465   36               1        884           1                23          2   
1466   39               2        613           1                 6          1   
1467   27               2        155           1                 4          3   
1468   49               1       1023           2                 2          3   
1469   34               2        628           1                 8          3   

      EducationField  Envir

In [106]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
for label, part in (("X_train_shape:", x_train), ("X_test_shape:", x_test)):
    print(label, part.shape)

X_train_shape: (1176, 30)
X_test_shape: (294, 30)


In [107]:
# Standardise the features. The scaler's statistics are learned from the
# training split only and then reused for the test split, so no information
# from the test rows enters the fit.
scaler = StandardScaler()
x_train, x_test = scaler.fit_transform(x_train), scaler.transform(x_test)

In [108]:
# Build a 10-tree random forest (seeded for reproducibility) and fit it on the
# training split; fit() returns the estimator itself, so it can be chained.
model = RandomForestClassifier(n_estimators=10, random_state=42).fit(x_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(x_test)

# Side-by-side table of true and predicted labels, shown as the cell output.
pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})

Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,1,0
3,0,0
4,1,1
...,...,...
289,0,0
290,0,0
291,1,0
292,0,0


In [109]:
# Report test-set quality: confusion matrix, accuracy as a percentage, and the
# per-class precision/recall/F1 report. Each heading and its value are emitted
# on separate lines via sep='\n'.
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')
print('\nAccuracy Score:', accuracy_score(y_test, y_pred)*100, sep='\n')
print('\nClassification Report:', classification_report(y_test, y_pred), sep='\n')


Confusion Matrix:
[[243   2]
 [ 41   8]]

Accuracy Score:
85.37414965986395

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       245
           1       0.80      0.16      0.27        49

    accuracy                           0.85       294
   macro avg       0.83      0.58      0.59       294
weighted avg       0.85      0.85      0.81       294



In [110]:
# Feature importance: pair each importance score from the fitted forest with
# its column name and list the features from most to least important.
importances = model.feature_importances_
# The feature frame is named `x` (lowercase); `X` is never defined in this
# notebook and raises NameError on a fresh kernel. `x` is still a DataFrame
# here (only x_train/x_test were replaced by scaled arrays), and its column
# order matches the order of the columns the model was trained on.
feature_names = x.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)

                     Feature  Importance
15             MonthlyIncome    0.097712
0                        Age    0.073628
16               MonthlyRate    0.054722
18                  OverTime    0.053441
2                  DailyRate    0.053194
9                 HourlyRate    0.052184
23         TotalWorkingYears    0.046799
4           DistanceFromHome    0.046552
19         PercentSalaryHike    0.039694
26            YearsAtCompany    0.033255
13           JobSatisfaction    0.031539
29      YearsWithCurrManager    0.031004
7    EnvironmentSatisfaction    0.030742
12                   JobRole    0.030658
25           WorkLifeBalance    0.027783
10            JobInvolvement    0.027091
28   YearsSinceLastPromotion    0.026765
27        YearsInCurrentRole    0.026721
17        NumCompaniesWorked    0.025072
21  RelationshipSatisfaction    0.024749
6             EducationField    0.024690
22          StockOptionLevel    0.023509
24     TrainingTimesLastYear    0.022500
5               