-
Notifications
You must be signed in to change notification settings - Fork 0
/
HR_Casestudy.R
155 lines (115 loc) · 3.96 KB
/
HR_Casestudy.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#install.packages("rattle", repos="https://rattle.togaware.com", type="source")
library(readr)
library(dplyr)
library(ggplot2)
library(rpart)
# Load the HR case-study data and keep `HR` as the working copy used by the
# rest of the script. as_tibble() replaces the deprecated tbl_df().
HR_Casestudy <- read_csv("~/Documents/HR_Casestudy.csv")
HR <- as_tibble(HR_Casestudy)
glimpse(HR)

# Nominal variables: convert to factors whose levels are the values present
# in the data.
nominal_cols <- c(
  "Attrition", "BusinessTravel", "Department", "EducationField", "Gender",
  "JobRole", "MaritalStatus", "Over18", "OverTime", "StockOptionLevel"
)
HR[nominal_cols] <- lapply(HR[nominal_cols], as.factor)

# Coded variables: attach a readable label to each numeric code. Any code
# outside the listed levels becomes NA.
satisfaction_labels <- c("Low", "Medium", "High", "Very High")
HR$Education <- factor(
  HR$Education,
  levels = 1:5,
  labels = c("Below College", "College", "Bachelor", "Master", "Doctor")
)
HR$EnvironmentSatisfaction <- factor(
  HR$EnvironmentSatisfaction,
  levels = 1:4,
  labels = satisfaction_labels
)
HR$JobInvolvement <- factor(
  HR$JobInvolvement,
  levels = 1:4,
  labels = satisfaction_labels
)
HR$PerformanceRating <- factor(
  HR$PerformanceRating,
  levels = 1:4,
  labels = c("Low", "Good", "Excellent", "Outstanding")
)
HR$RelationshipSatisfaction <- factor(
  HR$RelationshipSatisfaction,
  levels = 1:4,
  labels = satisfaction_labels
)
HR$WorkLifeBalance <- factor(
  HR$WorkLifeBalance,
  levels = 1:4,
  labels = c("Bad", "Good", "Better", "Best")
)

# Inspect the recoded data.
glimpse(HR)
summary(HR)
levels(HR$Education)
summary(HR$Education)

# Drop EmployeeNumber and EmployeeCount so they are not used as predictors
# (presumably an ID and a constant column -- confirm against the data).
HR$EmployeeNumber <- NULL
HR$EmployeeCount <- NULL
# Analytics
glimpse(HR)

# Monthly income against years at the company, coloured by attrition, with a
# linear trend line per attrition group and one panel per gender.
HR %>%
  na.omit() %>%
  select(MonthlyIncome, YearsAtCompany, Attrition, Gender) %>%
  ggplot(aes(x = YearsAtCompany, y = MonthlyIncome, col = Attrition)) +
  # alpha is a fixed setting, so it is passed outside aes(); inside aes() it
  # is treated as a mapped variable and adds a meaningless "0.4" legend.
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_grid(. ~ Gender)
# Conduct analysis and find trends and correlations

# Percentage of employees with Attrition == "Yes" at each job level.
HR %>%
  na.omit() %>%
  group_by(JobLevel = as.factor(JobLevel)) %>%
  summarise(patt = sum(Attrition == "Yes") / n() * 100) %>%
  ggplot(aes(x = JobLevel, y = patt)) +
  geom_bar(stat = "identity", position = "dodge")

# Same percentage, computed per job level within each work-life-balance
# category and drawn as one panel per category.
HR %>%
  na.omit() %>%
  group_by(JobLevel = as.factor(JobLevel), WorkLifeBalance) %>%
  summarise(patt = sum(Attrition == "Yes") / n() * 100) %>%
  ggplot(aes(x = JobLevel, y = patt)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_grid(. ~ WorkLifeBalance)
# Creating Classification Model
set.seed(100)

# Shuffle the rows so the split below is random but reproducible (seed above).
n <- nrow(HR)
shuffled <- HR[sample(n), ]

# 70/30 train/test split. seq_len() and setdiff() keep the two index sets
# disjoint and in range even for tiny inputs, where `a:b` with a > b would
# count backwards.
n_train <- round(0.7 * n)
train_indices <- seq_len(n_train)
test_indices <- setdiff(seq_len(n), train_indices)

# Build the train and test data sets.
train <- shuffled[train_indices, ]
test <- shuffled[test_indices, ]

# Print the structure of train and test.
str(train)
str(test)

# Fit a classification tree on all remaining columns, limited to depth 3,
# then predict class labels for the held-out test rows.
my_mod <- rpart(Attrition ~ ., data = train, method = "class", maxdepth = 3)
pred <- predict(my_mod, test, type = "class")
# Create a confusion matrix: rows are the actual labels of the test rows,
# columns are the predicted labels. The actual labels must come from `test`
# (not the full `HR`) so both vectors describe the same rows.
conf <- table(actual = test$Attrition, predicted = pred)

# Index cells by level name with "Yes" (the employee left) as the positive
# class. Factor levels sort alphabetically, so position 1 would be "No".
positive <- "Yes"
TP <- conf[positive, positive]
FN <- sum(conf[positive, ]) - TP
FP <- sum(conf[, positive]) - TP
TN <- sum(conf) - TP - FN - FP
print(conf)

# Calculate and print the accuracy (percent of test rows classified correctly).
acc <- (TP + TN) / (TP + FN + FP + TN)
acc * 100

# Calculate and print the precision (percent of predicted "Yes" that are
# actually "Yes"); NaN when the model predicts no "Yes" at all.
prec <- TP / (TP + FP)
prec * 100

# Calculate and print the recall (percent of actual "Yes" that the model found).
rec <- TP / (TP + FN)
rec * 100
# Draw the decision tree
# Install rpart.plot only when it is missing, so re-running the script does
# not reinstall the package on every run.
if (!requireNamespace("rpart.plot", quietly = TRUE)) {
  install.packages("rpart.plot")
}
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)

# The second positional argument of rpart.plot() is `type`; name it so the
# call is self-explanatory.
rpart.plot(my_mod, type = 3)

# Summarise a tree fitted on the same training data without the depth limit.
summary(rpart(Attrition ~ ., train, method = "class"))
glimpse(HR)
summary(HR)

# Trim the tree
glimpse(HR)
summary(HR$Attrition)
# Improve Classification Model
# Remove the listed columns from HR, one at a time and in the listed order.
dropped_predictors <- c(
  "EducationField",
  "EnvironmentSatisfaction",
  "Department",
  "BusinessTravel",
  "HourlyRate",
  "Gender",
  "JobInvolvement",
  "MaritalStatus",
  "NumCompaniesWorked",
  "Over18",
  "JobRole"
)
for (predictor in dropped_predictors) {
  HR[[predictor]] <- NULL
}
# What is relevant from the data?
# For Job Level above 1.5, the attrition rate is 100 - 63 = 37%. Hence, to reduce the attrition rate, focus on employees below Job Level 1.5.
# Focus on those who work overtime and are below Job Level 1.5.