# Data preparation

## Data import

In [None]:
# Load the raw employee-attrition table and preview its first rows.
df <- read.csv(file = "./data/Employee-Attrition.csv")
head(df, n = 6L)

In [None]:
# Keep an untouched copy of the raw import.
original_df <- df

# Load the data-manipulation / plotting stack used by the rest of the notebook.
library(dplyr)
library(tidyverse)
# Compact overview of the data set: one line per column with type and values.
glimpse(df)

In [None]:
# TRUE if any cell of the data frame is missing.
anyNA(df)

## Prepare data for visualization

In [None]:
# Build a Chinese-language copy of the data for plotting.
# Every categorical value is replaced by its Chinese label; values that are
# not listed (and NA) are left unchanged.
df_zh <- df %>%
  mutate(
    Gender = dplyr::recode(Gender, "Male" = "男", "Female" = "女"),
    Attrition = dplyr::recode(Attrition, "Yes" = "已流失", "No" = "未流失"),
    Over18 = dplyr::recode(Over18, "Y" = "已成年"),
    OverTime = dplyr::recode(OverTime, "Yes" = "有加班", "No" = "无加班"),
    Department = dplyr::recode(
      Department,
      "Research & Development" = "研发部门",
      "Sales" = "销售部门",
      "Human Resources" = "人力部门"
    ),
    JobRole = dplyr::recode(
      JobRole,
      "Sales Executive" = "销售主管",
      "Research Scientist" = "研究科学家",
      "Laboratory Technician" = "实验室技术员",
      "Manufacturing Director" = "制造总监",
      "Healthcare Representative" = "医疗代表",
      "Manager" = "经理",
      "Sales Representative" = "销售代表",
      "Research Director" = "研究总监",
      "Human Resources" = "人力资源"
    ),
    BusinessTravel = dplyr::recode(
      BusinessTravel,
      "Travel_Rarely" = "很少出差",
      "Travel_Frequently" = "频繁出差",
      "Non-Travel" = "从不出差"
    ),
    EducationField = dplyr::recode(
      EducationField,
      "Life Sciences" = "生命科学",
      "Human Resources" = "人力资源",
      "Technical Degree" = "技术学位",
      "Marketing" = "市场营销",
      "Medical" = "医疗",
      "Other" = "其他"
    ),
    MaritalStatus = dplyr::recode(
      MaritalStatus,
      "Single" = "单身",
      "Married" = "已婚",
      "Divorced" = "离异"
    )
  )
glimpse(df_zh)

# Rename the columns. The new names are applied by position, so this vector
# must list one name per column, in the order the columns appear in `df`.
col_names <- c(
  "年龄", "流失情况", "出差情况", "日薪", "所属部门", "公里离家距离",
  "教育水平", "教育领域", "员工数量（恒1）", "员工数量", "环境满意度",
  "性别", "小时薪", "工作投入度", "岗位级别", "岗位名称", "工作满意度",
  "婚姻状态", "月收入", "月费率", "在多少家公司工作过", "是否成年",
  "是否加班", "薪资增长百分比", "绩效评级", "人际关系满意度",
  "标准工作小时数", "股票期权级别", "总工作年数", "去年接受培训的次数",
  "工作与生活的平衡情况", "在当前公司的工作年数", "在当前职位的工作年数",
  "自上次升职以来的年数", "与当前经理合作的年数"
)
# rename() expects new = old pairs: the names are the Chinese labels and the
# values are the current column names.
df_zh <- df_zh %>%
  rename(!!!setNames(names(df_zh), col_names))
glimpse(df_zh)

In [None]:
# Keep a copy of the translated data frame before later cells add derived
# columns to `df_zh` or re-encode its existing ones.
original_df_zh <- df_zh

## Prepare data for machine learning

### Convert categorical type data to integer encoding

In [None]:
# Integer code book for every categorical column used for model training.
# Each entry maps the original label to its integer code. The codes are the
# same as before; note that Attrition is coded Yes = 0 / No = 1.
label_codes <- list(
  Gender = c("Female" = 0L, "Male" = 1L),
  Attrition = c("Yes" = 0L, "No" = 1L),
  Over18 = c("Y" = 1L),
  OverTime = c("No" = 0L, "Yes" = 1L),
  Department = c(
    "Research & Development" = 0L,
    "Sales" = 1L,
    "Human Resources" = 2L
  ),
  JobRole = c(
    "Sales Executive" = 0L,
    "Research Scientist" = 1L,
    "Laboratory Technician" = 2L,
    "Manufacturing Director" = 3L,
    "Healthcare Representative" = 4L,
    "Manager" = 5L,
    "Sales Representative" = 6L,
    "Research Director" = 7L,
    "Human Resources" = 8L
  ),
  BusinessTravel = c(
    "Non-Travel" = 0L,
    "Travel_Rarely" = 1L,
    "Travel_Frequently" = 2L
  ),
  EducationField = c(
    "Life Sciences" = 0L,
    "Human Resources" = 1L,
    "Technical Degree" = 2L,
    "Marketing" = 3L,
    "Medical" = 4L,
    "Other" = 5L
  ),
  MaritalStatus = c("Single" = 0L, "Married" = 1L, "Divorced" = 2L)
)
stopifnot(all(names(label_codes) %in% names(df)))

# Look every value up in its code book. This yields integer columns directly,
# without building a mixed character vector and coercing it afterwards.
# Labels that are missing from the code book (and NA) become NA.
df_train <- df
df_train[names(label_codes)] <- lapply(names(label_codes), function(column) {
  unname(label_codes[[column]][as.character(df[[column]])])
})

df_train %>% glimpse()

### Delete constant columns

In [None]:
# Drop the three columns that this section treats as constant.
df_train <- df_train %>%
  select(-Over18, -StandardHours, -EmployeeCount)

# Static

In [None]:
# Shared plot settings used by the charts below.
# NOTE(review): the colour identifiers do not describe the hex values they
# hold (e.g. `red` is a green-ish tone, `green` a peach tone). They are kept
# as-is because later cells reference them by name.
red <- "#a6dcc3"
darkred <- "#df0017"
green <- "#ffdac2"
yellow <- "#fcffc0"
brown <- "#fed097"
# Axis titles: "number of employees" and "share of employees".
st_y <- "员工数量"
st_y_per <- "员工数量占比"
# Font family passed to text elements and labels.
font <- "serif"

# Analyze basic factors
1. Demographic characteristics(sex, age, marital status, educational level)
2. Department and job role

## Demographic characteristics
Stacked Bar Charts and Percent Stacked Bar Charts

### Sex -> Attrition

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# Stacked bar chart: head-count per gender, split by attrition status.
p1_1 <- df_zh %>%
  # One row per (gender, attrition) pair with its head-count `n`.
  count(性别, 流失情况) %>%
  ggplot(aes(x = 性别, y = n, fill = 流失情况)) +
  geom_col(color = "black", size = .2, width = .6, position = "stack") +
  # Stack the labels exactly like the bars so each count sits in the middle of
  # its own segment (previously they were drawn at the absolute height `n`).
  geom_label(
    aes(label = n),
    color = "black", family = font,
    position = position_stack(vjust = 0.5)
  ) +
  # Axis title
  labs(y = st_y) +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p1_1

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# 100% stacked bar chart: attrition share within each gender.
p1_2 <- df_zh %>%
  select(性别, 流失情况) %>%
  group_by(性别, 流失情况) %>%
  summarise(n = n()) %>%
  # summarise() drops the last grouping level, so the shares below are
  # computed within each gender.
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(x = fct_reorder(性别, pct), y = pct, fill = 流失情况, color = 流失情况)) +
  geom_col(color = "black", size = .2, width = .6, position = "fill") +
  geom_label(
    aes(label = paste0(pct, "%"), fill = 流失情况),
    color = "black", family = font,
    position = position_fill(vjust = 0.5)
  ) +
  # Axis titles
  labs(y = st_y_per, x = "性别") +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p1_2

### Age -> Attrition
Divide age into 4 generations

In [None]:
# Bucket employees into generations by age.
# Intervals are closed on the left: [min, 37), [37, 54), [54, 73), [73, max].
# The previous nested ifelse() tested `< 37` and then `>= 38`, so employees
# aged exactly 37 matched neither branch and ended up in "沉默一代".
df_zh$代际 <- as.character(cut(
  df_zh$年龄,
  breaks = c(-Inf, 37, 54, 73, Inf),
  labels = c("千禧一代", "X世代", "婴儿潮一代", "沉默一代"),
  right = FALSE
))
df_zh$代际

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# Stacked bar chart: head-count per generation, split by attrition status.
p2_1 <- df_zh %>%
  # One row per (generation, attrition) pair with its head-count `n`.
  count(代际, 流失情况) %>%
  # Fix the display order of the generations on the x axis.
  mutate(代际 = factor(代际, levels = c("千禧一代", "X世代", "婴儿潮一代", "沉默一代"))) %>%
  ggplot(aes(x = 代际, y = n, fill = 流失情况)) +
  geom_col(color = "black", size = .2, width = .6, position = "stack") +
  # Stack the labels exactly like the bars so each count sits in the middle of
  # its own segment (previously they were drawn at the absolute height `n`).
  geom_label(
    aes(label = n),
    color = "black", family = font,
    position = position_stack(vjust = 0.5)
  ) +
  # Axis title
  labs(y = st_y) +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p2_1

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# 100% stacked bar chart: attrition share within each generation.
p2_2 <- df_zh %>%
  select(代际, 流失情况) %>%
  group_by(代际, 流失情况) %>%
  summarise(n = n()) %>%
  # Shares are computed within each generation (the remaining group).
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  # NOTE(review): the level order set here is overridden by fct_reorder()
  # in the x aesthetic below - confirm which ordering is intended.
  mutate(代际 = factor(代际, levels = c("千禧一代", "X世代", "婴儿潮一代", "沉默一代"))) %>%
  ggplot(aes(x = fct_reorder(代际, pct), y = pct, fill = 流失情况, color = 流失情况)) +
  geom_col(color = "black", size = .2, width = 0.6, position = "fill") +
  geom_label(
    aes(label = paste0(pct, "%"), fill = 流失情况),
    color = "black", family = font,
    position = position_fill(vjust = 0.5)
  ) +
  # Axis titles
  labs(y = st_y_per, x = "代际") +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p2_2

### Marital status -> Attrition

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# Stacked bar chart: head-count per marital status, split by attrition status.
p3_1 <- df_zh %>%
  # One row per (marital status, attrition) pair with its head-count `n`.
  count(婚姻状态, 流失情况) %>%
  # Fix the display order of the categories on the x axis.
  mutate(婚姻状态 = factor(婚姻状态, levels = c("已婚", "单身", "离异"))) %>%
  ggplot(aes(x = 婚姻状态, y = n, fill = 流失情况)) +
  geom_col(color = "black", size = .2, width = .6, position = "stack") +
  # Stack the labels exactly like the bars so each count sits in the middle of
  # its own segment (previously they were drawn at the absolute height `n`).
  geom_label(
    aes(label = n),
    color = "black", family = font,
    position = position_stack(vjust = 0.5)
  ) +
  # Axis title
  labs(y = st_y) +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p3_1

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# 100% stacked bar chart: attrition share within each marital status.
p3_2 <- df_zh %>%
  select(婚姻状态, 流失情况) %>%
  group_by(婚姻状态, 流失情况) %>%
  summarise(n = n()) %>%
  # Shares are computed within each marital status (the remaining group).
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  # NOTE(review): the level order set here is overridden by fct_reorder()
  # in the x aesthetic below - confirm which ordering is intended.
  mutate(婚姻状态 = factor(婚姻状态, levels = c("已婚", "单身", "离异"))) %>%
  ggplot(aes(x = fct_reorder(婚姻状态, pct), y = pct, fill = 流失情况, color = 流失情况)) +
  geom_col(color = "black", size = .2, width = 0.6, position = "fill") +
  geom_label(
    aes(label = paste0(pct, "%"), fill = 流失情况),
    color = "black", family = font,
    position = position_fill(vjust = 0.5)
  ) +
  # Axis titles
  labs(y = st_y_per, x = "婚姻状态") +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p3_2

### Educational level -> Attrition

In [None]:
# Text label for the numeric education code. Codes 1-4 get their own label;
# every other non-missing code falls through to "博士学位".
df_zh$教育水平_文字 <- with(
  df_zh,
  ifelse(教育水平 == 1, "本科以下学历",
    ifelse(教育水平 == 2, "本科学历",
      ifelse(教育水平 == 3, "学士学位",
        ifelse(教育水平 == 4, "硕士学位", "博士学位")
      )
    )
  )
)
unique(df_zh$教育水平_文字)

options(repr.plot.width = 6, repr.plot.height = 4)


In [None]:
# Stacked bar chart: head-count per education level, split by attrition status.
p4_1 <- df_zh %>%
  # One row per (education level, attrition) pair with its head-count `n`.
  count(教育水平_文字, 流失情况) %>%
  # Fix the display order of the education levels on the x axis.
  mutate(教育水平_文字 = factor(教育水平_文字, levels = c("学士学位", "硕士学位", "本科学历", "本科以下学历", "博士学位"))) %>%
  ggplot(aes(x = 教育水平_文字, y = n, fill = 流失情况)) +
  geom_col(color = "black", size = .2, width = .6, position = "stack") +
  # Stack the labels exactly like the bars so each count sits in the middle of
  # its own segment (previously they were drawn at the absolute height `n`).
  geom_label(
    aes(label = n),
    color = "black", family = font,
    position = position_stack(vjust = 0.5)
  ) +
  # Axis titles
  labs(y = st_y, x = "教育水平") +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p4_1

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# 100% stacked bar chart: attrition share within each education level.
p4_2 <- df_zh %>%
  select(教育水平_文字, 流失情况) %>%
  group_by(教育水平_文字, 流失情况) %>%
  summarise(n = n()) %>%
  # Shares are computed within each education level (the remaining group).
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  # NOTE(review): the level order set here is overridden by fct_reorder()
  # in the x aesthetic below - confirm which ordering is intended.
  mutate(教育水平_文字 = factor(教育水平_文字, levels = c("学士学位", "硕士学位", "本科学历", "本科以下学历", "博士学位"))) %>%
  ggplot(aes(x = fct_reorder(教育水平_文字, pct), y = pct, fill = 流失情况, color = 流失情况)) +
  geom_col(color = "black", size = .2, width = 0.6, position = "fill") +
  geom_label(
    aes(label = paste0(pct, "%"), fill = 流失情况),
    color = "black", family = font,
    position = position_fill(vjust = 0.5)
  ) +
  # Axis titles
  labs(y = st_y_per, x = "教育水平") +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p4_2

In [None]:
options(repr.plot.width = 8, repr.plot.height = 20)

# Write the eight demographic charts into one PNG, laid out as 4 rows x 2
# columns (count chart on the left, percentage chart on the right).
library(Cairo)
CairoPNG(file = "01-人口特征.png", width = 12, height = 13, units = "in", res = 300)

library(cowplot)
plot_grid(
  plotlist = list(p1_1, p1_2, p2_1, p2_2, p3_1, p3_2, p4_1, p4_2),
  ncol = 2, nrow = 4
)

# Close the device so the file is flushed to disk.
dev.off()

## Department and job role
Pyramid charts

### Department -> Attrition

In [None]:
options(repr.plot.width = 10, repr.plot.height = 4)

# Attrition share (in %) within each department.
attr.dep <- df_zh %>%
  select(所属部门, 流失情况) %>%
  group_by(所属部门, 流失情况) %>%
  summarize(amount = n()) %>%
  # Still grouped by department, so the shares add up to 100% per department.
  mutate(pct = round(prop.table(amount), 2) * 100) %>%
  arrange(pct)

# Colour ramps for the "stayed" (green) and "left" (red) sides.
nofunc <- colorRampPalette(c("#A9F5A9", "#58FA58", "#01DF01"))
yesfunc <- colorRampPalette(c("#F5A9A9", "#FE2E2E", "#B40404"))

# Both sides are sorted by department so that row i of each frame refers to
# the same department.
yes.attr <- attr.dep %>%
  filter(流失情况 == "已流失") %>%
  arrange(所属部门)

no.attr <- attr.dep %>%
  filter(流失情况 == "未流失") %>%
  arrange(所属部门)

# The pyramid pairs the two sides by position; refuse to plot if they do not
# describe the same departments in the same order.
stopifnot(identical(
  as.character(yes.attr$所属部门),
  as.character(no.attr$所属部门)
))

library(Cairo)
CairoPNG(file = "04-部门.png", width = 8, height = 4, units = "in", res = 300)

library(plotrix)
# Font family and line width for the base-graphics plot.
par(family = font, lwd = 0.5)
# Labels are taken from the same sorted frame as the bar values. Previously
# they came from `attr.dep`, which is sorted by `pct`, so the labels could be
# attached to the wrong bars.
par(mar = pyramid.plot(no.attr$pct, yes.attr$pct,
                       labels = no.attr$所属部门,
                       top.labels = c("未流失", "", "已流失"),
                       gap = 30, show.values = TRUE,
                       rxcol = yesfunc(9), lxcol = nofunc(9)))
dev.off()

### Job role -> Attrition

In [None]:
options(repr.plot.width = 10, repr.plot.height = 6)

# Attrition share (in %) within each job role.
attr.job <- df_zh %>%
  select(岗位名称, 流失情况) %>%
  group_by(岗位名称, 流失情况) %>%
  summarize(amount = n()) %>%
  # Still grouped by job role, so the shares add up to 100% per role.
  mutate(pct = round(prop.table(amount), 2) * 100) %>%
  arrange(pct)

# Colour ramps for the "stayed" (green) and "left" (red) sides.
nofunc <- colorRampPalette(c("#A9F5A9", "#58FA58", "#01DF01"))
yesfunc <- colorRampPalette(c("#F5A9A9", "#FE2E2E", "#B40404"))

# Both sides are sorted by job role so that row i of each frame refers to the
# same role.
yes.attr <- attr.job %>%
  filter(流失情况 == "已流失") %>%
  arrange(岗位名称)

no.attr <- attr.job %>%
  filter(流失情况 == "未流失") %>%
  arrange(岗位名称)

# The pyramid pairs the two sides by position; refuse to plot if they do not
# describe the same job roles in the same order.
stopifnot(identical(
  as.character(yes.attr$岗位名称),
  as.character(no.attr$岗位名称)
))

library(Cairo)
CairoPNG(file = "06-岗位.png", width = 8, height = 4, units = "in", res = 300)

library(plotrix)
# Font family and line width for the base-graphics plot.
par(family = font, lwd = 0.5)
# Labels are taken from the same sorted frame as the bar values. Previously
# they came from `attr.job`, which is sorted by `pct`, so the labels could be
# attached to the wrong bars.
par(mar = pyramid.plot(no.attr$pct, yes.attr$pct,
                       labels = no.attr$岗位名称,
                       top.labels = c("未流失", "", "已流失"),
                       gap = 30, show.values = TRUE,
                       rxcol = yesfunc(9), lxcol = nofunc(9)))
dev.off()

#  Ranking of feature importance
1. Decision Tree
2. H2O AutoML

## Decision Tree

In [None]:
# Fix the RNG so the shuffle and the partition below are reproducible.
set.seed(142)
# Shuffle the rows before splitting.
df_zh <- df_zh[sample.int(nrow(df_zh)), ]

# Encode business travel as a factor whose levels are labelled 1, 2, 3
# (frequent, rare, none - in that order).
df_zh$出差情况 <- factor(
  df_zh$出差情况,
  levels = c("频繁出差", "很少出差", "从不出差"),
  labels = c(1, 2, 3)
)

# Integer-coded rating / level columns that should be treated as factors.
cols <- c(
  "教育水平", "环境满意度", "工作投入度", "岗位级别",
  "工作满意度", "绩效评级", "人际关系满意度",
  "股票期权级别", "去年接受培训的次数", "工作与生活的平衡情况"
)
df_zh[cols] <- lapply(df_zh[cols], function(column) factor(column))

library(caret)
# 80 / 20 split, partitioned on the attrition label.
trainIndex <- createDataPartition(df_zh$流失情况, p = 0.8, list = FALSE, times = 1)

train <- df_zh[trainIndex, ]
test <- df_zh[-trainIndex, ]

In [None]:
# Compare the label proportions of the training and the test set; the two
# tables should show (almost) the same shares.
prop_train <- train %>%
  group_by(流失情况) %>%
  summarise(n = n()) %>%
  mutate(pct = round(n / sum(n), 2))

prop_test <- test %>%
  group_by(流失情况) %>%
  summarise(n = n()) %>%
  mutate(pct = round(n / sum(n), 2))

prop_train
prop_test

In [None]:
options(repr.plot.width = 15, repr.plot.height = 10)

library(rpart)

# Fit a decision tree that predicts attrition from all other columns.
tree <- rpart(流失情况 ~ ., data = train)

library(visNetwork)

visTree(tree, height = "800px",
        colorY = c("green", "red", "blue"))

# Variable importance of the fitted tree, one row per feature.
# rpart reports importance on an arbitrary scale; it is rescaled here so the
# values sum to 100, which makes the "%" suffix on the labels below correct.
raw_importance <- tree$variable.importance
var_imp <- data.frame(
  features = names(raw_importance),
  importance = round(100 * raw_importance / sum(raw_importance), 2),
  row.names = names(raw_importance)
)

library(RColorBrewer)

colorCount <- length(unique(var_imp$features))
feature_importance <- var_imp %>%
  ggplot(aes(x = reorder(features, importance), y = importance, fill = features)) +
  geom_bar(stat = "identity", color = "black", size = 0.2) +
  coord_flip() +
  # "Pastel1" offers at most 9 colours; request all 9 and interpolate to one
  # colour per feature (asking for 24 only triggered a warning).
  scale_fill_manual(values = colorRampPalette(brewer.pal(9, "Pastel1"))(colorCount)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Hide the legend
    legend.position = "none",
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size = 12),
    axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  ) +
  geom_label(aes(label = paste0(importance, "%")), color = "black", family = font) +
  labs(x = "特征", y = "重要性")

library(Cairo)
CairoPNG(file = "19-特征重要性.png", width = 10, height = 8, units = "in", res = 300)
feature_importance
dev.off()

In [None]:
options(repr.plot.width = 8, repr.plot.height = 6)

# Predict the class of every test row and tabulate actual vs. predicted.
predictions <- predict(tree, test, type = "class")
# Long format: Var1 = actual label, predictions = predicted label, Freq = count.
conf_df <- data.frame(table(test$流失情况, predictions))

library(Cairo)
CairoPNG(file = "20-混淆矩阵.png", width = 4, height = 4, units = "in", res = 300)

# Confusion matrix as a heat map with the count printed in each tile.
ggplot(conf_df, aes(x = predictions, y = Var1)) +
  geom_tile(aes(fill = Freq)) +
  geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1, family = "serif", size = 6) +
  scale_fill_gradient(low = "#fcf4ed", high = "#f47b2a") +
  labs(y = "流失情况", x = "预测", fill = "频率") +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = "serif", size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )

dev.off()

In [None]:
# Prune the tree at complexity parameter 0.02 and display the smaller tree.
prune.tree <- prune(tree, cp = 0.02)

visTree(
  prune.tree,
  height = "600px",
  colorY = c("green", "red", "blue")
)

library(partykit)

# Convert the tree to a party object and print its rules.
# NOTE(review): this converts the unpruned `tree`, not `prune.tree` - confirm
# that this is intended.
rparty.tree <- as.party(tree)
rparty.tree

## H2O AutoML

In [None]:
library(h2o)

# Start (or connect to) the local H2O cluster.
h2o.init()

# Convert the integer-encoded training data to an H2O frame.
h2o_df <- as.h2o(df_train)

# Split into train / validation / test. Two ratios produce three frames:
# 70%, 15% and the remaining 15%.
split_df <- h2o.splitFrame(h2o_df, c(0.7, 0.15), seed = 12)
h2o_train <- h2o.assign(split_df[[1]], "train")
h2o_validation <- h2o.assign(split_df[[2]], "validation")
# The test set is the third frame. Previously it reused the second frame, so
# "test" was an exact copy of "validation".
h2o_test <- h2o.assign(split_df[[3]], "test")

h2o.describe(h2o_train)

In [None]:
# Target column and feature columns (every other column of the frame).
y <- "Attrition"
x <- setdiff(names(h2o_train), y)

# NOTE(review): `Attrition` is an integer column in `df_train`; confirm that
# AutoML treats it as intended (it may need as.factor() for classification).
auto_ml <- h2o.automl(
  x = x,                               # feature columns
  y = y,                               # target column
  training_frame = h2o_train,          # data used for training
  leaderboard_frame = h2o_validation,  # data used to rank the models
  project_name = "Attribution",
  max_models = 10,
  seed = 12
)

# Leaderboard of the trained models, best first.
top_models <- slot(auto_ml, "leaderboard")
print(top_models)

In [None]:
# Goal: inspect feature importance, starting with the stacked ensemble.
# Model ids from the leaderboard, as a plain vector.
model_id <- as.data.frame(top_models$model_id)[[1]]
# First "best of family" stacked ensemble on the leaderboard.
best_family <- grep("StackedEnsemble_BestOfFamily", model_id, value = TRUE)[1] %>%
  h2o.getModel()
# The ensemble's metalearner, i.e. the model that combines the base learners.
obtain_model <- h2o.getModel(best_family@model$metalearner$name)

In [None]:
# How important is each base model to the stacked ensemble: variable
# importance of the metalearner fetched into `obtain_model`.
h2o.varimp(obtain_model)

In [None]:
options(repr.plot.width = 8, repr.plot.height = 4)

# Save the metalearner's variable-importance plot as a PNG.
library(Cairo)
CairoPNG(
  file = "21-变量重要性：GLM.png",
  width = 8, height = 4, units = "in", res = 300
)

par(family = font)
h2o.varimp_plot(obtain_model)

# Close the device so the file is flushed to disk.
dev.off()

In [None]:
# First DeepLearning model on the leaderboard. The variable is called `xgb`
# for historical reasons; it does not hold an XGBoost model.
xgb <- grep("DeepLearning", model_id, value = TRUE)[1] %>%
  h2o.getModel()

# Unlike the stacked ensemble, a single model reports importance per feature.
h2o.varimp(xgb)

# Save the feature-importance plot of this model as a PNG.
library(Cairo)
CairoPNG(
  file = "22-变量重要性：DeepLearning.png",
  width = 8, height = 4, units = "in", res = 300
)

par(family = font)
h2o.varimp_plot(xgb)

dev.off()

In [None]:
# First GBM model on the leaderboard. The variable is called `xgb` for
# historical reasons; it does not hold an XGBoost model.
xgb <- grep("GBM", model_id, value = TRUE)[1] %>%
  h2o.getModel()

# Unlike the stacked ensemble, a single model reports importance per feature.
h2o.varimp(xgb)

# Save the feature-importance plot of this model as a PNG.
library(Cairo)
CairoPNG(
  file = "23-变量重要性：GBM.png",
  width = 8, height = 4, units = "in", res = 300
)

par(family = font)
h2o.varimp_plot(xgb)

dev.off()

In [None]:
# First DRF model on the leaderboard. The variable is called `xgb` for
# historical reasons; it does not hold an XGBoost model.
xgb <- grep("DRF", model_id, value = TRUE)[1] %>%
  h2o.getModel()

# Unlike the stacked ensemble, a single model reports importance per feature.
h2o.varimp(xgb)

# Save the feature-importance plot of this model as a PNG.
library(Cairo)
CairoPNG(
  file = "24-变量重要性：DRF.png",
  width = 8, height = 4, units = "in", res = 300
)

par(family = font)
h2o.varimp_plot(xgb)

dev.off()

In [None]:
# First GLM model on the leaderboard. The variable is called `xgb` for
# historical reasons; it does not hold an XGBoost model.
xgb <- grep("GLM", model_id, value = TRUE)[1] %>%
  h2o.getModel()

# Unlike the stacked ensemble, a single model reports importance per feature.
h2o.varimp(xgb)

# Save the feature-importance plot of this model as a PNG.
library(Cairo)
CairoPNG(
  file = "GLM.png",
  width = 8, height = 4, units = "in", res = 300
)

par(family = font)
h2o.varimp_plot(xgb)

dev.off()

# Analyze important factors
1. Important factors -> Attrition
2. Basic factors -> Important factors
3. Correlation coefficient analysis

## Important factors -> Attrition
1. Overtime work
2. Monthly income

### Overtime work -> Attrition

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# Stacked bar chart: head-count per overtime status, split by attrition.
# Note: this reuses (overwrites) the name `p1_1` from the gender chart.
p1_1 <- df_zh %>%
  # One row per (overtime, attrition) pair with its head-count `n`.
  count(是否加班, 流失情况) %>%
  ggplot(aes(x = 是否加班, y = n, fill = 流失情况)) +
  geom_col(color = "black", size = .2, width = .6, position = "stack") +
  # Stack the labels exactly like the bars so each count sits in the middle of
  # its own segment (previously they were drawn at the absolute height `n`).
  geom_label(
    aes(label = n),
    color = "black", family = font,
    position = position_stack(vjust = 0.5)
  ) +
  # Axis title
  labs(y = st_y) +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10),
    # Hide the legend
    legend.position = "none"
  )
p1_1

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# 100% stacked bar chart: attrition share within each overtime status.
# Note: this reuses (overwrites) the name `p1_2` from the gender chart.
p1_2 <- df_zh %>%
  select(是否加班, 流失情况) %>%
  group_by(是否加班, 流失情况) %>%
  summarise(n = n()) %>%
  # Shares are computed within each overtime status (the remaining group).
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(x = fct_reorder(是否加班, pct), y = pct, fill = 流失情况, color = 流失情况)) +
  geom_col(color = "black", size = .2, width = .6, position = "fill") +
  geom_label(
    aes(label = paste0(pct, "%"), fill = 流失情况),
    color = "black", family = font,
    position = position_fill(vjust = 0.5)
  ) +
  # Axis titles
  labs(y = st_y_per, x = "是否加班") +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p1_2

In [None]:
options(repr.plot.width = 8, repr.plot.height = 4)

# Write the two overtime charts side by side into one PNG.
library(Cairo)
CairoPNG(file = "08-加班.png", width = 8, height = 4, units = "in", res = 300)

library(cowplot)
plot_grid(plotlist = list(p1_1, p1_2), ncol = 2)

# Close the device so the file is flushed to disk.
dev.off()

### Monthly income -> Attrition

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# Filled density chart: share of each attrition status along monthly income.
p1 <- ggplot(df_zh, aes(x = 月收入, fill = 流失情况, group = 流失情况)) +
  geom_density(alpha = .6, size = .3, position = "fill") +
  # Axis title
  labs(y = st_y_per) +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )

library(Cairo)
CairoPNG(file = "02-月收入.png", width = 6, height = 4, units = "in", res = 300)
p1
dev.off()

## Basic factors -> Important factors
1. Demographic characteristics
2. Department and job role

### Demographic characteristics -> Overtime work

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# Horizontal 100% stacked bars: overtime share within each gender.
p1 <- df_zh %>%
  select(性别, 是否加班) %>%
  group_by(性别, 是否加班) %>%
  summarise(n = n()) %>%
  # Shares are computed within each gender (the remaining group).
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(x = fct_reorder(性别, pct), y = pct, fill = 是否加班, color = 是否加班)) +
  geom_col(color = "black", size = .1, width = .6, position = "fill") +
  geom_label(
    aes(label = paste0(pct, "%"), fill = 是否加班),
    color = "black", family = font,
    position = position_fill(vjust = 0.5)
  ) +
  coord_flip() +
  # Axis titles
  labs(y = st_y_per, x = "性别") +
  # Fill colours
  scale_fill_manual(values = c(yellow, brown)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10),
    # Hide the legend
    legend.position = "none"
  )
p1


In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# Horizontal 100% stacked bars: overtime share within each generation.
p2 <- df_zh %>%
  select(代际, 是否加班) %>%
  group_by(代际, 是否加班) %>%
  summarise(n = n()) %>%
  # Shares are computed within each generation (the remaining group).
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(x = fct_reorder(代际, pct), y = pct, fill = 是否加班, color = 是否加班)) +
  geom_col(color = "black", size = .1, width = .6, position = "fill") +
  geom_label(
    aes(label = paste0(pct, "%"), fill = 是否加班),
    color = "black", family = font,
    position = position_fill(vjust = 0.5)
  ) +
  coord_flip() +
  # Axis titles
  labs(y = st_y_per, x = "代际") +
  # Fill colours
  scale_fill_manual(values = c(yellow, brown)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p2

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# Horizontal 100% stacked bars: overtime share within each marital status.
p3 <- df_zh %>%
  select(婚姻状态, 是否加班) %>%
  group_by(婚姻状态, 是否加班) %>%
  summarise(n = n()) %>%
  # Shares are computed within each marital status (the remaining group).
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(x = fct_reorder(婚姻状态, pct), y = pct, fill = 是否加班, color = 是否加班)) +
  geom_col(color = "black", size = .1, width = .6, position = "fill") +
  geom_label(
    aes(label = paste0(pct, "%"), fill = 是否加班),
    color = "black", family = font,
    position = position_fill(vjust = 0.5)
  ) +
  coord_flip() +
  # Axis titles
  labs(y = st_y_per, x = "婚姻状态") +
  # Fill colours
  scale_fill_manual(values = c(yellow, brown)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10),
    # Hide the legend
    legend.position = "none"
  )
p3

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# Horizontal 100% stacked bars: overtime share within each education level.
p4 <- df_zh %>%
  select(教育水平_文字, 是否加班) %>%
  group_by(教育水平_文字, 是否加班) %>%
  summarise(n = n()) %>%
  # Shares are computed within each education level (the remaining group).
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(x = fct_reorder(教育水平_文字, pct), y = pct, fill = 是否加班, color = 是否加班)) +
  geom_col(color = "black", size = .1, width = .6, position = "fill") +
  geom_label(
    aes(label = paste0(pct, "%"), fill = 是否加班),
    color = "black", family = font,
    position = position_fill(vjust = 0.5)
  ) +
  coord_flip() +
  # Axis titles
  labs(y = st_y_per, x = "教育水平") +
  # Fill colours
  scale_fill_manual(values = c(yellow, brown)) +
  theme_minimal() +
  theme(
    # Global font family and size
    text = element_text(family = font, size = 11),
    # Spacing between axis titles and the plot, and title size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick-label size
    axis.text = element_text(size = 10)
  )
p4

In [None]:
# Inline display size for the notebook. The call was misspelled as `ptions`,
# which stopped the cell with "could not find function".
options(repr.plot.width = 8, repr.plot.height = 4)

# Write the four overtime-by-demographics charts into one PNG as a 2 x 2 grid.
library(Cairo)
CairoPNG(file = "26-人口-加班.png", width = 10, height = 8, units = "in", res = 300)

library(cowplot)
plot_grid(p1, p2, p3, p4, ncol = 2, nrow = 2)

# Close the device so the file is flushed to disk.
dev.off()

### Demographic characteristics -> Monthly income

In [None]:
options(repr.plot.width = 8, repr.plot.height = 8)

# Jittered scatter: monthly income by gender, coloured by attrition status.
p1 <- df_zh %>%
  select(流失情况, 月收入, 性别) %>%
  ggplot(aes(x = 月收入, y = 性别)) +
  geom_jitter(aes(colour = 流失情况), alpha = 0.5) +
  scale_color_manual(values = c("#a1dfc3", darkred)) +
  labs(x = "月收入（$）") +
  theme_minimal() +
  theme(
    text = element_text(family = font, size = 11),
    panel.border = element_rect(color = "black", fill = NA, size = 0.3),
    axis.title.x = element_text(margin = margin(t = 15), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Hide the legend
    legend.position = "none"
  )

p1

In [None]:
# Jittered scatter: monthly income by generation, coloured by attrition
# status. The legend is kept on this panel.
options(repr.plot.width = 8, repr.plot.height = 8)

p2 <- df_zh %>%
  select(流失情况, 月收入, 代际) %>%
  ggplot(aes(x = 月收入, y = 代际, colour = 流失情况)) +
  geom_jitter(alpha = 0.5) +
  scale_color_manual(values = c("#a1dfc3", darkred)) +
  labs(x = "月收入（$）") +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Thin black frame around the panel
    panel.border = element_rect(color = "black", fill = NA, size = 0.3),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 15), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12)
  )

p2

In [None]:
# Jittered scatter: monthly income by marital status, coloured by attrition
# status.
options(repr.plot.width = 8, repr.plot.height = 8)

p3 <- df_zh %>%
  select(流失情况, 月收入, 婚姻状态) %>%
  ggplot(aes(x = 月收入, y = 婚姻状态, colour = 流失情况)) +
  geom_jitter(alpha = 0.5) +
  scale_color_manual(values = c("#a1dfc3", darkred)) +
  labs(x = "月收入（$）") +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Thin black frame around the panel
    panel.border = element_rect(color = "black", fill = NA, size = 0.3),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 15), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Hide the legend
    legend.position = "none"
  )

p3

In [None]:
# Jittered scatter: monthly income by education level, coloured by attrition
# status. The legend is kept on this panel.
options(repr.plot.width = 8, repr.plot.height = 8)

p4 <- df_zh %>%
  select(流失情况, 月收入, 教育水平_文字) %>%
  ggplot(aes(x = 月收入, y = 教育水平_文字, colour = 流失情况)) +
  geom_jitter(alpha = 0.5) +
  scale_color_manual(values = c("#a1dfc3", darkred)) +
  labs(y = "教育水平", x = "月收入（$）") +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Thin black frame around the panel
    panel.border = element_rect(color = "black", fill = NA, size = 0.3),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 15), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12)
  )

p4


In [None]:
# Save the four monthly-income scatter plots (p1-p4) as a 2 x 2 PNG grid.
library(Cairo)
library(cowplot)

options(repr.plot.width = 16, repr.plot.height = 16)

# 13 x 13 inch canvas at 300 dpi
CairoPNG(
  file = "10-人口特征-月收入.png",
  width = 13, height = 13, units = "in", res = 300
)
plot_grid(p1, p2, p3, p4, nrow = 2, ncol = 2)
# Close the PNG device to finish the file.
dev.off()

### Department and job role -> Overtime work

In [None]:
# Stacked 100% bar chart: overtime split within each department.
options(repr.plot.width = 6, repr.plot.height = 4)

p1 <- df_zh %>%
  # Count rows per (department, overtime) pair. summarise() drops the last
  # grouping level, so pct is computed within each department.
  group_by(所属部门, 是否加班) %>%
  summarise(n = n()) %>%
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(
    x = fct_reorder(所属部门, pct),
    y = pct,
    fill = 是否加班,
    color = 是否加班
  )) +
  geom_col(position = "fill", width = .6, color = "black", size = .1) +
  # Percentage label centred inside each bar segment
  geom_label(
    aes(label = paste0(pct, "%"), fill = 是否加班),
    position = position_fill(vjust = 0.5),
    color = "black",
    family = font
  ) +
  # Fill colours
  scale_fill_manual(values = c(yellow, brown)) +
  coord_flip() +
  # Axis titles
  labs(x = "部门", y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick label size
    axis.text = element_text(size = 10),
    # Hide the legend
    legend.position = "none"
  )
p1

In [None]:
# Stacked 100% bar chart: overtime split within each job role.
# The legend is kept on this panel.
options(repr.plot.width = 6, repr.plot.height = 4)

p2 <- df_zh %>%
  # Count rows per (job role, overtime) pair. summarise() drops the last
  # grouping level, so pct is computed within each job role.
  group_by(岗位名称, 是否加班) %>%
  summarise(n = n()) %>%
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(
    x = fct_reorder(岗位名称, pct),
    y = pct,
    fill = 是否加班,
    color = 是否加班
  )) +
  geom_col(position = "fill", width = .6, color = "black", size = .1) +
  # Percentage label centred inside each bar segment
  geom_label(
    aes(label = paste0(pct, "%"), fill = 是否加班),
    position = position_fill(vjust = 0.5),
    color = "black",
    family = font
  ) +
  # Fill colours
  scale_fill_manual(values = c(yellow, brown)) +
  coord_flip() +
  # Axis titles
  labs(x = "岗位", y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick label size
    axis.text = element_text(size = 10)
  )
p2

In [None]:
# Save the department and job-role overtime charts (p1, p2) side by side.
library(Cairo)
library(cowplot)

options(repr.plot.width = 8, repr.plot.height = 4)

# 10 x 4 inch canvas at 300 dpi
CairoPNG(
  file = "25-部门&岗位-加班.png",
  width = 10, height = 4, units = "in", res = 300
)
plot_grid(p1, p2, ncol = 2)
# Close the PNG device to finish the file.
dev.off()

### Department and job role -> Monthly income

In [None]:
# Jittered scatter: monthly income by department, coloured by attrition
# status.
# Fix: the first call was misspelled `ptions`, which raised
# "could not find function" and aborted the cell before p1_1 was built.
options(repr.plot.width = 8, repr.plot.height = 8)

p1_1 <- df_zh %>%
  select(流失情况, 所属部门, 月收入) %>%
  ggplot(aes(x = 月收入, y = 所属部门)) +
  geom_jitter(aes(col = 流失情况), alpha = 0.5) +
  scale_color_manual(values = c("#a1dfc3", darkred)) +
  labs(y = "所属部门", x = "月收入（$）") +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Thin black frame around the panel
    panel.border = element_rect(color = "black", fill = NA, size = 0.3),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 15, r = 0, b = 0, l = 0), size = 12),
    axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0), size = 12)
  ) +
  # Hide the legend
  theme(legend.position = "none")

p1_1

In [None]:
# Jittered scatter: monthly income by job role, coloured by attrition status.
# Fix: this plot used to be stored as `p1_1`, overwriting the department plot
# and leaving `p1_2` undefined. The export cell that follows calls
# plot_grid(p1_1, p1_2, ncol=2), so the job-role plot is now named `p1_2`.
options(repr.plot.width = 8, repr.plot.height = 8)

p1_2 <- df_zh %>%
  select(流失情况, 岗位名称, 月收入) %>%
  ggplot(aes(x = 月收入, y = 岗位名称)) +
  geom_jitter(aes(col = 流失情况), alpha = 0.5) +
  scale_color_manual(values = c("#a1dfc3", darkred)) +
  labs(y = "岗位", x = "月收入（$）") +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Thin black frame around the panel
    panel.border = element_rect(color = "black", fill = NA, size = 0.3),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 15, r = 0, b = 0, l = 0), size = 12),
    axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0), size = 12)
  ) +
  # Hide the legend
  theme(legend.position = "none")

p1_2

In [None]:
# Save the two income scatter plots (p1_1, p1_2) side by side.
library(Cairo)
library(cowplot)

options(repr.plot.width = 16, repr.plot.height = 8)

# 12 x 6 inch canvas at 300 dpi
CairoPNG(
  file = "05-部门-收入.png",
  width = 12, height = 6, units = "in", res = 300
)
plot_grid(p1_1, p1_2, ncol = 2)
# Close the PNG device to finish the file.
dev.off()

## Correlation coefficient analysis
1. Draw the correlation matrix
2. Bivariate analysis

### Draw the correlation matrix

In [None]:
# Correlation inputs: numeric columns, rounded correlation matrix, p-values.
options(repr.plot.width = 13, repr.plot.height = 10)

# cor_pmat() comes from ggcorrplot; attach it here so this cell does not
# depend on a later cell having been run first.
library(ggcorrplot)

# Keep only the numeric columns.
# NOTE(review): this used to read `df_train`, which no other cell in this
# part of the notebook uses; switched to `df_zh` to match the correlation
# plot that follows. Revert if `df_train` was intended and is defined earlier.
nums <- select_if(df_zh, is.numeric)
# Pairwise correlation coefficients, rounded to one decimal
corr <- round(cor(nums), 1)
# Matrix of p-values for the pairwise correlations
p.mat <- cor_pmat(nums)

In [None]:
# Draw the lower-triangle correlation heatmap of the numeric columns of df_zh
# and save it as a PNG.
library(Cairo)
library(ggcorrplot)

options(repr.plot.width = 10, repr.plot.height = 7)

# Numeric columns only, then correlations rounded to one decimal
nums <- select_if(df_zh, is.numeric)
corr <- nums %>%
  cor() %>%
  round(1)

# 10 x 7 inch canvas at 300 dpi
CairoPNG(
  file = "17-相关系数.png",
  width = 10, height = 7, units = "in", res = 300
)

ggcorrplot(
  corr,
  method = "square",
  type = "lower",
  # Print the coefficient inside each tile
  lab = TRUE,
  lab_size = 3,
  colors = c("#95001c", "white", "#1c642d"),
  ggtheme = theme_minimal()
) +
  theme(text = element_text(family = font, size = 10))

# Close the PNG device to finish the file.
dev.off()

### Bivariate analysis

In [None]:
# Scatter of monthly income against total working years with a LOESS curve.
options(repr.plot.width = 6, repr.plot.height = 4)

p1 <- ggplot(df_zh, aes(x = 总工作年数, y = 月收入)) +
  geom_point(colour = "#c4e3ba", alpha = 0.5) +
  geom_smooth(method = "loess", colour = "red") +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick label size
    axis.text = element_text(size = 10)
  )
p1

In [None]:
# Scatter of monthly income against age with a LOESS curve.
options(repr.plot.width = 6, repr.plot.height = 4)

p2 <- ggplot(df_zh, aes(x = 年龄, y = 月收入)) +
  geom_point(colour = "#c4e3ba", alpha = 0.5) +
  geom_smooth(method = "loess", colour = "red") +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick label size
    axis.text = element_text(size = 10)
  )
p2

In [None]:
# Scatter of monthly income against years at the current company with a
# LOESS curve.
options(repr.plot.width = 6, repr.plot.height = 4)

p3 <- ggplot(df_zh, aes(x = 在当前公司的工作年数, y = 月收入)) +
  geom_point(colour = "#c4e3ba", alpha = 0.5) +
  geom_smooth(method = "loess", colour = "red") +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick label size
    axis.text = element_text(size = 10)
  )
p3

In [None]:
# Scatter of monthly income against years in the current position with a
# LOESS curve.
options(repr.plot.width = 6, repr.plot.height = 4)

p4 <- ggplot(df_zh, aes(x = 在当前职位的工作年数, y = 月收入)) +
  geom_point(colour = "#c4e3ba", alpha = 0.5) +
  geom_smooth(method = "loess", colour = "red") +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick label size
    axis.text = element_text(size = 10)
  )
p4

In [None]:
# Save the four bivariate scatter plots (p1-p4) as a 2 x 2 PNG grid.
library(Cairo)
library(cowplot)

options(repr.plot.width = 22, repr.plot.height = 8)

# 13 x 10 inch canvas at 300 dpi
CairoPNG(
  file = "18-双变量-正.png",
  width = 13, height = 10, units = "in", res = 300
)
plot_grid(p1, p2, p3, p4, nrow = 2, ncol = 2)
# Close the PNG device to finish the file.
dev.off()

# Satisfaction analysis
1. Satisfaction -> Attrition
2. Department and job role -> Satisfaction

## Satisfaction -> Attrition

In [None]:
# Stacked 100% bar chart: attrition split within each environment
# satisfaction value.
options(repr.plot.width = 6, repr.plot.height = 4)

p1 <- df_zh %>%
  # Count rows per (satisfaction, attrition) pair. summarise() drops the last
  # grouping level, so pct is computed within each satisfaction value.
  group_by(环境满意度, 流失情况) %>%
  summarise(n = n()) %>%
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(x = 环境满意度, y = pct, fill = 流失情况, color = 流失情况)) +
  geom_col(position = "fill", width = .6, color = "black", size = .1) +
  # Percentage label centred inside each bar segment
  geom_label(
    aes(label = paste0(pct, "%"), fill = 流失情况),
    position = position_fill(vjust = 0.5),
    color = "black",
    family = font
  ) +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  coord_flip() +
  # Axis title
  labs(y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick label size
    axis.text = element_text(size = 10),
    # Hide the legend
    legend.position = "none"
  )
p1

In [None]:
# Stacked 100% bar chart: attrition split within each relationship
# satisfaction value.
options(repr.plot.width = 6, repr.plot.height = 4)

p2 <- df_zh %>%
  # Count rows per (satisfaction, attrition) pair. summarise() drops the last
  # grouping level, so pct is computed within each satisfaction value.
  group_by(人际关系满意度, 流失情况) %>%
  summarise(n = n()) %>%
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(x = 人际关系满意度, y = pct, fill = 流失情况, color = 流失情况)) +
  geom_col(position = "fill", width = .6, color = "black", size = .1) +
  # Percentage label centred inside each bar segment
  geom_label(
    aes(label = paste0(pct, "%"), fill = 流失情况),
    position = position_fill(vjust = 0.5),
    color = "black",
    family = font
  ) +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  coord_flip() +
  # Axis title
  labs(y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick label size
    axis.text = element_text(size = 10),
    # Hide the legend
    legend.position = "none"
  )
p2

In [None]:
# Stacked 100% bar chart: attrition split within each job satisfaction value.
# The legend is kept on this panel.
options(repr.plot.width = 6, repr.plot.height = 4)

p3 <- df_zh %>%
  # Count rows per (satisfaction, attrition) pair. summarise() drops the last
  # grouping level, so pct is computed within each satisfaction value.
  group_by(工作满意度, 流失情况) %>%
  summarise(n = n()) %>%
  mutate(pct = round(n / sum(n), 2) * 100) %>%
  arrange(desc(pct)) %>%
  ggplot(aes(x = 工作满意度, y = pct, fill = 流失情况, color = 流失情况)) +
  geom_col(position = "fill", width = .6, color = "black", size = .1) +
  # Percentage label centred inside each bar segment
  geom_label(
    aes(label = paste0(pct, "%"), fill = 流失情况),
    position = position_fill(vjust = 0.5),
    color = "black",
    family = font
  ) +
  # Fill colours
  scale_fill_manual(values = c(red, green)) +
  coord_flip() +
  # Axis title
  labs(y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Tick label size
    axis.text = element_text(size = 10)
  )
p3

In [None]:
# Save the three satisfaction vs. attrition charts (p1-p3) in one row.
library(Cairo)
library(cowplot)

options(repr.plot.width = 15, repr.plot.height = 4)

# 12 x 4 inch canvas at 300 dpi
CairoPNG(
  file = "09-满意度.png",
  width = 12, height = 4, units = "in", res = 300
)
plot_grid(p1, p2, p3, ncol = 3)
# Close the PNG device to finish the file.
dev.off()

## Department and job role -> Satisfaction

In [None]:
# Filled density plot over department, one band per environment satisfaction
# value.
library(viridis)
options(repr.plot.width = 6, repr.plot.height = 4)

p_b_h <- ggplot(
  df_zh,
  aes(x = 所属部门, group = 环境满意度, fill = 环境满意度)
) +
  geom_density(position = "fill", adjust = 1.5, size = 0.5) +
  # Colour palette
  scale_fill_viridis(alpha = 0.8) +
  # Legend order
  guides(fill = guide_legend(reverse = FALSE)) +
  labs(y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 15), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Hide the legend
    legend.position = "none"
  )
p_b_h

In [None]:
# Filled density plot over department, one band per relationship satisfaction
# value.
library(viridis)
options(repr.plot.width = 6, repr.plot.height = 4)

p_b_r <- ggplot(
  df_zh,
  aes(x = 所属部门, group = 人际关系满意度, fill = 人际关系满意度)
) +
  geom_density(position = "fill", adjust = 1.5, size = 0.5) +
  # Colour palette
  scale_fill_viridis(alpha = 0.8) +
  # Legend order
  guides(fill = guide_legend(reverse = FALSE)) +
  labs(y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 15), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Hide the legend
    legend.position = "none"
  )
p_b_r

In [None]:
# Filled density plot over department, one band per job satisfaction value.
library(viridis)
options(repr.plot.width = 6, repr.plot.height = 4)

p_b_g <- ggplot(
  df_zh,
  aes(x = 所属部门, group = 工作满意度, fill = 工作满意度)
) +
  geom_density(position = "fill", adjust = 1.5, size = 0.5) +
  # Colour palette
  scale_fill_viridis(alpha = 0.8) +
  # Legend order
  guides(fill = guide_legend(reverse = FALSE)) +
  labs(y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Axis titles: distance from the axis and font size
    axis.title.x = element_text(margin = margin(t = 15), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # Hide the legend
    legend.position = "none"
  )
p_b_g

In [None]:
# Filled density plot over job role, one band per environment satisfaction
# value. The legend is kept on this panel.
library(viridis)
options(repr.plot.width = 6, repr.plot.height = 4)

p_g_h <- ggplot(
  df_zh,
  aes(x = 岗位名称, group = 环境满意度, fill = 环境满意度)
) +
  geom_density(position = "fill", adjust = 1.5, size = 0.5) +
  # Colour palette
  scale_fill_viridis(alpha = 0.8) +
  # Legend order
  guides(fill = guide_legend(reverse = FALSE)) +
  labs(y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Negative top margin on the x-axis title
    axis.title.x = element_text(margin = margin(t = -10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # x tick labels rotated 45 degrees, with a top margin
    axis.text.x = element_text(margin = margin(t = 10), angle = 45),
    # 0.5 cm of extra space on the right-hand side of the plot
    plot.margin = margin(r = .5, unit = "cm")
  )
p_g_h

In [None]:
# Filled density plot over job role, one band per relationship satisfaction
# value. The legend is kept on this panel.
library(viridis)
options(repr.plot.width = 6, repr.plot.height = 4)

p_g_r <- ggplot(
  df_zh,
  aes(x = 岗位名称, group = 人际关系满意度, fill = 人际关系满意度)
) +
  geom_density(position = "fill", adjust = 1.5, size = 0.5) +
  # Colour palette
  scale_fill_viridis(alpha = 0.8) +
  # Legend order
  guides(fill = guide_legend(reverse = FALSE)) +
  labs(y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Negative top margin on the x-axis title
    axis.title.x = element_text(margin = margin(t = -10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # x tick labels rotated 45 degrees, with a top margin
    axis.text.x = element_text(margin = margin(t = 10), angle = 45),
    # 0.5 cm of extra space on the right-hand side of the plot
    plot.margin = margin(r = .5, unit = "cm")
  )
p_g_r

In [None]:
# Filled density plot over job role, one band per job satisfaction value.
# The legend is kept on this panel.
library(viridis)
options(repr.plot.width = 6, repr.plot.height = 4)

p_g_g <- ggplot(
  df_zh,
  aes(x = 岗位名称, group = 工作满意度, fill = 工作满意度)
) +
  geom_density(position = "fill", adjust = 1.5, size = 0.5) +
  # Colour palette
  scale_fill_viridis(alpha = 0.8) +
  # Legend order
  guides(fill = guide_legend(reverse = FALSE)) +
  labs(y = st_y_per) +
  theme_minimal() +
  theme(
    # Font family and base size for the whole figure
    text = element_text(family = font, size = 11),
    # Negative top margin on the x-axis title
    axis.title.x = element_text(margin = margin(t = -10), size = 12),
    axis.title.y = element_text(margin = margin(r = 15), size = 12),
    # x tick labels rotated 45 degrees, with a top margin
    axis.text.x = element_text(margin = margin(t = 10), angle = 45),
    # 0.5 cm of extra space on the right-hand side of the plot
    plot.margin = margin(r = .5, unit = "cm")
  )

p_g_g

In [None]:
# Save the six satisfaction density plots as a 3 x 2 PNG grid: department
# panels in the left column, job-role panels in the right column.
library(Cairo)
library(cowplot)

options(repr.plot.width = 13, repr.plot.height = 13)

# 13 x 13 inch canvas at 300 dpi
CairoPNG(
  file = "27-部门&岗位-满意度.png",
  width = 13, height = 13, units = "in", res = 300
)
plot_grid(p_b_h, p_g_h, p_b_r, p_g_r, p_b_g, p_g_g, nrow = 3, ncol = 2)
# Close the PNG device to finish the file.
dev.off()