![](https://cdn-images-1.medium.com/max/1200/1*Yb0EWWZd1mZUTkdJZOJS-Q.png)

# HR Analytics Employee Attrition & Performance

### This kernel features a project for the class "Special Topics in Information Systems II" of the Business Administration department, University of Macedonia.
### Its purpose is creating a reliable model that predicts **employee attrition**.
### Knowing why your company's employees stay or leave lets you adapt to their preferences. This can help you predict their actions and encourage them to stay with the company longer.



# 1. Importing Packages
### We first import all the packages we will need for our kernel

In [None]:
#importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Importing Model Packages
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

#Importing Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

import gc                         
gc.enable()


# 2. Our Dataset
### We then take a look into our dataset by importing it and visualizing some of its features.

In [None]:
# Load the HR employee-attrition CSV into a DataFrame and preview its first
# 10 rows. The path is relative to the kernel's working directory.
data = pd.read_csv(
    filepath_or_buffer="../input/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)
data.head(n=10)

In [None]:
# Print a summary of the dataset: column names, non-null counts and dtypes.
data.info()

In [None]:
# Display the dataset's dimensions as a (rows, columns) tuple.
data.shape

In [None]:
# Box plot showing how the employees' ages are distributed.
vis1 = sns.boxplot(x='Age', data=data, color='Purple')
# seaborn returns the Axes it drew on, so the label can be set on it directly.
vis1.set_xlabel('Employee Age')
plt.show()

In [None]:
# Bar chart of the number of employees of each gender.
# Count the rows per gender; the single resulting column holds the head-count.
Gender = data.groupby('Gender')[['Gender']].count()
Gender.columns = ['Num_of_Empl']
vis3 = sns.barplot(data=Gender, x=Gender.index, y='Num_of_Empl', palette='Set1')
# Only rotate the tick labels seaborn already placed (one per bar). Passing a
# hard-coded range(3) would force three tick positions no matter how many
# gender categories the data actually contains.
plt.xticks(rotation='vertical')
plt.title('Number of Employees of each Gender ')
plt.xlabel('Gender')
plt.ylabel('Number of Employees')
plt.show()

In [None]:
# Histogram of how many years employees have spent in their current role.
plt.hist(data['YearsInCurrentRole'], bins=100)
plt.xlabel('Years In Current Role')
plt.ylabel('Number of Employees')
plt.title('Years of Employees per Current Role ')
plt.show()

# Switch seaborn to the 'darkgrid' theme. This runs after the histogram has
# been shown - presumably it is meant for the plots drawn afterwards.
sns.set_style('darkgrid')



In [None]:
# Number of employees per education field (their area of specialisation).
EducationFields = data.groupby(by='EducationField')[['EducationField']].count()
EducationFields.head(n=10)

In [None]:
# Rank employees by monthly income, highest first, and show the top five rows.
highest_paid = data.sort_values('MonthlyIncome', ascending=False)
highest_paid.head(5)

In [None]:
# Box plot showing how the employees' monthly incomes are distributed.
vis2 = sns.boxplot(x='MonthlyIncome', data=data, color='Green')
# seaborn returns the Axes it drew on, so the label can be set on it directly.
vis2.set_xlabel('Employee Income')
plt.show()

In [None]:
# List the distinct values found in the "Department" column.
print(data['Department'].unique())

In [None]:
# Bar chart of the number of employees in each department.
# Count the rows per department; the single resulting column holds the
# head-count.
Department = data.groupby('Department')[['Department']].count()
Department.columns = ['Num_of_Empl']
vis3 = sns.barplot(data=Department, x=Department.index, y='Num_of_Empl', palette='Blues')
# Rotate the tick labels seaborn already placed (one per bar) instead of
# hard-coding three tick positions, so the chart stays correct if the number
# of departments in the data changes.
plt.xticks(rotation='vertical')
plt.title('Number of Employees for each Department ')
plt.xlabel('Departments')
plt.ylabel('Number of Employees')
plt.show()

In [None]:
# Count how many employees stayed and how many left (No = stayed, Yes = left).
Attrition_num = data['Attrition'].value_counts()
Attrition_num.head()

In [None]:
# Express the Attrition_num counts as a percentage of all employees,
# rounded to two decimals.
(Attrition_num / data['Attrition'].count() * 100).round(2)

# 3. Categorical data transformation
### We need to transform our string data into numeric figures, in order to help our model understand their value.


In [None]:
# Convert the columns listed below to pandas' categorical dtype, one column
# at a time and in the same order as before.
for column_name in ('Attrition', 'BusinessTravel', 'Gender', 'EducationField',
                    'JobRole', 'MaritalStatus', 'OverTime'):
    data[column_name] = data[column_name].astype('category')



In [None]:
# Print the dataset summary again to check the column dtypes after the
# conversion to categorical.
data.info()

### 3.1 We then transform our categorical variables using "pd.get_dummies".
#### This creates a separate dataframe containing the unique values of the variable as its columns
#### The value/column that describes the employee/row takes the number 1 while the rest take the number 0.
#### We then drop one of the columns in order to avoid the "dummy variable trap": if every category keeps its own column, each column can be derived from the others, so the columns are perfectly correlated and add redundant predictors
#### Finally, we drop the original column and join the created dataframe to our dataset

In [None]:
# One-hot encode the Department column: one indicator column per department.
departments = pd.get_dummies(data['Department'])
departments.head(n=10)

In [None]:
# Drop one indicator column ('Human Resources') as the baseline category.
# The result must be bound back to `departments`: storing it under a
# different (misspelled) name means the un-dropped frame is what gets joined
# below and the baseline column is never actually removed.
departments = departments.drop('Human Resources', axis=1)

# Replace the original Department column with its indicator columns.
data = data.drop("Department", axis=1)
data = data.join(departments)
data.head()

In [None]:
# One-hot encode EducationField, leaving out 'Human Resources' as the
# baseline category, then swap the original column for the indicator columns.
education_fields = pd.get_dummies(data['EducationField']).drop(columns='Human Resources')
data = data.drop(columns="EducationField").join(education_fields)
data.head()


In [None]:
# One-hot encode the JobRole column.
job_roles = pd.get_dummies(data.JobRole)

# Drop 'Human Resources' as the baseline category. The result is bound back
# to `job_roles` (rather than a second, misspelled name) so the variable
# always holds the frame that is actually joined, matching the other
# encoding cells.
job_roles = job_roles.drop("Human Resources", axis=1)

# Replace the original JobRole column with its indicator columns.
data = data.drop("JobRole", axis=1)
data = data.join(job_roles)
data.head()

In [None]:
# One-hot encode MaritalStatus, leaving out 'Single' as the baseline
# category, then swap the original column for the indicator columns.
marital_status = pd.get_dummies(data['MaritalStatus']).drop(columns="Single")
data = data.drop(columns="MaritalStatus").join(marital_status)
data.head()


In [None]:
# Encode Gender as a single indicator column: drop the 'Female' indicator
# and name the one remaining column 'Gender'.
genders = pd.get_dummies(data['Gender']).drop(columns="Female")
genders.columns = ['Gender']

# Replace the original Gender column with the indicator column.
data = data.drop(columns="Gender").join(genders)
data.head()

In [None]:
# Encode OverTime as a single indicator column.
# Keep the "Yes" indicator so that OverTime == 1 means the employee works
# overtime. Keeping the other indicator under the name 'OverTime' would
# invert the meaning of the column and mislead any reading of feature
# importances or tree splits.
overtime = pd.get_dummies(data.OverTime)[["Yes"]]
overtime.columns = ['OverTime']

# Replace the original OverTime column with the indicator column.
data = data.drop("OverTime", axis=1)
data = data.join(overtime)
data.head()

In [None]:
# One-hot encode BusinessTravel, leaving out 'Travel_Rarely' as the baseline
# category, then swap the original column for the indicator columns.
businesstravel = pd.get_dummies(data['BusinessTravel']).drop(columns="Travel_Rarely")
data = data.drop(columns="BusinessTravel").join(businesstravel)
data.head()


#### We drop the following 3 columns, as they are of no value to our model

In [None]:
# Remove "Over18", "StandardHours" and "EmployeeCount" in one call; per the
# original analysis every employee shares the same value in these columns.
data = data.drop(columns=["Over18", "StandardHours", "EmployeeCount"])
data.head()

#### We transform our target column (y)

In [None]:
# Encode the target as a single indicator column. The "Yes" indicator is
# dropped and the one remaining column is named 'Attrition', so a value of 1
# marks an employee who stayed and 0 one who left.
attrition = pd.get_dummies(data['Attrition']).drop(columns="Yes")
attrition.columns = ['Attrition']

# Replace the original Attrition column with the indicator column.
data = data.drop(columns="Attrition").join(attrition)
data.head()

# 4. Creating predictors set
### We separate our predictors from our target (y) column and split the data to prepare them for model training.

In [None]:

# Separate the dataset into the target (dependent variable) and the
# features (independent variables).

# Features: every column except "Attrition"
features = data.drop(columns="Attrition")

# Target: the "Attrition" column
target = data['Attrition']

features.head()



#### We decided to focus on the stayers, therefore the employees who left are the 0s and the ones who stayed are the 1s

In [None]:
# Count the employees per class after encoding (1 = stayed, 0 = left).
Attrition_num = data['Attrition'].value_counts()
Attrition_num.head()

In [None]:
# Split target and features into training (70%) and test (30%) parts.
# train_test_split returns a (train, test) pair for each input array in the
# order the arrays were passed, hence target first and features second.
target_train, target_test, features_train, features_test = train_test_split(
    target,
    features,
    test_size=0.3,
    random_state=42,
)

# 5. We begin training our model.
### The model we will use is the Decision Tree Classifier. We experiment with different configurations of it in order to get as close to our goal as possible.
### Since we chose to focus on the stayers, we wish our model to have a high recall score, although our top priority will be the precision score.

![](https://www.researchgate.net/profile/Fernando_Crespo/publication/292304919/figure/fig13/AS:341406267789324@1458409005603/Confusion-matrix-for-a-two-class-problem-TP-is-the-number-of-correct-predictions-that-an.png)

### Recall Score = TP/(TP + FN)
### Precision Score= TP/(TP + FP)

In [None]:
# Create a decision tree with no size limits (fixed seed so runs repeat).
model = DecisionTreeClassifier(random_state=42)

# Train it on the training features and target.
model.fit(X=features_train, y=target_train)

In [None]:


# Accuracy of the tree on the data it was trained on, as a percentage.
100 * model.score(features_train, target_train)

In [None]:
# Evaluate the unconstrained tree on the held-out test set.
prediction = model.predict(features_test)

# Share of test rows classified correctly
print("The accuracy is", model.score(features_test, target_test))

# Recall and precision of the predictions against the true test labels
print("The recall score is", recall_score(target_test, prediction))
print("The precision score is", precision_score(target_test, prediction))

# F1 score; the model is deterministic once fitted, so the predictions
# computed above are reused instead of predicting a second time.
y_true = target_test
y_pred = prediction
print("The f1_score is", f1_score(y_true, y_pred))

### In order to avoid overfitting (check the training set's accuracy above) we experiment by limiting the Decision Tree's depth.
#### This can lower our accuracy, but it gives us a more realistic result.

In [None]:
# Decision tree capped at a depth of 5 levels (same seed as before).
model_depth_5 = DecisionTreeClassifier(random_state=42, max_depth=5)

# Train on the training split.
model_depth_5.fit(X=features_train, y=target_train)

# Training-set accuracy, as a percentage.
print(100 * model_depth_5.score(features_train, target_train))

In [None]:
# Evaluate the depth-5 tree on the held-out test set.
prediction_5 = model_depth_5.predict(features_test)

# Share of test rows classified correctly
print("The accuracy is", model_depth_5.score(features_test, target_test))

# Recall and precision of the predictions against the true test labels
print("The recall score is", recall_score(target_test, prediction_5))
print("The precision score is", precision_score(target_test, prediction_5))

# F1 score of the same predictions
y_true, y_pred = target_test, prediction_5
print("The f1_score is", f1_score(y_true, y_pred))

### We notice that our f1 score has improved, however our recall score is higher than our precision score. We keep experimenting in order to maximize our precision while keeping a relatively high recall score

#### We try to modify the minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least 100 training samples in each of the left and right branches. This may have the effect of smoothing the model

In [None]:
# Decision tree whose leaves must each contain at least 100 training samples.
model_sample_100 = DecisionTreeClassifier(random_state=42, min_samples_leaf=100)

# Train on the training split.
model_sample_100.fit(X=features_train, y=target_train)

# Training-set accuracy, as a percentage.
print(100 * model_sample_100.score(features_train, target_train))

In [None]:
# Evaluate the min-100-samples-per-leaf tree on the held-out test set.
prediction_100 = model_sample_100.predict(features_test)

# Share of test rows classified correctly
print("The accuracy is", model_sample_100.score(features_test, target_test))

# Recall and precision of the predictions against the true test labels
print("The recall score is", recall_score(target_test, prediction_100))
print("The precision score is", precision_score(target_test, prediction_100))

# F1 score of the same predictions
y_true, y_pred = target_test, prediction_100
print("The f1_score is", f1_score(y_true, y_pred))

### For classification problems, not just decision trees, it isn't uncommon for unbalanced classes to give overly optimistic accuracy scores. Here's a way to handle this.

In [None]:
# Depth-7 tree with class weights balanced to compensate for the uneven
# class sizes; fit() returns the estimator itself, so it is created and
# trained in a single expression.
model_depth_7_b = DecisionTreeClassifier(
    max_depth=7,
    class_weight="balanced",
    random_state=42,
).fit(features_train, target_train)

# Predictions for the held-out test set
prediction_b = model_depth_7_b.predict(features_test)

In [None]:
# Evaluate the balanced depth-7 tree on the held-out test set.

# Share of test rows classified correctly
print("The accuracy is", model_depth_7_b.score(features_test, target_test))

# Recall and precision of the predictions against the true test labels
print("The recall score is", recall_score(target_test, prediction_b))
print("The precision score is", precision_score(target_test, prediction_b))

# F1 score of the same predictions
y_true, y_pred = target_test, prediction_b
print("The f1_score is", f1_score(y_true, y_pred))

#### We use the cross_val_score function to get accuracy scores for different data folds.

In [None]:
# Bring in scikit-learn's cross-validation helper.
from sklearn.model_selection import cross_val_score

# Score the unconstrained tree on 10 folds of the full dataset and print the
# per-fold scores.
print(cross_val_score(estimator=model, X=features, y=target, cv=10))

## 5.2 Parameter tuning
### Since we are not 100% certain of what kind of Decision Tree we need to use, we find the best possible parameters for our model using the GridSearchCV function.

In [None]:

# Candidate values for the tree's maximum depth: 5 through 20.
depth = list(range(5, 21))

# Candidate minimum leaf sizes: 25, 50, ..., 475.
samples = list(range(25, 500, 25))

# Parameter grid mapping each hyper-parameter name to its candidate values.
parameters = {"max_depth": depth, "min_samples_leaf": samples}

In [None]:
# Bring in scikit-learn's exhaustive grid search.
from sklearn.model_selection import GridSearchCV

# Parameter grid: candidate depths and minimum leaf sizes.
parameters = {"max_depth": depth, "min_samples_leaf": samples}

# Search the grid using the unconstrained tree as the base estimator.
param_search = GridSearchCV(estimator=model, param_grid=parameters)

# Run the search on the training split.
param_search.fit(features_train, target_train)

# Report the best parameter combination found.
print(param_search.best_params_)

### We then calculate the feature importances for each Decision Tree variation we previously used.

In [None]:
# Importance the unconstrained tree assigned to each feature.
feature_importances = model.feature_importances_

# Names of the feature columns, in column order.
feature_list = features.columns.tolist()

# One-column table of importances, indexed by feature name.
relative_importances = pd.DataFrame(
    {"importance": feature_importances},
    index=feature_list,
)

# Show the ten most important features.
relative_importances.sort_values("importance", ascending=False).head(n=10)


In [None]:
# Importance the depth-5 tree assigned to each feature.
feature_importances_5 = model_depth_5.feature_importances_

# Names of the feature columns, in column order.
feature_list = features.columns.tolist()

# One-column table of importances, indexed by feature name.
relative_importances_5 = pd.DataFrame(
    {"importance": feature_importances_5},
    index=feature_list,
)

# Show the ten most important features.
relative_importances_5.sort_values("importance", ascending=False).head(n=10)

In [None]:
# Importance the balanced depth-7 tree assigned to each feature.
feature_importances_7 = model_depth_7_b.feature_importances_

# Names of the feature columns, in column order.
feature_list = features.columns.tolist()

# One-column table of importances, indexed by feature name.
relative_importances_7 = pd.DataFrame(
    {"importance": feature_importances_7},
    index=feature_list,
)

# Show the ten most important features.
relative_importances_7.sort_values("importance", ascending=False).head(n=10)

In [None]:
# Keep only the features whose importance in the unconstrained tree is
# strictly greater than 0.
has_importance = relative_importances["importance"] > 0
selected_features = relative_importances.loc[has_importance]

# Index holding the names of those features
selected_list = selected_features.index

# Restrict both the training and the test features to the selected columns.
features_train_selected = features_train[selected_list]
features_test_selected = features_test[selected_list]

## 5.3 Best Model Initialization 
### And finally we use the features with value of importance higher than 0 from our original model, as well as the best parameters given by GridSearchCV, to get the best possible prediction model for our Employee Attrition problem.

In [None]:
# Final model: balanced tree with depth 5 and at least 25 samples per leaf.
# NOTE: these values are written out literally here; they are not read from
# param_search.best_params_.
model_best = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=25,
    class_weight="balanced",
    random_state=42,
)

# Train on the selected feature columns only.
model_best.fit(features_train_selected, target_train)

# Predict the test set from the same selected columns.
prediction_best = model_best.predict(features_test_selected)

# Accuracy, recall and precision are reported as percentages.
print("The accuracy is", 100 * model_best.score(features_test_selected, target_test))
print("The recall score is", 100 * recall_score(target_test, prediction_best))
print("The precision score is", 100 * precision_score(target_test, prediction_best))

# The F1 score is reported as a fraction, not a percentage.
y_true, y_pred = target_test, prediction_best
print("The f1_score is", f1_score(y_true, y_pred))



In [None]:
# Force a garbage-collection pass; as the cell's last expression, the value
# returned by gc.collect() is displayed.
gc.collect()

## 6. We finally use the RandomForestClassifier model and 4 of our top features/predictors to create a visualized Decision Tree.

In [None]:
# Fit a small random forest on four hand-picked predictors and render one of
# its trees as an image.
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz

import pydotplus
from IPython.display import Image

# The predictors used for the visualised tree; defined once so the training
# columns and the plotted feature names cannot drift apart.
tree_features = ['MonthlyIncome', 'YearsInCurrentRole', 'OverTime', 'StockOptionLevel']

# Only a single tree is drawn, so a modest forest is enough: training
# 100000 estimators only multiplies run time and memory. The fixed
# random_state makes the rendered tree reproducible between runs.
# NOTE: this rebinds `model`, replacing the decision tree fitted earlier.
model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)

# Train on the full dataset, restricted to the chosen predictors.
model.fit(features[tree_features], target)

# Pick one tree of the forest to visualise.
estimator = model.estimators_[5]

# Export the tree as DOT source (out_file=None returns it as a string),
# render it to PNG and display the image.
dt_graphviz = export_graphviz(estimator, feature_names=tree_features, out_file=None)
pydot_graph = pydotplus.graph_from_dot_data(dt_graphviz)
Image(pydot_graph.create_png())

### THANK YOU!!