# Project Topic: Understanding the factors that affect employee attrition


# Setting the environment by Importing libraries needed

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Reading in the dataset

In [None]:
# Load the employee records from the CSV file.
df = pd.read_csv("employeeAttrition.csv")
df.head()

# Text category -> numeric code, applied column by column.
category_codes = {
    'Attrition': {'Yes': 1, 'No': 0},
    'Gender': {'Male': 1, 'Female': 0},
    'MaritalStatus': {'Single': 1, 'Married': 2, 'Divorced': 3},
}
for column, codes in category_codes.items():
    df[column] = df[column].replace(codes)

df.head(3)

In [None]:
# Integer code -> readable label for each recoded column.
# JobSatisfaction and JobInvolvement share the same four-step scale.
four_step_scale = {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
ordinal_labels = {
    'Education': {
        1: 'Below college',
        2: 'College',
        3: 'Bachelor',
        4: 'Masters',
        5: 'Doctor',
    },
    'JobSatisfaction': four_step_scale,
    'JobInvolvement': four_step_scale,
    'PerformanceRating': {1: 'Low', 2: 'Good', 3: 'Excellent', 4: 'Outstanding'},
}
for column, labels in ordinal_labels.items():
    df[column] = df[column].replace(labels)

In [None]:
# Columns left out of the analysis; everything else is carried over to new_df.
unused_columns = [
    'EmployeeCount',
    'EmployeeNumber',
    'HourlyRate',
    'Over18',
    'StandardHours',
    'RelationshipSatisfaction',
    'WorkLifeBalance',
    'StockOptionLevel',
    'YearsInCurrentRole',
    'DailyRate',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
new_df = df.drop(columns=unused_columns)
new_df.head()

In [None]:
# Print a concise summary of new_df (column names, non-null counts, dtypes).
new_df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for new_df.
new_df.describe()

In [None]:
# Adding new column 'AgeRange': the decade bucket each employee's Age falls in.
#
# The bin edges are given explicitly. Passing an integer bin count to pd.cut
# would instead split the observed min..max of 'Age' into equal-width bins,
# and those intervals would not line up with the decade labels.
# Intervals are right-closed: (10, 20], (20, 30], ..., (50, 60]; an Age outside
# (10, 60] is assigned NaN.
age_edges = list(range(10, 70, 10))  # 10, 20, 30, 40, 50, 60
age_ranges = [
    "{0} - {1}".format(low, high)
    for low, high in zip(age_edges[:-1], age_edges[1:])
]
count_unique_age_ranges = len(age_ranges)
new_df['AgeRange'] = pd.cut(x=new_df['Age'], bins=age_edges, labels=age_ranges)
new_df.head(6)

In [None]:
# Non-null value count of every column, computed separately per Attrition value
attrition_groups = new_df.groupby('Attrition')
group_df = attrition_groups.count()
group_df.head()

In [None]:
# Number of rows for each Attrition value
new_df.Attrition.value_counts()

In [None]:
# Mean of every numeric column for each Attrition value.
# numeric_only=True is needed because new_df also holds text and categorical
# columns; on pandas >= 2.0 mean() raises TypeError for those instead of
# silently skipping them.
new_df.groupby(['Attrition']).mean(numeric_only=True)

In [None]:
# Mean of every numeric column for each Education level.
# numeric_only=True is needed because new_df also holds text and categorical
# columns; on pandas >= 2.0 mean() raises TypeError for those instead of
# silently skipping them.
new_df.groupby('Education').mean(numeric_only=True)


In [None]:
# Employee counts per Education level, with one bar per Attrition value.
# Education is the crosstab's row index so that it lands on the x-axis and
# matches the 'Education' axis label; with Attrition as the index the x-axis
# showed Attrition values under an 'Education' label.
pd.crosstab(new_df.Education, new_df.Attrition).plot(kind='bar')
plt.title('Attrition level for Education')
plt.xlabel('Education')
plt.ylabel('Attrition Frequency')

In [None]:
# Mean of every numeric column for each Department.
# numeric_only=True is needed because new_df also holds text and categorical
# columns; on pandas >= 2.0 mean() raises TypeError for those instead of
# silently skipping them.
new_df.groupby('Department').mean(numeric_only=True)


In [None]:
# Columns whose value counts are plotted, one subplot per column.
features = [
    'Attrition', 'JobInvolvement', 'Education', 'JobSatisfaction',
    'PerformanceRating', 'DistanceFromHome', 'Gender', 'MaritalStatus',
    'Department', 'AgeRange',
]

# Create the 5x2 grid up front and draw on each Axes explicitly. This avoids
# the extra full-figure Axes that a bare plt.subplots() leaves underneath
# the plt.subplot() panels.
fig, axes = plt.subplots(5, 2, figsize=(15, 20))
fig.subplots_adjust(hspace=1.0)  # vertical room for the rotated tick labels
for ax, column in zip(axes.flat, features):
    sns.countplot(x=column, data=new_df, ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title('NoOfEmployee')

# Save once, after every panel is drawn (not once per loop iteration).
fig.savefig('AboutEmployee_AtIBM_bar_chart')
    
# You can observe the following points in the above visualization:
# NOTE(review): the plots above cover only the columns listed in `features`;
# project counts, promotions and salary bands are not among them. Confirm
# that the observations below apply to this dataset.

# Most employees work on 3-5 projects.
# There is a huge drop between employees with 3 and 4 years of experience.
# The employees who left make up 19% of the total workforce.
# Very few employees received a promotion in the last 5 years.
# The sales department has the largest number of employees, followed by technical and support.
# Most employees receive either a medium or a low salary.

In [None]:
# Same value-count grid as the `features` overview, with each bar split by
# Attrition.
# Create the 5x2 grid up front and draw on each Axes explicitly. This avoids
# the extra full-figure Axes that a bare plt.subplots() leaves underneath
# the plt.subplot() panels.
fig, axes = plt.subplots(5, 2, figsize=(15, 20))
fig.subplots_adjust(hspace=1.0)  # vertical room for the rotated tick labels
for ax, column in zip(axes.flat, features):
    sns.countplot(x=column, data=new_df, hue='Attrition', ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title('No.OfEmployee')

# Save once, after every panel is drawn (not once per loop iteration).
fig.savefig('Factors_of_Attrition_bar_chart')
    
# You can observe the following points in the above visualization:

# Employees who had more than 5 projects left the company.
# Employees who had done 6 or 7 projects left the company; it seems that they were overloaded with work.
# Employees with five years of experience leave more often because they received no promotion in the last 5 years, while those with more than 6 years of experience do not leave because of their attachment to the company.
# Those who were promoted in the last 5 years did not leave, i.e., all those who left had not received a promotion in the previous 5 years.

Data Analysis and Visualization Summary:
Following features are most influencing a person to leave the company:

Promotions: Employees are far more likely to quit their job if they haven't received a promotion in the last 5 years.
Time with Company: The three-year mark appears to be a crucial point in an employee's career. Most of them quit their job around the three-year mark. Another important point is the six-year mark, where employees are very unlikely to leave.
Number Of Projects: Employee engagement is another critical factor that influences employees to leave the company. Employees with 3-5 projects are less likely to leave the company. Employees with fewer or more projects are more likely to leave.
Salary: Most of the employees who quit are in the mid or low salary groups.

# Question: How do age and gender affect employee attrition, and does this tell us or hint at anything about our health?

In [None]:
# Number of employees in each AgeRange bucket.
# `ci` is not passed: a count plot draws no error bars, so the argument had
# no effect, and seaborn >= 0.12 deprecates it.
hg = sns.catplot(x='AgeRange', data=new_df, kind='count')
hg.fig.suptitle('This table shows the Frequency of Age', y=1.03)


In [None]:
# Employee counts per Gender, split by Attrition, with one panel per AgeRange.
# `ci` is not passed: a count plot draws no error bars, so the argument had
# no effect, and seaborn >= 0.12 deprecates it.
hg = sns.catplot(x='Gender', col='AgeRange', data=new_df, kind='count', hue='Attrition')
hg.fig.suptitle('This table shows the level of attrition based on Gender and Age Range', y=1.05)

# Question: How does work performance rating affect employee attrition? In what way does level of education enhance work performance rating (positively/negatively), and how does it affect employee attrition?

In [None]:
# Employee counts per Attrition value, split by Education, one panel per AgeRange.
ag = sns.catplot(
    data=new_df,
    kind='count',
    x='Attrition',
    hue='Education',
    col='AgeRange',
)
ag.fig.suptitle('The effect of education on employee attrition', y=1.05)


In [None]:
# Employee counts per PerformanceRating, split by AgeRange, one panel per Education level.
bg = sns.catplot(
    data=new_df,
    kind='count',
    x='PerformanceRating',
    hue='AgeRange',
    col='Education',
)
bg.fig.suptitle(' This graph shows how Education affects Performance Rating using AgeRange', y=1.05)

In [None]:
# Employee counts per PerformanceRating, split by Education, one panel per AgeRange.
dg = sns.catplot(
    data=new_df,
    kind='count',
    x='PerformanceRating',
    hue='Education',
    col='AgeRange',
)
dg.fig.suptitle('The effect of education on performance rating based on age group', y=1.05)


In [None]:
# Box plot of MonthlyRate per AgeRange, with one box per PerformanceRating.
plt.figure(figsize=(15, 10))
plt.title(
    'This table shows how monhtly rate affects performance rating based on age group',
    y=1.03,
)
# `fig` receives whatever sns.boxplot returns.
fig = sns.boxplot(
    data=new_df,
    x='AgeRange',
    y='MonthlyRate',
    hue='PerformanceRating',
)


In [None]:
#sns.catplot(x='MonthlyRate',col='AgeRange',data=new_df, kind= 'count',hue='PerformanceRating')

# Attrition rate: the share of employees whose Attrition flag is 1, in percent.
attrition_flags = pd.to_numeric(new_df['Attrition'])

# Comparing against the number 1 gives a boolean Series; summing it counts
# the True entries, i.e. the employees who left.
NoAttrition = int((attrition_flags == 1).sum())
print(NoAttrition)

# count() gives the number of non-null Attrition entries.
TotalStaff = int(attrition_flags.count())
# Numbers are converted with str() because str + int concatenation raises TypeError.
print('TotalStaff: ' + str(TotalStaff))

AttritionRate = (NoAttrition / TotalStaff) * 100
print('The Attriton Rate of IBM is : ' + str(AttritionRate))

In [None]:
# Number of employees whose Attrition flag is 1.
# sum() counts the True entries of the comparison; count() would return the
# number of rows regardless of the flag's value.
NoAttrition = int((pd.to_numeric(new_df['Attrition']) == 1).sum())
print(NoAttrition)

# Question: How does distance from home affect work performance rating, and in what way does it lead to employee attrition?

In [None]:
# Heatmap of the median MonthlyIncome for every Gender / AgeRange combination.
# The pivot gets its own name so that the `df` loaded from the CSV is not
# overwritten. aggfunc is given as the string 'median', which pandas
# recommends over passing the NumPy callable.
income_by_gender_age = new_df.pivot_table(
    index='Gender',
    columns='AgeRange',
    values='MonthlyIncome',
    aggfunc='median',
)
sns.heatmap(income_by_gender_age, annot=True, fmt=".1f")
plt.show()

In [None]:
# Heatmap of the median DistanceFromHome for every Attrition / AgeRange combination.
# The pivot gets its own name so that the `df` loaded from the CSV is not
# overwritten. aggfunc is given as the string 'median', which pandas
# recommends over passing the NumPy callable.
distance_by_attrition_age = new_df.pivot_table(
    index='Attrition',
    columns='AgeRange',
    values='DistanceFromHome',
    aggfunc='median',
)
sns.heatmap(distance_by_attrition_age, annot=True, fmt=".1f")
plt.show()

# Question: How is job satisfaction rate a factor for employee attrition?

In [None]:
# Employee counts per JobSatisfaction level, split by Attrition, with one
# panel per AgeRange.
# hue='Attrition' is what ties the plot to attrition; without it the chart
# only showed how satisfaction levels are distributed within each age group.
# A median heatmap is not applicable here because JobSatisfaction holds text
# labels ('Low' ... 'Very High') after the earlier recoding.
gg = sns.catplot(x='JobSatisfaction', hue='Attrition', col='AgeRange', data=new_df, kind='count')
# The title describes this plot (it previously repeated the education /
# performance-rating title).
gg.fig.suptitle('Job satisfaction and attrition by age group', y=1.05)

In [None]:
# Line plot of TotalWorkingYears against Attrition, one panel per AgeRange.
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of
# `errorbar=None`; kept as-is for compatibility with older seaborn.
sns.relplot(
    data=new_df,
    kind="line",
    x="Attrition",
    y="TotalWorkingYears",
    col='AgeRange',
    ci=None,
)


In [None]:
#sns.relplot(x='Attrition',y='TotalWorkingYears',hue='MaritalStatus',size='AgeRange',col='Gender',data=new_df)

# Line plot of TotalWorkingYears against AgeRange, one panel per JobSatisfaction level.
sns.relplot(
    data=new_df,
    kind="line",
    x="AgeRange",
    y="TotalWorkingYears",
    col='JobSatisfaction',
)

In [None]:
# Line plot of TotalWorkingYears against JobSatisfaction, one panel per
# AgeRange, with the line size varying by Education.
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of
# `errorbar=None`; kept as-is for compatibility with older seaborn.
sns.relplot(
    data=new_df,
    kind="line",
    x="JobSatisfaction",
    y="TotalWorkingYears",
    col='AgeRange',
    size='Education',
    ci=None,
)


# Question: Who is at risk of employee attrition: 
            #Male: Single, married or divorced
            #Female: Single, married or divorced


In [None]:
# Gender against AgeRange, coloured by MaritalStatus, with one panel per
# Attrition value (relplot's default plot kind).
sns.relplot(
    data=new_df,
    x='AgeRange',
    y='Gender',
    hue='MaritalStatus',
    col='Attrition',
)