In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [None]:
# Load the HR employee attrition CSV (the path is relative to the current
# working directory) and preview the first rows.
csv_path = 'Downloads/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head()

In [None]:
# Descriptive statistics for the numeric columns of the raw frame.
summary_stats = df.describe()
summary_stats

In [None]:
# Print the frame's structural summary (columns, non-null counts, dtypes)
# so type or completeness problems are visible before any transformation.
df.info()

In [None]:
# Encode the binary 'Attrition', 'OverTime' and 'Over18' columns as integers
# (1 for the positive label, 0 for anything else) before any visualization.
#
# Matching against both the raw label and 1 keeps this cell idempotent:
# re-running it once the columns are already encoded leaves them unchanged,
# whereas comparing only against the raw label would turn every value into 0.
# The dtype is pinned to int64 so the result does not depend on the platform.

df['Attrition'] = df['Attrition'].isin(['Yes', 1]).astype('int64')
df['Over18'] = df['Over18'].isin(['Y', 1]).astype('int64')
df['OverTime'] = df['OverTime'].isin(['Yes', 1]).astype('int64')

In [None]:
# Report whether any cell in the frame is null, then list the null count of
# every column so a problem column can be spotted directly.

missing_per_column = df.isnull().sum()
has_missing = missing_per_column.sum() != 0
print('CHECK: Missing data found \n' if has_missing else 'CHECK: No missing data \n')
print(missing_per_column)

In [None]:
# Class balance of the target: bar 0 is relabelled 'Stayed', bar 1 'Left'.
attrition_ax = sns.countplot(data=df, x='Attrition', hue='Attrition', legend=False)
attrition_ax.set_xlabel('Attrition')
attrition_ax.set_ylabel('Count')
attrition_ax.set_xticks([0, 1])
attrition_ax.set_xticklabels(['Stayed', 'Left'])
plt.show()

In [None]:
# One histogram per column that DataFrame.hist can plot, drawn on a single
# large grid.
hist_options = dict(bins=50, figsize=(20, 20))
df.hist(**hist_options)
plt.show()

In [None]:
# 'EmployeeCount', 'StandardHours' and 'Over18' do not change from one
# employee to the other, so they are dropped. 'EmployeeNumber' is dropped as
# well (presumably an identifier rather than a feature - confirm).
#
# errors='ignore' keeps the cell re-runnable: a second execution, when the
# columns are already gone, is a no-op instead of raising KeyError.
df = df.drop(
    columns=['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber'],
    errors='ignore',
)

In [None]:
# Split the employees by attrition outcome and report how many left/stayed.
# The percentages make the class imbalance of the dataset visible.
#
# Counts come from the lengths of the two splits rather than from label
# lookups on value_counts(), so a class with zero members prints 0 instead
# of raising KeyError, and nothing is recomputed per print statement.

left_df = df[df['Attrition'] == 1]
stayed_df = df[df['Attrition'] == 0]

total_employees = len(df)
left_count = len(left_df)
stayed_count = len(stayed_df)

# Guard the division so an empty frame reports 0.0% instead of crashing.
left_pct = round(left_count / total_employees * 100, 2) if total_employees else 0.0
stayed_pct = round(stayed_count / total_employees * 100, 2) if total_employees else 0.0

print(f'Total Employees: {total_employees}\n')
print(f'Number of employees who left: {left_count}')
print(f'% of employees who left: {left_pct}%\n')
print(f'Number of employees who stayed: {stayed_count}')
print(f'% of employees who stayed: {stayed_pct}%')

In [None]:
# Side-by-side distributions (histogram + KDE), split by attrition outcome,
# for three numeric features.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
for panel, column in zip(ax, ('Age', 'DailyRate', 'DistanceFromHome')):
    sns.histplot(data=df, x=column, hue='Attrition', kde=True, ax=panel)
plt.show()

In [None]:
# Spearman's rank correlation is used because the data is treated as
# non-parametric (not normally distributed). Only numeric columns take part.
correlations = df.corr(method='spearman', numeric_only=True)
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(correlations, annot=True, ax=ax)

In [None]:
# Number of employees at each age, split by attrition outcome.
plt.figure(figsize=(25, 12))
sns.countplot(data=df, x='Age', hue='Attrition')

In [None]:
# One stacked panel per categorical feature, each split by attrition outcome.
# The panels are created up front and filled in a loop so that adding or
# removing a feature is a one-line change.
categorical_features = ('JobRole', 'MaritalStatus', 'JobInvolvement', 'JobLevel')
fig, axes = plt.subplots(nrows=len(categorical_features), ncols=1, figsize=(20, 20))
for panel, feature in zip(axes, categorical_features):
    sns.countplot(data=df, x=feature, hue='Attrition', ax=panel)

# Keep tick labels of one panel from colliding with the panel below it, and
# end with plt.show() so the cell does not echo an Axes repr.
fig.tight_layout()
plt.show()

In [None]:
# Overlaid density estimates of commute distance for leavers (red) and
# stayers (blue).
plt.figure(figsize=(12, 7))
for subset, legend_label, shade in (
    (left_df, 'Employees who left', 'r'),
    (stayed_df, 'Employees who Stayed', 'b'),
):
    sns.kdeplot(subset['DistanceFromHome'], label=legend_label, fill=True, color=shade)
plt.xlabel('Distance From Home')
plt.legend()

In [None]:
# Overlaid density estimates of time under the current manager for leavers
# (red) and stayers (blue).
plt.figure(figsize=(12, 7))
for subset, legend_label, shade in (
    (left_df, 'Employees who left', 'r'),
    (stayed_df, 'Employees who Stayed', 'b'),
):
    sns.kdeplot(subset['YearsWithCurrManager'], label=legend_label, fill=True, color=shade)
plt.xlabel('Years With Current Manager')
plt.legend()

In [None]:
# Overlaid density estimates of total working years for leavers (red) and
# stayers (blue).
plt.figure(figsize=(12, 7))
for subset, legend_label, shade in (
    (left_df, 'Employees who left', 'r'),
    (stayed_df, 'Employees who Stayed', 'b'),
):
    sns.kdeplot(subset['TotalWorkingYears'], label=legend_label, fill=True, color=shade)
plt.xlabel('Total Working Years')
plt.legend()

In [None]:
# Monthly income distribution for each gender.
plt.figure(figsize=(12, 8))
sns.boxplot(data=df, x='Gender', y='MonthlyIncome', hue='Gender')

In [None]:
# Monthly income distribution for each job role, drawn horizontally so the
# role names sit on the y axis.
plt.figure(figsize=(15, 10))
sns.boxplot(data=df, x='MonthlyIncome', y='JobRole', hue='JobRole')

In [None]:
# Job satisfaction by attrition outcome, with one box per gender inside each
# outcome. Only the three columns the plot needs are selected.
box_columns = ['Attrition', 'JobSatisfaction', 'Gender']
box_attrition = df[box_columns]

plt.figure()
box_ax = sns.boxplot(data=box_attrition, x='Attrition', y='JobSatisfaction', hue='Gender')
box_ax.set_xlabel('Attrition')
box_ax.set_ylabel('Job Satisfaction')
box_ax.set_title('Boxplot of Job Satisfaction by Attrition and Gender')
box_ax.legend(title='Gender')
plt.show()

In [None]:
# Number of employees at each job satisfaction level.
satisfaction_counts = df['JobSatisfaction'].value_counts()
satisfaction_counts

In [None]:
# Job satisfaction by gender, with one box per attrition outcome inside each
# gender.
gender_ax = sns.boxplot(data=df, x='Gender', y='JobSatisfaction', hue='Attrition')
gender_ax.set_title('Boxplot of Job Satisfaction by Attrition and Gender')
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Job Satisfaction')

plt.tight_layout()
plt.show()

