In [2]:
# imports
from pathlib import Path
import pandas as pd
import plotly.express as px
import plotly.io as pio
# Make "simple_white" the default plotly template for the figures created below
pio.templates.default = "simple_white"

# import to ignore warning output
# NOTE(review): no category or module is passed, so this hides *every* warning,
# including pandas SettingWithCopyWarning / FutureWarning messages that may
# point at real problems in later cells — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the dataset from a CSV file (path is relative to the working directory)
path = Path('data.csv')
df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows as the cell output.
# NOTE(review): the rendered preview contains an 'Unnamed: 0' column, which
# suggests the CSV was written together with its index — confirm, and consider
# reading it with index_col=0 if that column is not needed.
df.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


# Employee Demographics

# Exploratory Analysis of Attrition 

In [35]:
# Columns reported as most important when predicting Attrition True, plus the
# Attrition label itself so the plots in the following cells can be split by it.
# The feature importance CSV is included in the reporting directory.
# NOTE: despite the "top10" name, the selection holds 7 features + Attrition.
column_feature_top10 = df[[
    'OverTime',
    'JobRole',
    'BusinessTravel',
    'DistanceFromHome',
    'MaritalStatus',
    'NumCompaniesWorked',
    'YearsSinceLastPromotion',
    'Attrition',
]]

# Disabled: loop through every selected column and plot its histogram.
# Kept as `#` comments instead of a triple-quoted string: a bare string as the
# last expression of a cell is evaluated and echoed back as the cell output.
# for i in column_feature_top10:
#     fig = px.histogram(df, x=i)
#     fig.show()

In [36]:
# OverTime and attrition: grouped bars show, for each OverTime value, one bar
# per Attrition value.
fig = px.histogram(
    column_feature_top10,
    x='OverTime',
    color='Attrition',
    barmode='group',
    # The data is not filtered to leavers — every Attrition value is drawn —
    # so the title describes a split by attrition rather than "Attrition is True".
    title="Count of Over Time by Attrition Status",
)
fig.show()

In [39]:
# BusinessTravel and attrition: grouped bars show, for each travel level, one
# bar per Attrition value.
# TODO: normalize dataset here (raw counts are plotted for now)
fig = px.histogram(
    column_feature_top10,
    x='BusinessTravel',
    color='Attrition',
    barmode='group',
    # The data is not filtered to leavers — every Attrition value is drawn —
    # so the title describes a split by attrition rather than "Attrition is True".
    title='Level of Business Travel by Attrition Status',
)
fig.show()

In [41]:
# Job Role and attrition: grouped bars show, for each JobRole, one bar per
# Attrition value.
fig = px.histogram(
    column_feature_top10,
    x='JobRole',
    color='Attrition',
    barmode='group',
    # The x axis is JobRole, not Department, so the title names the job role.
    title='Headcount by Job Role Split by Attrition Status',
)
fig.show()

In [131]:
# Marital status: grouped bars, one bar per Attrition value for every
# MaritalStatus value (plotted from the full dataframe).
fig = px.histogram(
    df,
    x='MaritalStatus',
    color='Attrition',
    barmode='group',
    title='Count of Marital Status by Attrition',
)
fig.show()

In [37]:
# Columns reported as most important when predicting Attrition False.
# The feature importance CSV is included in the reporting directory.
# NOTE: despite the "bottom10" name, the selection holds 6 columns and does
# not include the Attrition label.
column_feature_bottom10 = df[[
    'JobLevel',
    'WorkLifeBalance',
    'JobInvolvement',
    'EnvironmentSatisfaction',
    'StockOptionLevel',
    'JobRole',
]]

# Disabled: loop through every selected column and plot its histogram.
# Kept as `#` comments instead of a triple-quoted string: a bare string as the
# last expression of a cell is evaluated and echoed back as the cell output.
# for i in column_feature_bottom10:
#     fig = px.histogram(df, x=i)
#     fig.show()

In [56]:
# Environment satisfaction: grouped bars, one bar per Attrition value for
# every EnvironmentSatisfaction value.
fig = px.histogram(
    df,
    x="EnvironmentSatisfaction",
    color="Attrition",
    barmode='group',
    title="Environment Satisfaction by Attrition Status",
)
fig.show()

In [48]:
# Horizontal bars of DailyRate per Attrition value, coloured by Department,
# restricted to rows whose Gender is 'Female'.
# TODO: convert to percentage
fig = px.bar(
    df[df['Gender'] == 'Female'],
    x='DailyRate',
    y='Attrition',
    color='Department',
    orientation='h',
    title="Female Employee's Daily Rate by Attrition Status",
)
fig.show()

In [58]:
# Horizontal bars of DailyRate per Attrition value, coloured by Department,
# restricted to rows whose Gender is 'Male'.
fig = px.bar(
    df[df['Gender'] == 'Male'],
    x='DailyRate',
    y='Attrition',
    color='Department',
    orientation='h',
    title="Male Employee's Daily Rate by Attrition Status",
)
fig.show()

# Company Demographics

In [8]:
# Plot ProfitPCT (x) against Department (y), coloured by Gender.
# NOTE(review): in the part of the notebook visible here, `avg_profit` is only
# created in a cell that appears *below* this one, so a fresh top-to-bottom run
# would fail on this cell — confirm the intended cell order.
fig = px.histogram(
    avg_profit,
    x='ProfitPCT',
    y='Department',
    color='Gender',
    title='Distribution of Profit as Percent by Department and Gender',
)
fig.show()

In [6]:
# avg profit across department
# monthly rate - monthly income = firm's monthly profit
#
# .copy() makes `avg_profit` an independent frame, so the column assignments
# below do not write into a slice of `df` (pandas SettingWithCopyWarning).
avg_profit = df[[
    'Department',
    'JobRole',
    'MonthlyIncome',
    'MonthlyRate',
    'Attrition',
    'Gender',
    'EducationField',
    'Education',
    'Age',
    'DailyRate',
]].copy()

# Column arithmetic instead of a row-wise apply: same values, computed in one pass.
avg_profit['MonthlyProfit'] = avg_profit['MonthlyRate'] - avg_profit['MonthlyIncome']

# Each row's share of the total monthly profit, in percent. The total is
# computed once here rather than once per row inside a lambda.
total_monthly_profit = avg_profit['MonthlyProfit'].sum()
avg_profit['ProfitPCT'] = (avg_profit['MonthlyProfit'] / total_monthly_profit) * 100

# plot the data
fig = px.histogram(
    avg_profit,
    x='ProfitPCT',
    color='Department',
    title='Distribution of Profit as Percent by Department',
)
fig.show()

In [102]:
# Plot ProfitPCT (x) against Attrition (y), coloured by Department.
# TODO: update title
fig = px.histogram(
    avg_profit,
    x='ProfitPCT',
    y='Attrition',
    color='Department',
    title='Distribution of Profit as Percent by Attrition and Department',
)
fig.show()

In [24]:
# Each row's share of the total MonthlyRate, in percent. Computed with column
# arithmetic so the total is summed once instead of once per row inside a
# row-wise apply.
avg_profit['MonthlyRatePCT'] = (
    avg_profit['MonthlyRate'] / avg_profit['MonthlyRate'].sum() * 100
)

# plot the data: MonthlyRatePCT (x) against Department (y), coloured by EducationField
fig = px.histogram(
    avg_profit,
    x="MonthlyRatePCT",
    y='Department',
    color='EducationField',
    title='Monthly Rate as Percentage by Department and Education Field',
)
fig.show()

In [6]:
# find avg profit per department
# numeric_only=True restricts the mean to numeric columns. `avg_profit` also
# holds text columns (e.g. JobRole, Gender), and recent pandas versions raise a
# TypeError when asked to average those instead of silently dropping them.
mean_profit_slice = (
    avg_profit
    .groupby('Department')
    .mean(numeric_only=True)
    .round(2)
)

# plot the data: one bar per department (the group index) with its mean MonthlyProfit
fig = px.bar(
    mean_profit_slice,
    x=mean_profit_slice.index,
    y='MonthlyProfit',
    title="Average Monthly Profit by Department",
)
fig.show()

In [7]:
# Scatter of MonthlyProfit (x) against Age (y); points are coloured by
# Department and sized by DailyRate.
fig = px.scatter(
    avg_profit,
    x="MonthlyProfit",
    y='Age',
    color='Department',
    size="DailyRate",
    title="Distribution of Monthly Profit by Employee Age",
)
fig.show()