In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.stats import wilcoxon
from scipy.stats import kruskal
from scipy.stats import friedmanchisquare
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency
from scipy.stats import ttest_rel
from scipy.stats import ttest_1samp
from scipy.stats import ttest_ind

In [3]:
# to visualise all the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

In [4]:
df = pd.read_csv("general_data.csv")
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [6]:
df.dropna(inplace=True)

In [7]:
df.drop_duplicates()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,1,Healthcare Representative,Married,131160,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,1,Research Scientist,Single,41890,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,4,Sales Executive,Married,193280,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,3,Human Resources,Married,83210,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,1,Sales Executive,Single,23420,4.0,Y,12,8,2,9.0,2,6,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4404,29,No,Travel_Rarely,Sales,4,3,Other,1,4405,Female,2,Human Resources,Single,35390,1.0,Y,18,8,0,6.0,2,6,1,5
4405,42,No,Travel_Rarely,Research & Development,5,4,Medical,1,4406,Female,1,Research Scientist,Single,60290,3.0,Y,17,8,1,10.0,5,3,0,2
4406,29,No,Travel_Rarely,Research & Development,2,4,Medical,1,4407,Male,1,Laboratory Technician,Divorced,26790,2.0,Y,15,8,0,10.0,2,3,0,2
4407,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,1,4408,Male,2,Sales Executive,Married,37020,0.0,Y,20,8,0,5.0,4,4,1,2


In [8]:
df['Attrition'] = df.Attrition.map({'Yes':1, 'No':0})

In [9]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,1,Healthcare Representative,Married,131160,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,1,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,1,Research Scientist,Single,41890,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,0,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,4,Sales Executive,Married,193280,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,0,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,3,Human Resources,Married,83210,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,0,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,1,Sales Executive,Single,23420,4.0,Y,12,8,2,9.0,2,6,0,4


In [10]:
df.Attrition.value_counts()

0    3677
1     705
Name: Attrition, dtype: int64

### Displaying correlation for all the variables at once

In [11]:
df.corr()

Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EmployeeCount,EmployeeID,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,-0.158399,0.007376,-0.0339,,0.008105,-0.001137,-0.045163,0.299527,-0.032561,,-0.031504,0.680037,-0.028962,0.311281,0.21565,0.20158
Attrition,-0.158399,1.0,-0.009449,-0.017106,,-0.004621,-0.012382,-0.03016,0.042831,0.033153,,-0.008164,-0.16967,-0.047586,-0.133003,-0.031423,-0.154692
DistanceFromHome,0.007376,-0.009449,1.0,-0.007491,,-0.000326,-0.03999,-0.022757,-0.014449,0.03772,,0.009353,0.009574,-0.008957,0.030746,0.002243,0.021773
Education,-0.0339,-0.017106,-0.007491,1.0,,-0.009389,0.045822,0.007289,-0.01621,-0.041054,,0.002386,-0.009228,0.009939,0.005997,0.023457,0.005645
EmployeeCount,,,,,,,,,,,,,,,,,
EmployeeID,0.008105,-0.004621,-0.000326,-0.009389,,1.0,-0.00309,0.007865,0.000719,-0.004877,,-0.013488,-0.001688,-0.012102,0.004117,0.000814,0.009079
JobLevel,-0.001137,-0.012382,-0.03999,0.045822,,-0.00309,1.0,0.046688,-0.009759,0.010874,,0.000365,-0.036293,-0.031931,-0.06336,-0.05968,-0.053898
MonthlyIncome,-0.045163,-0.03016,-0.022757,0.007289,,0.007865,0.046688,1.0,-0.021446,0.004607,,0.027242,-0.034398,0.04978,8.8e-05,0.06447,0.023095
NumCompaniesWorked,0.299527,0.042831,-0.014449,-0.01621,,0.000719,-0.009759,-0.021446,1.0,0.030064,,0.016291,0.238807,-0.031335,-0.117213,-0.035855,-0.109372
PercentSalaryHike,-0.032561,0.033153,0.03772,-0.041054,,-0.004877,0.010874,0.004607,0.030064,1.0,,0.012104,-0.01848,-0.03672,-0.029022,-0.028654,-0.039687


# Attrition and Age

In [12]:
stats, p= pearsonr(df.Attrition,df.Age)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.15839867954096706 and value of p is 5.1265982193975044e-26


## Age is not correlated with the Attrition rate

# Attrition and BusinessTravel

In [14]:
chi2_contingency(pd.crosstab(df.Attrition, df.BusinessTravel))

(70.07594084831366,
 6.07019702736392e-16,
 2,
 array([[ 375.92332268,  692.26951164, 2608.80716568],
        [  72.07667732,  132.73048836,  500.19283432]]))

## Business Travel is not correlated with Attrition rate

# Attrition and Department

In [16]:
chi2_contingency(pd.crosstab(df.Attrition, df.Department))

(25.89432541916022,
 2.382970570769315e-06,
 2,
 array([[ 156.91442264, 2404.06321314, 1116.02236422],
        [  30.08557736,  460.93678686,  213.97763578]]))

## Department is not correlated with Attrition rate

# Attrition and DistanceFromHome

In [15]:
stats, p= pearsonr(df.Attrition,df.DistanceFromHome)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.009730141010179659 and value of p is 0.5182860428065224


## DistanceFromHome is negatively correlated with Attrition rate

# Attrition and Education

In [16]:
stats, p= pearsonr(df.Attrition,df.Education)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.015111167710968744 and value of p is 0.3157293177115451


In [17]:
chi2_contingency(pd.crosstab(df.Attrition, df.Education))

(6.2735361781270615,
 0.17963050984273224,
 4,
 array([[ 426.27019626,  706.53445915, 1427.33386581,  996.86809676,
          119.99338202],
        [  81.72980374,  135.46554085,  273.66613419,  191.13190324,
           23.00661798]]))

## Education is correlated with Attrition

# Attrition and EducationField

In [19]:
chi2_contingency(pd.crosstab(df.Attrition, df.EducationField))

(43.132860148900384,
 3.472896220587872e-08,
 5,
 array([[  67.12916476, 1515.44089457,  398.57941579, 1162.17366499,
          204.74395253,  328.93290735],
        [  12.87083524,  290.55910543,   76.42058421,  222.82633501,
           39.25604747,   63.06709265]]))

## EducationField  has relation with Attrition

# Attrition and Gender

In [21]:
chi2_contingency(pd.crosstab(df.Attrition, df.Gender))

(1.3825823839528295,
 0.23966176275638887,
 1,
 array([[1473.48516659, 2203.51483341],
        [ 282.51483341,  422.48516659]]))

## Gender is correlated with Attrition rate

# Attrition and EmployeeID

In [17]:
stats, p= pearsonr(df.Attrition,df.EmployeeID)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.004729122995066088 and value of p is 0.753548740189224


# Attrition and JobLevel

In [18]:
stats, p= pearsonr(df.Attrition,df.JobLevel)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.01028971328749503 and value of p is 0.4945171727200731


In [23]:
chi2_contingency(pd.crosstab(df.Attrition, df.JobLevel))

(7.036480574687178,
 0.13397290697032432,
 4,
 array([[1358.52647193, 1334.1921497 ,  546.26357827,  266.83842994,
          171.17937015],
        [ 260.47352807,  255.8078503 ,  104.73642173,   51.16157006,
           32.82062985]]))

## JobLevel is correlated with Attrition rate

# Attrition and JobRole

In [25]:
chi2_contingency(pd.crosstab(df.Attrition, df.JobRole))

(21.893724958847,
 0.005116592717526599,
 8,
 array([[326.41556367, 130.90187129, 648.63555454, 255.92994067,
         359.98014605, 198.87015062, 731.70789594, 818.13669557,
         206.42218165],
        [ 62.58443633,  25.09812871, 124.36444546,  49.07005933,
          69.01985395,  38.12984938, 140.29210406, 156.86330443,
          39.57781835]]))

## JobRole is not correlated with attrition rate

# Attrition and MaritalStatus

In [26]:
chi2_contingency(pd.crosstab(df.Attrition, df.MaritalStatus))

(133.85785802925156,
 8.573051828219379e-30,
 2,
 array([[ 813.94112277, 1684.10292104, 1178.95595618],
        [ 156.05887723,  322.89707896,  226.04404382]]))

##  Marital Status is not related with Artrition rate

# Attrition and MonthlyIncome

In [19]:
stats, p= pearsonr(df.Attrition,df.MonthlyIncome)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.031176281698115017 and value of p is 0.038427484905971684


# Attrition and NumCompaniesWorked

In [20]:
stats, p= pearsonr(df.Attrition,df.NumCompaniesWorked)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is nan and value of p is 1.0


# Attrition and PercentSalaryHike

In [21]:
stats, p= pearsonr(df.Attrition,df.PercentSalaryHike)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is 0.032532594891053514 and value of p is 0.030743386433316814


# Attrition and StockOptionLevel

In [22]:
stats, p= pearsonr(df.Attrition,df.StockOptionLevel)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.006838852403261549 and value of p is 0.6498072937492487


In [27]:

chi2_contingency(pd.crosstab(df.Attrition, df.StockOptionLevel))

(3.444801419724394,
 0.32799207189761653,
 3,
 array([[1577.53537198, 1490.26745778,  396.06207211,  213.13509813],
        [ 302.46462802,  285.73254222,   75.93792789,   40.86490187]]))

## StockOptionLevel has relation with artrition rate

# Attrition and TotalWorkingYears

In [23]:
stats, p= pearsonr(df.Attrition,df.TotalWorkingYears)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is nan and value of p is 1.0


# Attrition and TrainingTimesLastYear

In [24]:
stats, p= pearsonr(df.Attrition,df.TrainingTimesLastYear)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.04943057624425504 and value of p is 0.0010247061915349563


# Attrition and YearsAtCompany

In [25]:
stats, p= pearsonr(df.Attrition,df.YearsAtCompany)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.13439221398997717 and value of p is 3.16388312248436e-19


# Attrition and YearsSinceLastPromotion

In [26]:
stats, p= pearsonr(df.Attrition,df.YearsSinceLastPromotion)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.03301877514258439 and value of p is 0.02833033618936241


# Attrition and YearsWithCurrManager

In [27]:
stats, p= pearsonr(df.Attrition,df.YearsWithCurrManager)
print(f"The value of stats is {stats} and value of p is {p}")

The value of stats is -0.15619931590162842 and value of p is 1.7339322652874626e-25
