In [11]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns


from scipy.stats import (
                         shapiro,
                         levene,
                         mannwhitneyu,
                         ttest_ind,
                         chi2_contingency,
                         fisher_exact)

pd.set_option('display.max_columns', None)

In [2]:
data_path = Path('../../Artifacts/Attrition.csv').resolve() 
df = pd.read_csv(data_path)

In [3]:
df.sample(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1047,33,No,Travel_Frequently,430,Sales,7,3,Medical,1,1477,4,Male,54,3,2,Sales Executive,1,Married,4373,17456,0,Y,No,14,3,1,80,2,5,2,3,4,3,0,3
1048,34,No,Travel_Rarely,1326,Sales,3,3,Other,1,1478,4,Male,81,1,2,Sales Executive,1,Single,4759,15891,3,Y,No,18,3,4,80,0,15,2,3,13,9,3,12
1214,44,No,Travel_Rarely,921,Research & Development,2,3,Life Sciences,1,1703,3,Female,96,4,3,Healthcare Representative,4,Married,7879,14810,1,Y,Yes,19,3,2,80,1,9,2,3,8,7,6,7
1338,30,Yes,Travel_Rarely,945,Sales,9,3,Medical,1,1876,2,Male,89,3,1,Sales Representative,4,Single,1081,16019,1,Y,No,13,3,3,80,0,1,3,2,1,0,0,0
200,27,No,Travel_Frequently,472,Research & Development,1,1,Technical Degree,1,274,3,Male,60,2,2,Manufacturing Director,1,Married,4298,9679,5,Y,No,19,3,3,80,1,6,1,3,2,2,2,0


In [4]:
df.shape

(1470, 35)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [6]:
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [7]:
df.select_dtypes(exclude='object').describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
DailyRate,1470.0,802.485714,403.5091,102.0,465.0,802.0,1157.0,1499.0
DistanceFromHome,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
Education,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0
EmployeeCount,1470.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
EmployeeNumber,1470.0,1024.865306,602.024335,1.0,491.25,1020.5,1555.75,2068.0
EnvironmentSatisfaction,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0
HourlyRate,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
JobInvolvement,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0
JobLevel,1470.0,2.063946,1.10694,1.0,1.0,2.0,3.0,5.0


In [9]:
num_features = df.select_dtypes(exclude='object').columns.to_list()
cat_features = df.select_dtypes(include='object').columns.to_list()

descrete_features = list()
continues_features = list()

for feature in num_features:
    if df[feature].nunique() > 5:
        continues_features.append(feature)
    else:
        descrete_features.append(feature)    

cat_features.remove('Attrition')


target_feature = 'Attrition'

In [42]:
significant_num_features = list()

def checkNormality(sample_1,sample_2):
    print('Shapiro-Wilk Test')
    # Null Hypothesis (H0): the data was drawn from a normal distribution.
    # Alternative Hypothesis (H1): the data was not drawn from a normal distribution.

    stats_1,p_val_1 = shapiro(sample_1)
    stats_2,p_val_2 = shapiro(sample_2)
    print()
    print(f'sample 1 : stats {stats_1}, p-val: {p_val_1}')
    print(f'sample 1 : stats {stats_2}, p-val: {p_val_2}')
    result = p_val_2 > 0.05 or p_val_1 > 0.05
    return result


def checkVarianceHomogenisity(sample_1,sample_2):
    print('levene Test')
    # Null Hypothesis (H0): The variances are equal across the groups.
    # Alternative Hypothesis (H1): The variances are not equal across the groups.

    stats, p_val = levene(sample_1, sample_2)
    print()
    print(f'sample 1 : stats {stats}, p-val: {p_val}')
    result = p_val > 0.05
    return result


def MannWhitneyTest(sample_1, sample_2):
    print('Mann-Whitney U test')
    # Null Hypothesis (H0): The distributions of the two groups are equal.
    # Alternative Hypothesis (H1): The distributions of the two groups are not equal.

    stats, p_val = mannwhitneyu(sample_1, sample_2)
    print()
    print(f'sample 1 : stats {stats}, p-val: {p_val}')
    result = p_val > 0.05
    return result
        

def Ttest(sample_1, sample_2,var_same=True):
    if var_same:
        print('T-test of Independence')
    else:
        print("Wetch's T-test")    
    # Null Hypothesis (H0): The distributions of the two groups are equal.
    # Alternative Hypothesis (H1): The distributions of the two groups are not equal.

    stats, p_val = ttest_ind(sample_1, sample_2,equal_var=var_same)
    print()
    print(f'sample 1 : stats {stats}, p-val: {p_val}')
    result = p_val > 0.05
    return result


def CheckSignificance(feature):
    print('levene Test')
    # taking out random samples
    class_samples_1 = df[df['Attrition'] == 'Yes'][feature].sample(200)
    class_samples_2 = df[df['Attrition'] == 'No'][feature].sample(200)
    print(feature)
    print('-'*25)

    

    if checkNormality(class_samples_1,class_samples_2):
        print('The samples not follows normal distribution. (Reject H0) ✓')
        if checkVarianceHomogenisity(class_samples_1,class_samples_2):
            print(f'This variance of two given samples are homogeneus. (Reject H0) ✓') 
            if Ttest(class_samples_1, class_samples_2):
                print(f'This means that there is a significant difference between the two groups. (Reject H0) ✓')  
                significant_num_features.append(feature)     
            else:
                 print(f'This means that there is no significant difference between the two groups. (Accept H0) ✗')   
        else:
            if MannWhitneyTest(class_samples_1, class_samples_2):
                print(f'This means that there is a significant difference between the two groups. (Reject H0) ✓')  
                significant_num_features.append(feature)     
            else:
                print(f'This means that there is no significant difference between the two groups. (Accept H0) ✗')
    else:
        print('The samples follows normal distribution. (Reject H0) ✓')
        if checkVarianceHomogenisity(class_samples_1,class_samples_2):
            print(f'This variance of two given samples are homogeneus. (Reject H0) ✓') 
            if Ttest(class_samples_1, class_samples_2,False):
                print(f'This means that there is a significant difference between the two groups. (Reject H0) ✓')  
                significant_num_features.append(feature)     
            else:
                 print(f'This means that there is no significant difference between the two groups. (Accept H0) ✗')   
        else:
            if MannWhitneyTest(class_samples_1, class_samples_2):
                print(f'This means that there is a significant difference between the two groups. (Reject H0) ✓')  
                significant_num_features.append(feature)     
            else:
                print(f'This means that there is no significant difference between the two groups. (Accept H0) ✗')
    print('-'*25)

for feature in continues_features:
    CheckSignificance(feature)

levene Test
Age
-------------------------
Shapiro-Wilk Test

sample 1 : stats 0.9373736490696776, p-val: 1.362509688736108e-07
sample 1 : stats 0.9675995011078379, p-val: 0.00014434326964359677
The samples follows normal distribution. (Reject H0) ✓
levene Test

sample 1 : stats 1.9800408975945734, p-val: 0.16016653083408341
This variance of two given samples are homogeneus. (Reject H0) ✓
Wetch's T-test

sample 1 : stats -4.078506436178763, p-val: 5.506467962506044e-05
This means that there is no significant difference between the two groups. (Accept H0) ✗
-------------------------
levene Test
DailyRate
-------------------------
Shapiro-Wilk Test

sample 1 : stats 0.9364352608192787, p-val: 1.1349516612551868e-07
sample 1 : stats 0.9478143763829645, p-val: 1.1682095492745311e-06
The samples follows normal distribution. (Reject H0) ✓
levene Test

sample 1 : stats 0.030627774209332277, p-val: 0.8611624006299059
This variance of two given samples are homogeneus. (Reject H0) ✓
Wetch's T-tes

In [16]:
significant_num_features

['DistanceFromHome',
 'EmployeeNumber',
 'HourlyRate',
 'MonthlyRate',
 'PercentSalaryHike',
 'TrainingTimesLastYear',
 'YearsSinceLastPromotion']

In [17]:
for feature in descrete_features:
    print(feature,df[feature].nunique())

descrete_features = [feature for feature in descrete_features if df[feature].nunique() > 1]  

Education 5
EmployeeCount 1
EnvironmentSatisfaction 4
JobInvolvement 4
JobLevel 5
JobSatisfaction 4
PerformanceRating 2
RelationshipSatisfaction 4
StandardHours 1
StockOptionLevel 4
WorkLifeBalance 4


In [18]:
for feature in cat_features:
    print(feature,df[feature].nunique())
    
cat_features = [feature for feature in cat_features if df[feature].nunique() > 1]  

BusinessTravel 3
Department 3
EducationField 6
Gender 2
JobRole 9
MaritalStatus 3
Over18 1
OverTime 2


In [38]:
significant_cat_features = list()

def significant_bin_cat_features(feature,target):
    print(feature)
    feat,target = df[feature].sample(500),df[target].sample(500)
    contingency_table = pd.crosstab(feat,target)
    print('-'*25)

    result = chi2_contingency(contingency_table)
    if (result.expected_freq < 5).any():
        p_value = result.pvalue
        print(f'The Frequency count greater than 5. Assumption satisfied ✓') 
        if p_value < 0.05:
            print("Reject the null hypothesis: There is an association between the two categorical variables. ✓")
            significant_cat_features.append(feature)
        else:
            print("Fail to reject the null hypothesis: There is no association between the two categorical variables. ✗") 
    else:
        print(f'The Frequency count is greater than 5. Assumption violated ✗')
        print()
        print('Fisher exact test')

        # Null Hypothesis (H0): There is no association between the two categorical variables.
        # Alternative Hypothesis (H1): There is an association between the two categorical variables.
        oddsratio, p_value = fisher_exact(contingency_table)

        print("Odds Ratio:", oddsratio)
        print("P-value:", p_value)

        # Interpretation based on p-value
        if p_value < 0.05:
            print("Reject the null hypothesis: There is an association between the two categorical variables. ✓")
            significant_cat_features.append(feature)
        else:
            print("Fail to reject the null hypothesis: There is no association between the two categorical variables. ✗")     
    print('-'*25) 


for feature in descrete_features+cat_features:
    significant_bin_cat_features(feature,target_feature)       
        

Education
-------------------------
The Frequency count greater than 5. Assumption satisfied ✓
Fail to reject the null hypothesis: There is no association between the two categorical variables. ✗
-------------------------
EnvironmentSatisfaction
-------------------------
The Frequency count greater than 5. Assumption satisfied ✓
Fail to reject the null hypothesis: There is no association between the two categorical variables. ✗
-------------------------
JobInvolvement
-------------------------
The Frequency count greater than 5. Assumption satisfied ✓
Fail to reject the null hypothesis: There is no association between the two categorical variables. ✗
-------------------------
JobLevel
-------------------------
The Frequency count greater than 5. Assumption satisfied ✓
Reject the null hypothesis: There is an association between the two categorical variables. ✓
-------------------------
JobSatisfaction
-------------------------
The Frequency count greater than 5. Assumption satisfied ✓
R

In [47]:
len(df.columns.to_list())

35

In [45]:
len(significant_cat_features+significant_num_features)

13