In [85]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns



from scipy.stats import (levene,
                         mannwhitneyu,
                         ttest_ind,
                         chi2_contingency,
                         fisher_exact)

pd.set_option('display.max_columns', None)

In [2]:
data_path = Path('../../Artifacts/Attrition.csv').resolve() 
df = pd.read_csv(data_path)

In [3]:
df.sample(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
435,33,Yes,Travel_Rarely,1277,Research & Development,15,1,Medical,1,582,2,Male,56,3,3,Manager,3,Married,13610,24619,7,Y,Yes,12,3,4,80,0,15,2,4,7,6,7,7
401,56,No,Travel_Frequently,906,Sales,6,3,Life Sciences,1,532,3,Female,86,4,4,Sales Executive,1,Married,13212,18256,9,Y,No,11,3,4,80,3,36,0,2,7,7,7,7
548,43,No,Travel_Frequently,775,Sales,15,3,Life Sciences,1,754,4,Male,47,2,2,Sales Executive,4,Married,6804,23683,3,Y,No,18,3,3,80,1,7,5,3,2,2,2,2
428,47,No,Travel_Rarely,983,Research & Development,2,2,Medical,1,574,1,Female,65,3,2,Manufacturing Director,4,Divorced,5070,7389,5,Y,No,13,3,3,80,3,20,2,3,5,0,0,4
1410,40,No,Travel_Rarely,444,Sales,2,2,Marketing,1,1986,2,Female,92,3,2,Sales Executive,2,Married,5677,4258,3,Y,No,14,3,3,80,1,15,4,3,11,8,5,10


In [4]:
df.shape

(1470, 35)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [6]:
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [7]:
df.select_dtypes(exclude='object').describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
DailyRate,1470.0,802.485714,403.5091,102.0,465.0,802.0,1157.0,1499.0
DistanceFromHome,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
Education,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0
EmployeeCount,1470.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
EmployeeNumber,1470.0,1024.865306,602.024335,1.0,491.25,1020.5,1555.75,2068.0
EnvironmentSatisfaction,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0
HourlyRate,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
JobInvolvement,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0
JobLevel,1470.0,2.063946,1.10694,1.0,1.0,2.0,3.0,5.0


In [None]:
num_features = df.select_dtypes(exclude='object').columns.to_list()
cat_features = df.select_dtypes(include='object').columns.to_list()

descrete_features = list()
continues_features = list()

for feature in num_features:
    if df[feature].nunique() > 5:
        continues_features.append(feature)
    else:
        descrete_features.append(feature)    

cat_features.remove('Attrition')


target_feature = 'Attrition'

In [None]:
significant_num_features = list()

def CheckSignificance(feature):
    # taking out random samples
    class_samples_1 = df[df['Attrition'] == 'Yes'][feature].sample(100)
    class_samples_2 = df[df['Attrition'] == 'No'][feature].sample(100)
    print(feature)
    print('-'*25)
    # no need to check normality we are using n > 30 sample where central limit theoram help to save. 
    # we are checking for variance whether feature two different group have homogenius variance
    
    # Null Hypothesis (H0): The variances are equal across the groups.
    # Alternative Hypothesis (H1): The variances are not equal across the groups.
    stat, p_val = levene(class_samples_1, class_samples_2)

    if p_val <= 0.05:
        print(f'The variances are not equal across the groups. (Reject H0) ✗')

        # Mann Whitney U test
        print()
        print('Mann-Whitney U test')

        # Null Hypothesis (H0): The distributions of the two groups are equal.
        # Alternative Hypothesis (H1): The distributions of the two groups are not equal.

        stat, p_val = mannwhitneyu(class_samples_1, class_samples_2)
        if p_val <= 0.05:
            print(f'This means that there is a significant difference between the two groups. (Reject H0) ✓')  
            significant_num_features.append(feature) 
        else:
            print(f'This means that there is no significant difference between the two groups. (Accept H0) ✗')     
    else:
        print(f' The variances are equal across the groups. (Accept H0) ✓')   

        # Ttest of independence
        print()
        print('T-test for the means of two independent samples ')

        # Null Hypothesis (H0): The distributions of the two groups are equal.
        # Alternative Hypothesis (H1): The distributions of the two groups are not equal.
        stat, p_val = ttest_ind(class_samples_1, class_samples_2)

        if p_val <= 0.05:
            print(f'This means that there is a significant difference between the two groups. (Reject H0) ✓') 
            significant_num_features.append(feature)   
        else:
            print(f'This means that there is no significant difference between the two groups. (Accept H0) ✗')  
        print()      
        print('-'*25)

for feature in continues_features:
    CheckSignificance(feature)

Age
-------------------------
 The variances are equal across the groups. (Accept H0) ✓

T-test for the means of two independent samples 
This means that there is a significant difference between the two groups. (Reject H0) ✓

-------------------------
DailyRate
-------------------------
 The variances are equal across the groups. (Accept H0) ✓

T-test for the means of two independent samples 
This means that there is a significant difference between the two groups. (Reject H0) ✓

-------------------------
DistanceFromHome
-------------------------
 The variances are equal across the groups. (Accept H0) ✓

T-test for the means of two independent samples 
This means that there is a significant difference between the two groups. (Reject H0) ✓

-------------------------
EmployeeNumber
-------------------------
 The variances are equal across the groups. (Accept H0) ✓

T-test for the means of two independent samples 
This means that there is no significant difference between the two groups

In [None]:
for feature in descrete_features:
    print(feature,df[feature].nunique())

descrete_features = [feature for feature in descrete_features if df[feature].nunique() > 1]  

Education 5
EmployeeCount 1
EnvironmentSatisfaction 4
JobInvolvement 4
JobLevel 5
JobSatisfaction 4
PerformanceRating 2
RelationshipSatisfaction 4
StandardHours 1
StockOptionLevel 4
WorkLifeBalance 4


In [69]:
for feature in cat_features:
    print(feature,df[feature].nunique())
    
cat_features = [feature for feature in cat_features if df[feature].nunique() > 1]  

BusinessTravel 3
Department 3
EducationField 6
Gender 2
JobRole 9
MaritalStatus 3
Over18 1
OverTime 2


In [70]:
bin_cat_features = list()
multi_cat_features = list()

for feature in descrete_features+cat_features:
    if df[feature].nunique()>2:
        multi_cat_features.append(feature)
    else:
        bin_cat_features.append(feature)

In [95]:
significant_cat_features = list()

def significant_bin_cat_features(feature,target):
    print(feature)
    feature,target = df[feature].sample(100),df[target].sample(100)
    contingency_table = pd.crosstab(feature,target)
    print('-'*25)

    result = chi2_contingency(contingency_table)
    if (result.expected_freq < 5).any():
        print(f'The Frequency count is less than 5. Assumption violated ✗')
        print()
        print('Fisher exact test')

        # Null Hypothesis (H0): There is no association between the two categorical variables.
        # Alternative Hypothesis (H1): There is an association between the two categorical variables.
        oddsratio, p_value = fisher_exact(contingency_table)

        print("Odds Ratio:", oddsratio)
        print("P-value:", p_value)

        # Interpretation based on p-value
        if p_value < 0.05:
            print("Reject the null hypothesis: There is an association between the two categorical variables.")
            significant_cat_features.append(feature)
        else:
            print("Fail to reject the null hypothesis: There is no association between the two categorical variables.")
            
        
    else:
        p_value = result.pvalue
        print(f'The Frequency count greater than 5. Assumption satisfied ✓') 
        if p_value < 0.05:
            print("Reject the null hypothesis: There is an association between the two categorical variables.")
            significant_cat_features.append(feature)
        else:
            print("Fail to reject the null hypothesis: There is no association between the two categorical variables.")
    print('-'*25) 


for feature in bin_cat_features:
    significant_bin_cat_features(feature,target_feature)       
        

PerformanceRating
-------------------------
The Frequency count is less than 5. Assumption violated ✗

Fisher exact test
Odds Ratio: 0.0
P-value: 1.0
Fail to reject the null hypothesis: There is no association between the two categorical variables.
-------------------------
Gender
-------------------------
The Frequency count is less than 5. Assumption violated ✗

Fisher exact test
Odds Ratio: inf
P-value: 1.0
Fail to reject the null hypothesis: There is no association between the two categorical variables.
-------------------------
OverTime
-------------------------
The Frequency count is less than 5. Assumption violated ✗

Fisher exact test
Odds Ratio: 1.0
P-value: 1.0
Fail to reject the null hypothesis: There is no association between the two categorical variables.
-------------------------


In [96]:
significant_cat_features

[]