In [1]:
import numpy as np
import pandas as pd

import scipy.stats as stats
from scipy.stats import f_oneway
from scipy.stats import chi2
from scipy.stats import chi2_contingency
import plotly.express as px

In [2]:
df = pd.read_csv(r"C:\Users\Vignesh\OneDrive\Desktop\Datascience\PROJECT1\Employee\Employee-Attrition_Processed.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,Diploma,Life Sciences,Medium,Female,...,Excellent,Low,0,8,0,Bad,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,High School,Life Sciences,High,Male,...,Outstanding,Very High,1,10,3,Better,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,Diploma,Other,Very High,Male,...,Excellent,Medium,0,7,3,Better,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,Postgraduate,Life Sciences,Very High,Female,...,Excellent,High,0,8,3,Better,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,High School,Medical,Low,Male,...,Excellent,Very High,1,6,3,Better,2,2,2,2


In [3]:
print("The shape of data frame:", df.shape)
print("Number of Rows in the dataframe:", len(df))
print("Number of Columns in the dataframe:", len(df.columns))

The shape of data frame: (1470, 31)
Number of Rows in the dataframe: 1470
Number of Columns in the dataframe: 31


## ****STASTICAL ANALYSIS****

**ANOVA TEST:**

In [4]:
num_cols = df.select_dtypes(include=[np.number]).columns
new_df = df.copy()
new_df['Attrition'] = new_df['Attrition'].replace({'Yes': 1, 'No': 0})

  new_df['Attrition'] = new_df['Attrition'].replace({'Yes': 1, 'No': 0})


In [5]:
f_scores = {}
p_values = {}

for col in num_cols:
    f_score, p_value = stats.f_oneway(new_df[col],new_df["Attrition"])
    
    f_scores[col] = f_score
    p_values[col] = p_value

In [6]:
#Visulaize f_scores and p_values
f_scores_df = pd.DataFrame(list(f_scores.items()), columns=['Feature', 'F_Score'])
fig = px.bar(f_scores_df, x='Feature', y='F_Score', title='ANOVA F-Scores for Numerical Features',color='F_Score')
fig.show()

In [7]:
#COMPARING F_SCORE AND P_VALUE OF ANOVA TEST
f_scores_df['P_Value'] = f_scores_df['Feature'].map(p_values)
f_scores_df = f_scores_df.sort_values(by='F_Score', ascending=False)
f_scores_df['Significant'] = f_scores_df['P_Value'] < 0.05
f_scores_df

Unnamed: 0,Feature,F_Score,P_Value,Significant
7,PercentSalaryHike,24602.507947,0.0,True
0,Age,23766.934042,0.0,True
3,HourlyRate,15362.122371,0.0,True
5,MonthlyRate,5944.089071,0.0,True
1,DailyRate,5811.796569,0.0,True
10,TrainingTimesLastYear,5691.401732,0.0,True
9,TotalWorkingYears,2994.90631,0.0,True
4,MonthlyIncome,2804.459632,0.0,True
12,YearsInCurrentRole,1834.262264,7.895023e-312,True
11,YearsAtCompany,1829.442766,3.485969e-311,True


**CHI-SQUARE TEST:**

In [8]:
cat_cols = df.select_dtypes(include=['object']).columns
cat_cols = cat_cols.drop('Attrition')

In [9]:
chi2_statistic = {}
p_values = {}

# Perform chi-square test for each column
for col in cat_cols:
    contingency_table = pd.crosstab(df[col], df['Attrition'])
    chi2, p_value, _, _ = chi2_contingency(contingency_table)
    chi2_statistic[col] = chi2
    p_values[col] = p_value

In [10]:
#visualize chi2_statistic and p_values
chi2_df = pd.DataFrame(list(chi2_statistic.items()), columns=['Feature','chi2_statistic'])
chi2_df['P_Value'] = chi2_df['Feature'].map(p_values)
fig = px.bar(chi2_df, x='Feature', y='chi2_statistic', title='Chi-Square Statistics for Categorical Features', color='chi2_statistic')
fig.show()

In [11]:
#comparing chi2_statistic and p_values
chi2_df = chi2_df.sort_values(by='chi2_statistic', ascending=False)
chi2_df['Significant'] = chi2_df['P_Value'] < 0.05
chi2_df

Unnamed: 0,Feature,chi2_statistic,P_Value,Significant
11,OverTime,87.564294,8.158424e-21,True
8,JobRole,86.190254,2.752482e-15,True
7,JobLevel,72.529013,6.634685e-15,True
10,MaritalStatus,46.163677,9.455511e-11,True
6,JobInvolvement,28.492021,2.863181e-06,True
0,BusinessTravel,24.182414,5.608614e-06,True
4,EnvironmentSatisfaction,22.503881,5.123469e-05,True
9,JobSatisfaction,17.505077,0.0005563005,True
14,WorkLifeBalance,16.325097,0.0009725699,True
3,EducationField,16.024674,0.00677398,True
