# Statistical Analysis - Employee Attrition

This notebook performs statistical analysis to identify significant factors affecting employee attrition.

## Analysis includes:
1. Hypothesis testing
2. Chi-square tests for categorical variables
3. T-tests for numerical variables
4. Correlation analysis
5. Feature importance analysis


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu
import sys
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add src to path
project_root = Path().resolve().parent
sys.path.append(str(project_root))

# Import custom utilities
from src.utils import load_data, get_data_path

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)


In [2]:
# Load cleaned data
data_path = get_data_path('cleaned_data.csv', subfolder='processed')
df = load_data(data_path)

# Create binary target variable
df['Attrition_Binary'] = (df['Attrition'] == 'Yes').astype(int)

print(f"Data loaded: {df.shape}")
print(f"\nAttrition distribution:")
print(df['Attrition'].value_counts())


Data loaded: (1470, 42)

Attrition distribution:
Attrition
No     1232
Yes     238
Name: count, dtype: int64


## 1. Chi-Square Tests for Categorical Variables


In [3]:
# Perform Chi-square tests for categorical variables
categorical_vars = ['Gender', 'MaritalStatus', 'Department', 'JobRole', 'BusinessTravel', 
                     'EducationField', 'OverTime', 'HowToEmploy']

chi2_results = []

for var in categorical_vars:
    if var in df.columns:
        # Create contingency table
        contingency_table = pd.crosstab(df[var], df['Attrition'])
        
        # Perform chi-square test
        chi2, p_value, dof, expected = chi2_contingency(contingency_table)
        
        # Calculate Cramér's V (effect size)
        n = contingency_table.sum().sum()
        cramers_v = np.sqrt(chi2 / (n * (min(contingency_table.shape) - 1)))
        
        chi2_results.append({
            'Variable': var,
            'Chi-square': chi2,
            'p-value': p_value,
            'Degrees of Freedom': dof,
            "Cramér's V": cramers_v,
            'Significant': 'Yes' if p_value < 0.05 else 'No'
        })

chi2_df = pd.DataFrame(chi2_results)
chi2_df = chi2_df.sort_values('p-value')

print("Chi-Square Test Results:")
print("=" * 80)
print(chi2_df.to_string(index=False))
print("\nSignificance level: α = 0.05")


Chi-Square Test Results:
      Variable  Chi-square      p-value  Degrees of Freedom  Cramér's V Significant
       JobRole   93.029772 1.126586e-16                   8    0.251566         Yes
 MaritalStatus   36.294672 1.314353e-08                   2    0.157131         Yes
BusinessTravel   19.810167 4.992026e-05                   2    0.116087         Yes
EducationField   18.946613 1.966646e-03                   5    0.113529         Yes
    Department    8.551995 1.389817e-02                   2    0.076274         Yes
      OverTime   75.959677 2.606054e-02                  54    0.227318         Yes
   HowToEmploy    7.528208 1.842255e-01                   5    0.071563          No
        Gender    0.088193 7.664872e-01                   1    0.007746          No

Significance level: α = 0.05


## 2. T-Tests for Numerical Variables


In [4]:
# Perform t-tests for numerical variables
numerical_vars = ['Age', 'MonthlyIncome', 'YearsAtCompany', 'TotalWorkingYears', 
                  'DistanceFromHome', 'JobSatisfaction', 'EnvironmentSatisfaction',
                  'WorkLifeBalance', 'StressRating', 'PerformanceIndex', 'Incentive']

t_test_results = []

for var in numerical_vars:
    if var in df.columns:
        # Split data by attrition
        group_yes = df[df['Attrition'] == 'Yes'][var].dropna()
        group_no = df[df['Attrition'] == 'No'][var].dropna()
        
        # Check normality (Shapiro-Wilk test on sample)
        if len(group_yes) > 3 and len(group_no) > 3:
            # Use Mann-Whitney U test if data is not normal, otherwise t-test
            try:
                # Try t-test first
                t_stat, p_value = ttest_ind(group_yes, group_no)
                test_type = 't-test'
            except:
                # Use Mann-Whitney U test
                u_stat, p_value = mannwhitneyu(group_yes, group_no, alternative='two-sided')
                t_stat = u_stat
                test_type = 'Mann-Whitney U'
            
            # Calculate effect size (Cohen's d)
            pooled_std = np.sqrt(((len(group_yes) - 1) * group_yes.std()**2 + 
                                  (len(group_no) - 1) * group_no.std()**2) / 
                                 (len(group_yes) + len(group_no) - 2))
            cohens_d = (group_yes.mean() - group_no.mean()) / pooled_std if pooled_std > 0 else 0
            
            t_test_results.append({
                'Variable': var,
                'Test Type': test_type,
                'Statistic': t_stat,
                'p-value': p_value,
                "Mean (Yes)": group_yes.mean(),
                "Mean (No)": group_no.mean(),
                "Difference": group_yes.mean() - group_no.mean(),
                "Cohen's d": cohens_d,
                'Significant': 'Yes' if p_value < 0.05 else 'No'
            })

t_test_df = pd.DataFrame(t_test_results)
t_test_df = t_test_df.sort_values('p-value')

print("T-Test / Mann-Whitney U Test Results:")
print("=" * 100)
print(t_test_df.to_string(index=False))
print("\nSignificance level: α = 0.05")


T-Test / Mann-Whitney U Test Results:
               Variable Test Type  Statistic      p-value  Mean (Yes)   Mean (No)  Difference  Cohen's d Significant
              Incentive    t-test  -7.890071 5.858117e-15  451.890756 1276.359578 -824.468822  -0.558658         Yes
           StressRating    t-test   7.189963 1.028530e-12    2.869748    2.427760    0.441988   0.509086         Yes
      TotalWorkingYears    t-test  -6.827620 1.259826e-11    8.159664   11.851461   -3.691797  -0.483431         Yes
                    Age    t-test  -6.157868 9.498923e-10   33.592437   37.533279   -3.940842  -0.436009         Yes
         YearsAtCompany    t-test  -5.535209 3.676757e-08    5.021008    7.392045   -2.371037  -0.391921         Yes
          MonthlyIncome    t-test  -4.022731 6.045500e-05 6452.714286 7012.964286 -560.250000  -0.284830         Yes
        JobSatisfaction    t-test  -3.441024 5.957731e-04    2.504202    2.771916   -0.267714  -0.243642         Yes
EnvironmentSatisfaction   

In [5]:
# Calculate correlations with attrition (binary)
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_cols = [col for col in numerical_cols if col not in ['EmployeeNumber', 'Attrition_Binary']]

correlations = []
for col in numerical_cols:
    if col in df.columns:
        corr, p_value = stats.pearsonr(df[col], df['Attrition_Binary'])
        correlations.append({
            'Variable': col,
            'Correlation': corr,
            'p-value': p_value,
            'Significant': 'Yes' if p_value < 0.05 else 'No'
        })

corr_df = pd.DataFrame(correlations)
corr_df = corr_df.sort_values('Correlation', key=abs, ascending=False)

print("Correlation with Attrition (Binary):")
print("=" * 70)
print(corr_df.to_string(index=False))
print("\nSignificance level: α = 0.05")


Correlation with Attrition (Binary):
                Variable  Correlation      p-value Significant
               Incentive    -0.201697 5.858117e-15         Yes
                JobLevel    -0.191087 1.486771e-13         Yes
            StressRating     0.184437 1.028530e-12         Yes
       TotalWorkingYears    -0.175436 1.259826e-11         Yes
    YearsWithCurrManager    -0.159634 7.518405e-10         Yes
                     Age    -0.158683 9.498923e-10         Yes
      YearsInCurrentRole    -0.154373 2.690842e-09         Yes
           ExtendedLeave    -0.149084 9.288010e-09         Yes
          YearsAtCompany    -0.142983 3.676757e-08         Yes
              RemoteWork    -0.128057 8.397478e-07         Yes
         WelfareBenefits    -0.112692 1.485689e-05         Yes
          JobInvolvement    -0.110920 2.023377e-05         Yes
           MonthlyIncome    -0.104418 6.045500e-05         Yes
            FlexibleWork    -0.100778 1.086361e-04         Yes
        StockOptio

## 4. Summary of Significant Factors


In [7]:
# Summary of significant factors
print("=" * 80)
print("SUMMARY OF SIGNIFICANT FACTORS AFFECTING ATTRITION")
print("=" * 80)

# Significant categorical variables
sig_categorical = chi2_df[chi2_df['Significant'] == 'Yes']
print(f"\n1. Significant Categorical Variables (p < 0.05): {len(sig_categorical)}")
if len(sig_categorical) > 0:
    for _, row in sig_categorical.iterrows():
        print(f"   - {row['Variable']}: p = {row['p-value']:.4f}, Cramér's V = {row['Cramér\'s V']:.3f}")

# Significant numerical variables
sig_numerical = t_test_df[t_test_df['Significant'] == 'Yes']
print(f"\n2. Significant Numerical Variables (p < 0.05): {len(sig_numerical)}")
if len(sig_numerical) > 0:
    for _, row in sig_numerical.iterrows():
        print(f"   - {row['Variable']}: p = {row['p-value']:.4f}, Cohen's d = {row['Cohen\'s d']:.3f}")
        print(f"     Mean difference: {row['Difference']:.2f}")

# Top correlations
sig_corr = corr_df[corr_df['Significant'] == 'Yes'].head(10)
print(f"\n3. Top Correlations with Attrition (p < 0.05): {len(sig_corr)}")
if len(sig_corr) > 0:
    for _, row in sig_corr.iterrows():
        print(f"   - {row['Variable']}: r = {row['Correlation']:.3f}, p = {row['p-value']:.4f}")

print("\n" + "=" * 80)


SUMMARY OF SIGNIFICANT FACTORS AFFECTING ATTRITION

1. Significant Categorical Variables (p < 0.05): 6
   - JobRole: p = 0.0000, Cramér's V = 0.252
   - MaritalStatus: p = 0.0000, Cramér's V = 0.157
   - BusinessTravel: p = 0.0000, Cramér's V = 0.116
   - EducationField: p = 0.0020, Cramér's V = 0.114
   - Department: p = 0.0139, Cramér's V = 0.076
   - OverTime: p = 0.0261, Cramér's V = 0.227

2. Significant Numerical Variables (p < 0.05): 10
   - Incentive: p = 0.0000, Cohen's d = -0.559
     Mean difference: -824.47
   - StressRating: p = 0.0000, Cohen's d = 0.509
     Mean difference: 0.44
   - TotalWorkingYears: p = 0.0000, Cohen's d = -0.483
     Mean difference: -3.69
   - Age: p = 0.0000, Cohen's d = -0.436
     Mean difference: -3.94
   - YearsAtCompany: p = 0.0000, Cohen's d = -0.392
     Mean difference: -2.37
   - MonthlyIncome: p = 0.0001, Cohen's d = -0.285
     Mean difference: -560.25
   - JobSatisfaction: p = 0.0006, Cohen's d = -0.244
     Mean difference: -0.27
   - 