In [174]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import mlflow


from scipy.stats import (
                         shapiro,
                         levene,
                         mannwhitneyu,
                         ttest_ind,
                         chi2_contingency,
                         fisher_exact)

from sklearn.preprocessing import (
                            MinMaxScaler,
                            OneHotEncoder,
                            LabelEncoder
                            )

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from statsmodels.discrete.discrete_model import Logit
from statsmodels.stats.outliers_influence import variance_inflation_factor

pd.set_option('display.max_columns', None)

In [2]:
data_path = Path('../../Artifacts/Attrition.csv').resolve() 
df = pd.read_csv(data_path)

In [3]:
df.sample(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
739,27,No,Travel_Rarely,1055,Research & Development,2,4,Life Sciences,1,1027,1,Female,47,3,2,Manufacturing Director,4,Married,4227,4658,0,Y,No,18,3,2,80,1,4,2,3,3,2,2,2
1119,38,No,Travel_Rarely,1245,Sales,14,3,Life Sciences,1,1582,3,Male,80,3,2,Sales Executive,2,Married,9924,12355,0,Y,No,11,3,4,80,1,10,3,3,9,8,7,7
770,46,No,Travel_Rarely,430,Research & Development,1,4,Medical,1,1069,4,Male,40,3,5,Research Director,4,Divorced,19627,21445,9,Y,No,17,3,4,80,2,23,0,3,2,2,2,2
889,27,No,Travel_Rarely,1103,Research & Development,14,3,Life Sciences,1,1244,1,Male,42,3,1,Research Scientist,1,Married,2235,14377,1,Y,Yes,14,3,4,80,2,9,3,2,9,7,6,8
873,36,No,Travel_Rarely,917,Research & Development,6,4,Life Sciences,1,1221,3,Male,60,1,1,Laboratory Technician,3,Divorced,2741,6865,1,Y,No,14,3,3,80,1,7,4,3,7,7,1,7


In [4]:
df.shape

(1470, 35)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [6]:
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [7]:
df.select_dtypes(exclude='object').describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,1470.0,36.92381,9.135373,18.0,30.0,36.0,43.0,60.0
DailyRate,1470.0,802.485714,403.5091,102.0,465.0,802.0,1157.0,1499.0
DistanceFromHome,1470.0,9.192517,8.106864,1.0,2.0,7.0,14.0,29.0
Education,1470.0,2.912925,1.024165,1.0,2.0,3.0,4.0,5.0
EmployeeCount,1470.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
EmployeeNumber,1470.0,1024.865306,602.024335,1.0,491.25,1020.5,1555.75,2068.0
EnvironmentSatisfaction,1470.0,2.721769,1.093082,1.0,2.0,3.0,4.0,4.0
HourlyRate,1470.0,65.891156,20.329428,30.0,48.0,66.0,83.75,100.0
JobInvolvement,1470.0,2.729932,0.711561,1.0,2.0,3.0,3.0,4.0
JobLevel,1470.0,2.063946,1.10694,1.0,1.0,2.0,3.0,5.0


In [4]:
num_features = df.select_dtypes(exclude='object').columns.to_list()
cat_features = df.select_dtypes(include='object').columns.to_list()

descrete_features = list()
continues_features = list()

for feature in num_features:
    if df[feature].nunique() > 5:
        continues_features.append(feature)
    else:
        descrete_features.append(feature)    

cat_features.remove('Attrition')

target_feature = 'Attrition'

In [5]:
significant_num_features = list()

def checkNormality(sample_1,sample_2):
    print('Shapiro-Wilk Test')
    # Null Hypothesis (H0): the data was drawn from a normal distribution.
    # Alternative Hypothesis (H1): the data was not drawn from a normal distribution.

    stats_1,p_val_1 = shapiro(sample_1)
    stats_2,p_val_2 = shapiro(sample_2)
    print()
    print(f'sample 1 : stats {stats_1}, p-val: {p_val_1}')
    print(f'sample 1 : stats {stats_2}, p-val: {p_val_2}')
    result = p_val_2 > 0.05 or p_val_1 > 0.05
    return result


def checkVarianceHomogenisity(sample_1,sample_2):
    print('levene Test')
    # Null Hypothesis (H0): The variances are equal across the groups.
    # Alternative Hypothesis (H1): The variances are not equal across the groups.

    stats, p_val = levene(sample_1, sample_2)
    print()
    print(f'sample 1 : stats {stats}, p-val: {p_val}')
    result = p_val > 0.05
    return result


def MannWhitneyTest(sample_1, sample_2):
    print('Mann-Whitney U test')
    # Null Hypothesis (H0): The distributions of the two groups are equal.
    # Alternative Hypothesis (H1): The distributions of the two groups are not equal.

    stats, p_val = mannwhitneyu(sample_1, sample_2)
    print()
    print(f'sample 1 : stats {stats}, p-val: {p_val}')
    result = p_val > 0.05
    return result
        

def Ttest(sample_1, sample_2,var_same=True):
    if var_same:
        print('T-test of Independence')
    else:
        print("Wetch's T-test")    
    # Null Hypothesis (H0): The distributions of the two groups are equal.
    # Alternative Hypothesis (H1): The distributions of the two groups are not equal.

    stats, p_val = ttest_ind(sample_1, sample_2,equal_var=var_same)
    print()
    print(f'sample 1 : stats {stats}, p-val: {p_val}')
    result = p_val > 0.05
    return result


def CheckSignificance(feature):
    print('levene Test')
    # taking out random samples
    class_samples_1 = df[df['Attrition'] == 'Yes'][feature].sample(200)
    class_samples_2 = df[df['Attrition'] == 'No'][feature].sample(200)
    print(feature)
    print('-'*25)

    

    if checkNormality(class_samples_1,class_samples_2):
        print('The samples not follows normal distribution. (Reject H0) ✓')
        if checkVarianceHomogenisity(class_samples_1,class_samples_2):
            print(f'This variance of two given samples are homogeneus. (Reject H0) ✓') 
            if Ttest(class_samples_1, class_samples_2):
                print(f'This means that there is a significant difference between the two groups. (Reject H0) ✓')  
                significant_num_features.append(feature)     
            else:
                 print(f'This means that there is no significant difference between the two groups. (Accept H0) ✗')   
        else:
            if MannWhitneyTest(class_samples_1, class_samples_2):
                print(f'This means that there is a significant difference between the two groups. (Reject H0) ✓')  
                significant_num_features.append(feature)     
            else:
                print(f'This means that there is no significant difference between the two groups. (Accept H0) ✗')
    else:
        print('The samples follows normal distribution. (Reject H0) ✓')
        if checkVarianceHomogenisity(class_samples_1,class_samples_2):
            print(f'This variance of two given samples are homogeneus. (Reject H0) ✓') 
            if Ttest(class_samples_1, class_samples_2,False):
                print(f'This means that there is a significant difference between the two groups. (Reject H0) ✓')  
                significant_num_features.append(feature)     
            else:
                 print(f'This means that there is no significant difference between the two groups. (Accept H0) ✗')   
        else:
            if MannWhitneyTest(class_samples_1, class_samples_2):
                print(f'This means that there is a significant difference between the two groups. (Reject H0) ✓')  
                significant_num_features.append(feature)     
            else:
                print(f'This means that there is no significant difference between the two groups. (Accept H0) ✗')
    print('-'*25)

for feature in continues_features:
    CheckSignificance(feature)

levene Test
Age
-------------------------
Shapiro-Wilk Test

sample 1 : stats 0.9397645051692768, p-val: 2.1863073824777074e-07
sample 1 : stats 0.9811783555461417, p-val: 0.008762577519654369
The samples follows normal distribution. (Reject H0) ✓
levene Test

sample 1 : stats 0.6640251857952748, p-val: 0.41562981936458887
This variance of two given samples are homogeneus. (Reject H0) ✓
Wetch's T-test

sample 1 : stats -4.377657748239208, p-val: 1.5388592725524938e-05
This means that there is no significant difference between the two groups. (Accept H0) ✗
-------------------------
levene Test
DailyRate
-------------------------
Shapiro-Wilk Test

sample 1 : stats 0.9500604767606473, p-val: 1.9123846513200963e-06
sample 1 : stats 0.948447474056279, p-val: 1.3407199875914233e-06
The samples follows normal distribution. (Reject H0) ✓
levene Test

sample 1 : stats 0.4067122018610013, p-val: 0.5240097173939537
This variance of two given samples are homogeneus. (Reject H0) ✓
Wetch's T-test



In [6]:
for feature in descrete_features:
    print(feature,df[feature].nunique())

descrete_features = [feature for feature in descrete_features if df[feature].nunique() > 1]  

Education 5
EmployeeCount 1
EnvironmentSatisfaction 4
JobInvolvement 4
JobLevel 5
JobSatisfaction 4
PerformanceRating 2
RelationshipSatisfaction 4
StandardHours 1
StockOptionLevel 4
WorkLifeBalance 4


In [7]:
for feature in cat_features:
    print(feature,df[feature].nunique())
    
cat_features = [feature for feature in cat_features if df[feature].nunique() > 1]  

BusinessTravel 3
Department 3
EducationField 6
Gender 2
JobRole 9
MaritalStatus 3
Over18 1
OverTime 2


In [8]:
significant_cat_features = list()

def significant_bin_cat_features(feature,target):
    print(feature)
    feat,target = df[feature].sample(500),df[target].sample(500)
    contingency_table = pd.crosstab(feat,target)
    print('-'*25)

    result = chi2_contingency(contingency_table)
    if (result.expected_freq < 5).any():
        p_value = result.pvalue
        print(f'The Frequency count greater than 5. Assumption satisfied ✓') 
        if p_value < 0.05:
            print("Reject the null hypothesis: There is an association between the two categorical variables. ✓")
            significant_cat_features.append(feature)
        else:
            print("Fail to reject the null hypothesis: There is no association between the two categorical variables. ✗") 
    else:
        print(f'The Frequency count is greater than 5. Assumption violated ✗')
        print()
        print('Fisher exact test')

        # Null Hypothesis (H0): There is no association between the two categorical variables.
        # Alternative Hypothesis (H1): There is an association between the two categorical variables.
        oddsratio, p_value = fisher_exact(contingency_table)

        print("Odds Ratio:", oddsratio)
        print("P-value:", p_value)

        # Interpretation based on p-value
        if p_value < 0.05:
            print("Reject the null hypothesis: There is an association between the two categorical variables. ✓")
            significant_cat_features.append(feature)
        else:
            print("Fail to reject the null hypothesis: There is no association between the two categorical variables. ✗")     
    print('-'*25) 


for feature in descrete_features+cat_features:
    significant_bin_cat_features(feature,target_feature)       
        

Education
-------------------------
The Frequency count greater than 5. Assumption satisfied ✓
Fail to reject the null hypothesis: There is no association between the two categorical variables. ✗
-------------------------
EnvironmentSatisfaction
-------------------------
The Frequency count is greater than 5. Assumption violated ✗

Fisher exact test
Odds Ratio: 0.006444952331739049
P-value: 0.9255
Fail to reject the null hypothesis: There is no association between the two categorical variables. ✗
-------------------------
JobInvolvement
-------------------------
The Frequency count greater than 5. Assumption satisfied ✓
Reject the null hypothesis: There is an association between the two categorical variables. ✓
-------------------------
JobLevel
-------------------------
The Frequency count greater than 5. Assumption satisfied ✓
Fail to reject the null hypothesis: There is no association between the two categorical variables. ✗
-------------------------
JobSatisfaction
----------------

In [11]:
final_selected_features = significant_cat_features+significant_num_features+ [target_feature]

In [12]:
data = df[final_selected_features]

In [19]:
sacle_features= [
                 'HourlyRate',
                 'MonthlyRate',
                 'PercentSalaryHike',
                 'TrainingTimesLastYear',
                 'YearsSinceLastPromotion',
                 'EmployeeNumber',
                 'JobInvolvement',
                 'JobSatisfaction',
                 'RelationshipSatisfaction']
ohe_features = ['OverTime']
label_encode_features = ['Attrition']

In [125]:
mx = MinMaxScaler()
ohe = OneHotEncoder(dtype=int,drop='first')
le = LabelEncoder()


ct = ColumnTransformer([
   ('one_hot_encoding',ohe,ohe_features),
   ('min_max_scaling',mx,sacle_features)]
 )

X , y = df.drop('Attrition',axis=1),df['Attrition']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

X_train_transformed = ct.fit_transform(X_train)
X_test_transformed = ct.transform(X_test)

y_train_transformed = le.fit_transform(y_train)
y_test_transformed = le.transform(y_test)

columns =[feature.split('__')[-1] for feature in ct.get_feature_names_out()]
X_train_transformed = pd.DataFrame(X_train_transformed,columns=columns)
X_test_transformed = pd.DataFrame(X_test_transformed,columns=columns)

y_train_transformed = pd.DataFrame(y_train_transformed,dtype=float)
y_test_transformed = pd.DataFrame(y_test_transformed,dtype=float)

In [168]:
lr1 = Logit(y_train_transformed,X_train_transformed,).fit(maxiter=100)
lr1.summary()

Optimization terminated successfully.
         Current function value: 0.402030
         Iterations 6


0,1,2,3
Dep. Variable:,0,No. Observations:,984.0
Model:,Logit,Df Residuals:,973.0
Method:,MLE,Df Model:,10.0
Date:,"Fri, 28 Feb 2025",Pseudo R-squ.:,0.1173
Time:,16:40:52,Log-Likelihood:,-395.6
converged:,True,LL-Null:,-448.15
Covariance Type:,nonrobust,LLR p-value:,5.17e-18

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.3534,0.484,-0.730,0.465,-1.302,0.595
OverTime_Yes,1.5082,0.183,8.222,0.000,1.149,1.868
HourlyRate,-0.0421,0.319,-0.132,0.895,-0.668,0.584
MonthlyRate,0.0295,0.321,0.092,0.927,-0.599,0.658
PercentSalaryHike,-0.0777,0.353,-0.220,0.826,-0.769,0.614
TrainingTimesLastYear,-0.5293,0.444,-1.192,0.233,-1.400,0.341
YearsSinceLastPromotion,-0.8450,0.484,-1.747,0.081,-1.793,0.103
EmployeeNumber,0.2318,0.309,0.750,0.453,-0.374,0.837
JobInvolvement,-1.6874,0.372,-4.535,0.000,-2.417,-0.958


In [199]:
exp = mlflow.create_experiment(name='LogitModelExperiment1')

In [207]:
mlflow.set_tracking_uri(uri='http://127.0.0.1:5000')
exp = mlflow.create_experiment(name=exp)

with mlflow.start_run():
    lr1 = Logit(y_train_transformed,X_train_transformed,).fit(maxiter=100)
    mlflow.statsmodels.log_model(lr1,artifact_path='Statsmodels')
    params = lr1.pvalues.to_dict()
    mlflow.log_params(params)

Optimization terminated successfully.
         Current function value: 0.402030
         Iterations 6




🏃 View run agreeable-shad-437 at: http://127.0.0.1:5000/#/experiments/0/runs/a4e070f82018455181e7beeea18d10b6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


{'const': 0.4654069279460823,
 'OverTime_Yes': 1.9994468407938194e-16,
 'HourlyRate': 0.8951964558753169,
 'MonthlyRate': 0.9268233717923257,
 'PercentSalaryHike': 0.8256381607834216,
 'TrainingTimesLastYear': 0.23344689394942575,
 'YearsSinceLastPromotion': 0.0806646828027661,
 'EmployeeNumber': 0.453227827523358,
 'JobInvolvement': 5.758670029444156e-06,
 'JobSatisfaction': 0.0014325374944757702,
 'RelationshipSatisfaction': 0.11915627868283553}

In [173]:
VIF = pd.DataFrame()
VIF['Features'] = X_train_transformed.columns.to_list()
VIF['VIF'] = [variance_inflation_factor(X_train_transformed.values,i) for i in range(X_train_transformed.shape[1])]
VIF.sort_values(by='VIF',ascending=False,inplace=True)
VIF 

Unnamed: 0,Features,VIF
0,const,29.894225
2,HourlyRate,1.017786
6,YearsSinceLastPromotion,1.012729
9,JobSatisfaction,1.010868
7,EmployeeNumber,1.010062
10,RelationshipSatisfaction,1.009019
1,OverTime_Yes,1.008058
8,JobInvolvement,1.005936
5,TrainingTimesLastYear,1.00583
4,PercentSalaryHike,1.004683
