In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load your synthetic dataset
df = pd.read_csv("hr_synthetic_dataset.csv")  # Replace with actual filename


In [2]:
from scipy.stats import ttest_ind

job_roles = df['JobRole'].unique()
for role in job_roles:
    males = df[(df['Gender'] == 'Male') & (df['JobRole'] == role)]['MonthlyIncome']
    females = df[(df['Gender'] == 'Female') & (df['JobRole'] == role)]['MonthlyIncome']
    if len(males) > 5 and len(females) > 5:
        stat, p = ttest_ind(males, females, equal_var=False)
        print(f"Role: {role}, p-value = {p:.4f}")


Role: Director of R&D, p-value = 0.0000
Role: Sales Manager, p-value = 0.0000
Role: R&D Manager, p-value = 0.0000
Role: HR Manager, p-value = 0.0000
Role: Sales Director, p-value = 0.0000
Role: HR Director, p-value = 0.0000
Role: Research Scientist, p-value = 0.0024
Role: Senior Researcher, p-value = 0.0000
Role: Sales Executive, p-value = 0.0326
Role: HR Specialist, p-value = 0.0297


In [13]:
print(df.columns.tolist())


['EmployeeNumber', 'EmployeeID', 'FirstName', 'LastName', 'Gender', 'Age', 'MaritalStatus', 'Education', 'EducationField', 'Department', 'JobRole', 'JobLevel', 'HireDate', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'Attrition', 'TerminationDate', 'BusinessTravel', 'DailyRate', 'DistanceFromHome', 'EmployeeCount', 'HourlyRate', 'JobInvolvement', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance']


In [8]:
df.head()


Unnamed: 0,EmployeeNumber,EmployeeID,FirstName,LastName,Gender,Age,MaritalStatus,Education,EducationField,Department,...,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance
0,1,EMP00001,Heather,Carlson,Non-Binary,48,Single,Master,Medical,R&D,...,Y,Yes,20,3,4,40,0,10,6,4
1,2,EMP00002,Jasmine,Smith,Female,49,Single,Bachelor,Life Sciences,Sales,...,Y,Yes,17,3,1,40,0,12,6,4
2,3,EMP00003,Michael,Sampson,Male,40,Married,High School,Life Sciences,R&D,...,Y,Yes,22,4,4,40,0,11,1,4
3,4,EMP00004,Diana,Hicks,Female,37,Married,Associate,Technical Degree,R&D,...,Y,No,22,1,3,40,0,16,0,2
4,5,EMP00005,Patrick,Huerta,Male,49,Single,Master,Life Sciences,HR,...,Y,Yes,13,5,4,40,1,6,2,3


In [18]:
!pip install imbalanced-learn


Defaulting to user installation because normal site-packages is not writeable
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn

   -------------------- ------------------- 1/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imbalanced-learn]
   ---------------------------------------- 2/2 [imbalanced-learn]

Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Copy DataFrame
df_encoded = df.copy()

# Define categorical columns to encode
categorical_cols = ['Gender', 'EducationField', 'MaritalStatus', 'JobRole', 
                    'Department', 'OverTime', 'BusinessTravel', 'Education']


# Apply LabelEncoder
le = LabelEncoder()
for col in categorical_cols:
    df_encoded[col] = le.fit_transform(df_encoded[col])

# Create target variable: recently promoted = 1
df_encoded['RecentlyPromoted'] = (df_encoded['YearsSinceLastPromotion'] == 0).astype(int)

# Define features
features = ['Gender', 'Age', 'Education', 'JobLevel', 'JobSatisfaction', 
            'MonthlyIncome', 'PerformanceRating', 'OverTime', 'WorkLifeBalance']



# Split dataset
X = df_encoded[features]
y = df_encoded['RecentlyPromoted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Logistic Regression
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)


# Evaluate model
print(classification_report(y_test, model.predict(X_test)))

# Show importance of gender in prediction
gender_idx = features.index('Gender')
print(f"\n📊 Gender Coefficient: {model.coef_[0][gender_idx]:.4f}")


              precision    recall  f1-score   support

           0       0.42      0.48      0.44      1272
           1       0.57      0.51      0.54      1728

    accuracy                           0.50      3000
   macro avg       0.49      0.49      0.49      3000
weighted avg       0.51      0.50      0.50      3000


📊 Gender Coefficient: 0.0689


In [20]:
print(f"Gender Coefficient: {model.coef_[0][features.index('Gender')]}")


Gender Coefficient: 0.06892744985294764
