In [14]:
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, chi2_contingency


In [2]:
Emp = pd.read_csv("employee_survey_data.csv")
Mgr = pd.read_csv("manager_survey_data.csv")
Gen = pd.read_csv("general_data.csv")
Tin = pd.read_csv("in_out_time/in_time.csv")
Tou = pd.read_csv("in_out_time/out_time.csv")

In [6]:
Tin = Tin.rename(columns={"Unnamed: 0": "EmployeeID"})
Tou = Tou.rename(columns={"Unnamed: 0": "EmployeeID"})

# ---------- TIN ----------
tin_dt = Tin.drop(columns=["EmployeeID"]).apply(pd.to_datetime, errors="coerce")

tin_sec = tin_dt.apply(lambda c: c.dt.hour * 3600 + c.dt.minute * 60 + c.dt.second)

tin_agg = pd.DataFrame({
    "EmployeeID": Tin["EmployeeID"],
    "Time_in_min":  tin_sec.min(axis=1),
    "Time_in_max":  tin_sec.max(axis=1),
    "Time_in_mean": tin_sec.mean(axis=1),
})

# ---------- TOUT ----------
tout_dt = Tou.drop(columns=["EmployeeID"]).apply(pd.to_datetime, errors="coerce")

tout_sec = tout_dt.apply(lambda c: c.dt.hour * 3600 + c.dt.minute * 60 + c.dt.second)

tout_agg = pd.DataFrame({
    "EmployeeID": Tou["EmployeeID"],
    "Time_out_min":  tout_sec.min(axis=1),
    "Time_out_max":  tout_sec.max(axis=1),
    "Time_out_mean": tout_sec.mean(axis=1),
})

# ---------- MERGE ----------
Time_features = tin_agg.merge(tout_agg, on="EmployeeID", how="inner")

# ---------- seconds → HH:MM:SS (date permanently lost) ----------
cols = ["Time_in_min","Time_in_max","Time_in_mean","Time_out_min","Time_out_max","Time_out_mean"]

for c in cols:
    s = Time_features[c].round().astype("Int64")
    Time_features[c] = pd.to_timedelta(s, unit="s").astype(str).str[-8:]

Time_features

Unnamed: 0,EmployeeID,Time_in_min,Time_in_max,Time_in_mean,Time_out_min,Time_out_max,Time_out_mean
0,1,09:13:07,10:37:59,10:00:05,16:18:10,18:14:06,17:22:30
1,2,09:12:39,10:43:29,09:59:17,16:27:03,18:49:53,17:42:25
2,3,09:08:23,10:52:27,10:01:30,15:58:24,18:07:51,17:02:18
3,4,09:12:03,10:50:44,09:58:55,16:02:49,18:10:16,17:10:32
4,5,09:18:02,10:54:55,09:59:55,16:53:11,19:22:01,18:00:17
...,...,...,...,...,...,...,...
4405,4406,09:15:34,10:46:55,10:01:11,17:27:37,19:59:54,18:32:31
4406,4407,09:25:22,10:42:05,10:00:15,15:01:24,17:15:45,16:05:50
4407,4408,09:10:20,10:53:02,09:58:48,16:34:04,19:04:35,17:41:12
4408,4409,09:04:17,10:42:55,10:01:07,18:27:09,20:31:07,19:30:40


In [8]:
# --- Rename to avoid collisions (safe if columns missing) ---
Emp = Emp.rename(columns={
    "EnvironmentSatisfaction": "Emp_EnvironmentSatisfaction",
    "JobSatisfaction": "Emp_JobSatisfaction",
    "WorkLifeBalance": "Emp_WorkLifeBalance",
})

Mgr = Mgr.rename(columns={
    "JobInvolvement": "Mgr_JobInvolvement",
    "PerformanceRating": "Mgr_PerformanceRating",
})

# --- Ensure numeric manager columns (robust coercion) ---
for c in ["Mgr_JobInvolvement", "Mgr_PerformanceRating"]:
    if c in Mgr.columns:
        Mgr[c] = pd.to_numeric(Mgr[c], errors="coerce")

# --- Merge ---
EmpMgr = Emp.merge(Mgr, on="EmployeeID", how="left")
GenEmpMgr = Gen.merge(EmpMgr, on="EmployeeID", how="left")

# --- Drop constant/useless columns safely ---
GenEmpMgr = GenEmpMgr.drop(columns=["Over18", "EmployeeCount", "StandardHours"], errors="ignore")

GenEmpMgr

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeID,Gender,JobLevel,...,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Emp_EnvironmentSatisfaction,Emp_JobSatisfaction,Emp_WorkLifeBalance,Mgr_JobInvolvement,Mgr_PerformanceRating
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,Female,1,...,1.0,6,1,0,0,3.0,4.0,2.0,3,3
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,2,Female,1,...,6.0,3,5,1,4,3.0,2.0,4.0,2,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,3,Male,4,...,5.0,2,5,0,3,2.0,2.0,1.0,3,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,4,Male,3,...,13.0,5,8,7,5,4.0,4.0,3.0,2,3
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,5,Male,1,...,9.0,2,6,0,4,4.0,1.0,3.0,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,42,No,Travel_Rarely,Research & Development,5,4,Medical,4406,Female,1,...,10.0,5,3,0,2,4.0,1.0,3.0,3,3
4406,29,No,Travel_Rarely,Research & Development,2,4,Medical,4407,Male,1,...,10.0,2,3,0,2,4.0,4.0,3.0,2,3
4407,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,4408,Male,2,...,5.0,4,4,1,2,1.0,3.0,3.0,3,4
4408,42,No,Travel_Rarely,Sales,18,2,Medical,4409,Male,1,...,10.0,2,9,7,8,4.0,1.0,3.0,2,3


In [9]:
features = GenEmpMgr.merge(Time_features, on="EmployeeID", how="left")

In [11]:
features.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeID', 'Gender', 'JobLevel',
       'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Emp_EnvironmentSatisfaction',
       'Emp_JobSatisfaction', 'Emp_WorkLifeBalance', 'Mgr_JobInvolvement',
       'Mgr_PerformanceRating', 'Time_in_min', 'Time_in_max', 'Time_in_mean',
       'Time_out_min', 'Time_out_max', 'Time_out_mean'],
      dtype='object')

In [17]:
features.dtypes

Age                              int64
Attrition                        Int64
BusinessTravel                  object
Department                      object
DistanceFromHome                 int64
Education                        int64
EducationField                  object
EmployeeID                       int64
Gender                          object
JobLevel                         int64
JobRole                         object
MaritalStatus                   object
MonthlyIncome                    int64
NumCompaniesWorked             float64
PercentSalaryHike                int64
StockOptionLevel                 int64
TotalWorkingYears              float64
TrainingTimesLastYear            int64
YearsAtCompany                   int64
YearsSinceLastPromotion          int64
YearsWithCurrManager             int64
Emp_EnvironmentSatisfaction    float64
Emp_JobSatisfaction            float64
Emp_WorkLifeBalance            float64
Mgr_JobInvolvement               int64
Mgr_PerformanceRating    

In [13]:
features.to_csv("features.csv", index=False)

In [None]:
# --- target encoding ---
features = features.copy()
features["Attrition"] = features["Attrition"].map({"Yes": 1, "No": 0}).astype("Int64")

target = "Attrition"

# --- split columns ---
num_cols = features.select_dtypes(include=["int64", "float64", "Int64"]).columns.drop(target)
cat_cols = features.select_dtypes(include=["object", "category"]).columns


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [12]:
features_full = features_full.rename(columns={
    # ID
    "EmployeeID": "employee_id",

    # Time-based features
    "weekly_hours_mean": "avg_weekly_hours",
    "hours_trend_slope": "weekly_hours_trend",
    "mean_hours_last_4w": "avg_weekly_hours_last_4w",
    "delta_recent_hours": "delta_weekly_hours_4w",
    "late_arrival_rate": "late_arrival_ratio",

    # HR core
    "Age": "age",
    "Attrition": "attrition",
    "BusinessTravel": "business_travel",
    "Department": "department",
    "DistanceFromHome": "distance_from_home",
    "Education": "education_level",
    "EducationField": "education_field",
    "Gender": "gender",
    "JobLevel": "job_level",
    "JobRole": "job_role",
    "MaritalStatus": "marital_status",
    "MonthlyIncome": "monthly_income",
    "NumCompaniesWorked": "num_companies_worked",
    "PercentSalaryHike": "percent_salary_hike",
    "StockOptionLevel": "stock_option_level",
    "TotalWorkingYears": "total_working_years",
    "TrainingTimesLastYear": "training_times_last_year",
    "YearsAtCompany": "years_at_company",
    "YearsSinceLastPromotion": "years_since_promotion",
    "YearsWithCurrManager": "years_with_manager",

    # Satisfaction / manager (already prefixed correctly)
    "Emp_EnvironmentSatisfaction": "environment_satisfaction",
    "Emp_JobSatisfaction": "job_satisfaction",
    "Emp_WorkLifeBalance": "work_life_balance",
    "Mgr_JobInvolvement": "job_involvement",
    "Mgr_PerformanceRating": "performance_rating",
})
features_full.dtypes

employee_id                   int64
avg_weekly_hours            float64
weekly_hours_max            float64
weekly_hours_std            float64
weekly_hours_cv             float64
weekly_hours_trend          float64
avg_weekly_hours_last_4w    float64
delta_weekly_hours_4w       float64
late_arrival_ratio          float64
start_time_variability      float64
age                           int64
attrition                    object
business_travel              object
department                   object
distance_from_home            int64
education_level               int64
education_field              object
gender                       object
job_level                     int64
job_role                     object
marital_status               object
monthly_income                int64
num_companies_worked        float64
percent_salary_hike           int64
stock_option_level            int64
total_working_years         float64
training_times_last_year      int64
years_at_company            

In [13]:
# y and X
y = features_full["attrition"].map({"Yes": 1, "No": 0}).astype(int)
X = features_full.drop(columns=["employee_id", "attrition"])

# split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# column groups
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns

# preprocess
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ]
)

# model (balanced)
log_reg = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

# fit + predict
log_reg.fit(X_train, y_train)
y_proba = log_reg.predict_proba(X_test)[:, 1]

# threshold (tune this)
threshold = 0.35
y_pred = (y_proba >= threshold).astype(int)

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

              precision    recall  f1-score   support

           0       0.96      0.59      0.73       925
           1       0.29      0.87      0.44       178

    accuracy                           0.64      1103
   macro avg       0.63      0.73      0.58      1103
weighted avg       0.85      0.64      0.68      1103

ROC AUC: 0.8207105982386881
