In [None]:
import pandas as pd
import numpy as np

In [71]:
Emp = pd.read_csv("employee_survey_data.csv")
Mgr = pd.read_csv("manager_survey_data.csv")
Gen = pd.read_csv("general_data.csv")
Tin = pd.read_csv("in_out_time/in_time.csv")
Tou = pd.read_csv("in_out_time/out_time.csv")

In [72]:
Tin = Tin.rename(columns={"Unnamed: 0": "EmployeeID"})
Tou = Tou.rename(columns={"Unnamed: 0": "EmployeeID"})

# ---------- TIN ----------
tin_dt = Tin.drop(columns=["EmployeeID"]).apply(pd.to_datetime, errors="coerce")
tin_sec = tin_dt.apply(lambda c: c.dt.hour * 3600 + c.dt.minute * 60 + c.dt.second)

tin_agg = pd.DataFrame({
    "EmployeeID": Tin["EmployeeID"],
    "Time_in_min":  tin_sec.min(axis=1),
    "Time_in_max":  tin_sec.max(axis=1),
    "Time_in_mean": tin_sec.mean(axis=1),
})

# ---------- TOUT ----------
tout_dt = Tou.drop(columns=["EmployeeID"]).apply(pd.to_datetime, errors="coerce")
tout_sec = tout_dt.apply(lambda c: c.dt.hour * 3600 + c.dt.minute * 60 + c.dt.second)

tout_agg = pd.DataFrame({
    "EmployeeID": Tou["EmployeeID"],
    "Time_out_min":  tout_sec.min(axis=1),
    "Time_out_max":  tout_sec.max(axis=1),
    "Time_out_mean": tout_sec.mean(axis=1),
})

# ---------- FINAL (numeric only) ----------
Time_features = tin_agg.merge(tout_agg, on="EmployeeID", how="inner")

Time_features

Unnamed: 0,EmployeeID,Time_in_min,Time_in_max,Time_in_mean,Time_out_min,Time_out_max,Time_out_mean
0,1,33187.0,38279.0,36005.120690,58690.0,65646.0,62550.262931
1,2,33159.0,38609.0,35957.063559,59223.0,67793.0,63745.351695
2,3,32903.0,39147.0,36090.219008,57504.0,65271.0,61337.884298
3,4,33123.0,39044.0,35934.587234,57769.0,65416.0,61831.829787
4,5,33482.0,39295.0,35994.575510,60791.0,69721.0,64816.804082
...,...,...,...,...,...,...,...
4405,4406,33334.0,38815.0,36071.242798,62857.0,71994.0,66751.440329
4406,4407,33922.0,38525.0,36015.294606,54084.0,62145.0,57949.929461
4407,4408,33020.0,39182.0,35927.904762,59644.0,68675.0,63671.779221
4408,4409,32657.0,38575.0,36066.767635,66429.0,73867.0,70240.107884


In [73]:
df_view = Time_features.copy()

for c in [
    "Time_in_min","Time_in_max","Time_in_mean",
    "Time_out_min","Time_out_max","Time_out_mean"
]:
    sec = (df_view[c] % 86400).astype(int) 
    df_view[c] = (
        (sec // 3600).astype(str).str.zfill(2) + ":" +
        ((sec % 3600) // 60).astype(str).str.zfill(2) + ":" +
        (sec % 60).astype(str).str.zfill(2)
    )

df_view


Unnamed: 0,EmployeeID,Time_in_min,Time_in_max,Time_in_mean,Time_out_min,Time_out_max,Time_out_mean
0,1,09:13:07,10:37:59,10:00:05,16:18:10,18:14:06,17:22:30
1,2,09:12:39,10:43:29,09:59:17,16:27:03,18:49:53,17:42:25
2,3,09:08:23,10:52:27,10:01:30,15:58:24,18:07:51,17:02:17
3,4,09:12:03,10:50:44,09:58:54,16:02:49,18:10:16,17:10:31
4,5,09:18:02,10:54:55,09:59:54,16:53:11,19:22:01,18:00:16
...,...,...,...,...,...,...,...
4405,4406,09:15:34,10:46:55,10:01:11,17:27:37,19:59:54,18:32:31
4406,4407,09:25:22,10:42:05,10:00:15,15:01:24,17:15:45,16:05:49
4407,4408,09:10:20,10:53:02,09:58:47,16:34:04,19:04:35,17:41:11
4408,4409,09:04:17,10:42:55,10:01:06,18:27:09,20:31:07,19:30:40


In [74]:
# --- Rename to avoid collisions (safe if columns missing) ---
Emp = Emp.rename(columns={
    "EnvironmentSatisfaction": "Emp_EnvironmentSatisfaction",
    "JobSatisfaction": "Emp_JobSatisfaction",
    "WorkLifeBalance": "Emp_WorkLifeBalance",
})

Mgr = Mgr.rename(columns={
    "JobInvolvement": "Mgr_JobInvolvement",
    "PerformanceRating": "Mgr_PerformanceRating",
})

# --- Ensure numeric manager columns (robust coercion) ---
for c in ["Mgr_JobInvolvement", "Mgr_PerformanceRating"]:
    if c in Mgr.columns:
        Mgr[c] = pd.to_numeric(Mgr[c], errors="coerce")

# --- Merge ---
EmpMgr = Emp.merge(Mgr, on="EmployeeID", how="left")
GenEmpMgr = Gen.merge(EmpMgr, on="EmployeeID", how="left")

# --- Drop constant/useless columns safely ---
GenEmpMgr = GenEmpMgr.drop(columns=["Over18", "EmployeeCount", "StandardHours"], errors="ignore")

GenEmpMgr

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeID,Gender,JobLevel,...,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,Emp_EnvironmentSatisfaction,Emp_JobSatisfaction,Emp_WorkLifeBalance,Mgr_JobInvolvement,Mgr_PerformanceRating
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,Female,1,...,1.0,6,1,0,0,3.0,4.0,2.0,3,3
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,2,Female,1,...,6.0,3,5,1,4,3.0,2.0,4.0,2,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,3,Male,4,...,5.0,2,5,0,3,2.0,2.0,1.0,3,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,4,Male,3,...,13.0,5,8,7,5,4.0,4.0,3.0,2,3
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,5,Male,1,...,9.0,2,6,0,4,4.0,1.0,3.0,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,42,No,Travel_Rarely,Research & Development,5,4,Medical,4406,Female,1,...,10.0,5,3,0,2,4.0,1.0,3.0,3,3
4406,29,No,Travel_Rarely,Research & Development,2,4,Medical,4407,Male,1,...,10.0,2,3,0,2,4.0,4.0,3.0,2,3
4407,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,4408,Male,2,...,5.0,4,4,1,2,1.0,3.0,3.0,3,4
4408,42,No,Travel_Rarely,Sales,18,2,Medical,4409,Male,1,...,10.0,2,9,7,8,4.0,1.0,3.0,2,3


In [75]:
features = GenEmpMgr.merge(Time_features, on="EmployeeID", how="left")

In [76]:
features.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeID', 'Gender', 'JobLevel',
       'JobRole', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'Emp_EnvironmentSatisfaction',
       'Emp_JobSatisfaction', 'Emp_WorkLifeBalance', 'Mgr_JobInvolvement',
       'Mgr_PerformanceRating', 'Time_in_min', 'Time_in_max', 'Time_in_mean',
       'Time_out_min', 'Time_out_max', 'Time_out_mean'],
      dtype='object')

In [78]:
features.to_csv("features.csv", index=False)

In [79]:
# Strict encoding: NA after mapping = ERROR
s = features["Attrition"].astype(str).str.strip()

mapped = s.map({"Yes": 1, "No": 0})

# If anything didn't map, stop and show the offending values
if mapped.isna().any():
    bad = s[mapped.isna()].unique()
    raise ValueError(f"Attrition mapping failed. Unexpected values: {bad}")

# Now it's safe to use numpy int64 (no NA possible)
features["Attrition"] = mapped.astype("int64")

In [81]:
features.dtypes

Age                              int64
Attrition                        int64
BusinessTravel                  object
Department                      object
DistanceFromHome                 int64
Education                        int64
EducationField                  object
EmployeeID                       int64
Gender                          object
JobLevel                         int64
JobRole                         object
MaritalStatus                   object
MonthlyIncome                    int64
NumCompaniesWorked             float64
PercentSalaryHike                int64
StockOptionLevel                 int64
TotalWorkingYears              float64
TrainingTimesLastYear            int64
YearsAtCompany                   int64
YearsSinceLastPromotion          int64
YearsWithCurrManager             int64
Emp_EnvironmentSatisfaction    float64
Emp_JobSatisfaction            float64
Emp_WorkLifeBalance            float64
Mgr_JobInvolvement               int64
Mgr_PerformanceRating    

In [84]:
target = "Attrition"

In [85]:
from scipy.stats import mannwhitneyu

num_cols = (
    features
    .select_dtypes(include=["int64", "float64"])
    .columns
    .drop([target, "EmployeeID"])
)

mw_results = []

for col in num_cols:
    g0 = features.loc[features[target] == 0, col]
    g1 = features.loc[features[target] == 1, col]

    if g0.notna().any() and g1.notna().any():
        _, p = mannwhitneyu(g0, g1, alternative="two-sided")
        mw_results.append({
            "feature": col,
            "p_value": p,
            "median_no": g0.median(),
            "median_yes": g1.median()
        })

mw_df = pd.DataFrame(mw_results).sort_values("p_value")
mw_df

Unnamed: 0,feature,p_value,median_no,median_yes
23,Time_out_mean,7.817653e-37,62340.178723,65455.02439
10,YearsAtCompany,1.20952e-36,6.0,3.0
21,Time_out_min,1.806233e-36,58186.0,61369.0
22,Time_out_max,4.32397e-36,66487.0,69601.0
12,YearsWithCurrManager,2.473097e-31,3.0,2.0
0,Age,5.990318e-30,36.0,32.0
11,YearsSinceLastPromotion,0.0004042361,1.0,1.0
9,TrainingTimesLastYear,0.01033591,3.0,3.0
6,PercentSalaryHike,0.03732026,14.0,14.0
4,MonthlyIncome,0.1071546,49300.0,49080.0


In [90]:
very_strong = []
strong = []
moderate = []
weak = []
not_significant = []
not_testable = []

for _, row in mw_df.iterrows():
    p = row["p_value"]
    feature = row["feature"]

    match p:
        case _ if pd.isna(p):
            not_testable.append(feature)

        case _ if p < 1e-10:
            very_strong.append(feature)

        case _ if p < 1e-5:
            strong.append(feature)

        case _ if p < 0.01:
            moderate.append(feature)

        case _ if p < 0.05:
            weak.append(feature)

        case _:
            not_significant.append(feature)

very_strong, strong, moderate, weak, not_significant, not_testable


(['Time_out_mean',
  'YearsAtCompany',
  'Time_out_min',
  'Time_out_max',
  'YearsWithCurrManager',
  'Age'],
 [],
 ['YearsSinceLastPromotion'],
 ['TrainingTimesLastYear', 'PercentSalaryHike'],
 ['MonthlyIncome',
  'Mgr_PerformanceRating',
  'Education',
  'Mgr_JobInvolvement',
  'StockOptionLevel',
  'Time_in_min',
  'JobLevel',
  'DistanceFromHome',
  'Time_in_max',
  'Time_in_mean'],
 ['NumCompaniesWorked',
  'TotalWorkingYears',
  'Emp_EnvironmentSatisfaction',
  'Emp_JobSatisfaction',
  'Emp_WorkLifeBalance'])

In [86]:
from scipy.stats import chi2_contingency

cat_cols = (
    features
    .select_dtypes(include=["object", "category"])
    .columns
)

chi2_results = []

for col in cat_cols:
    table = pd.crosstab(features[col], features[target])

    if table.shape[0] > 1:
        chi2, p, _, _ = chi2_contingency(table)

        n = table.to_numpy().sum()
        k = min(table.shape) - 1
        cramer_v = np.sqrt(chi2 / (n * k)) if k > 0 else 0

        chi2_results.append({
            "feature": col,
            "p_value": p,
            "cramers_v": cramer_v
        })

chi2_df = pd.DataFrame(chi2_results).sort_values("p_value")
chi2_df


Unnamed: 0,feature,p_value,cramers_v
5,MaritalStatus,8.453859000000001e-31,0.177211
0,BusinessTravel,1.764277e-16,0.12826
2,EducationField,8.288917e-09,0.102348
1,Department,4.820888e-07,0.081218
4,JobRole,0.001485545,0.075467
3,Gender,0.2452948,0.017496


In [88]:
# keep only numeric features that were significant in step 1
relevant_num = mw_df.loc[mw_df["p_value"] < 0.05, "feature"]

corr = features[relevant_num].corr().abs()

high_corr_pairs = (
    corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
        .stack()
        .reset_index()
)

high_corr_pairs.columns = ["feature_1", "feature_2", "correlation"]

high_corr_pairs = high_corr_pairs[
    high_corr_pairs["correlation"] > 0.8
].sort_values("correlation", ascending=False)


high_corr_pairs

Unnamed: 0,feature_1,feature_2,correlation
1,Time_out_mean,Time_out_min,0.992718
2,Time_out_mean,Time_out_max,0.992701
15,Time_out_min,Time_out_max,0.984898
