In [76]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import mutual_info_regression
import matplotlib.pyplot as plt

df = pd.read_csv("mental_health_workplace_survey.csv")
df.head(10)

Unnamed: 0,EmployeeID,Age,Gender,Country,JobRole,Department,YearsAtCompany,WorkHoursPerWeek,RemoteWork,BurnoutLevel,...,CommuteTime,HasMentalHealthSupport,ManagerSupportScore,HasTherapyAccess,MentalHealthDaysOff,SalaryRange,WorkLifeBalanceScore,TeamSize,CareerGrowthScore,BurnoutRisk
0,1001,50,Male,UK,Sales Associate,HR,14,47,No,3.37,...,117,No,3.15,Yes,8,40K-60K,8.82,6,9.2,0
1,1002,36,Male,Germany,Software Engineer,IT,1,59,Hybrid,7.39,...,8,Yes,4.4,Yes,4,80K-100K,2.8,45,8.46,1
2,1003,29,Non-binary,India,IT Admin,IT,13,59,Hybrid,7.1,...,75,No,3.63,No,6,80K-100K,7.28,7,7.96,1
3,1004,42,Male,Australia,HR Specialist,IT,15,31,Yes,4.18,...,43,Yes,4.5,Yes,9,60K-80K,1.31,11,8.9,0
4,1005,40,Male,Brazil,Customer Support,Support,6,34,Yes,8.28,...,58,Yes,5.51,Yes,6,<40K,1.17,18,8.88,1
5,1006,44,Prefer not to say,Germany,Project Manager,Support,3,58,Hybrid,3.12,...,23,Yes,2.56,No,6,40K-60K,5.06,38,4.32,0
6,1007,32,Prefer not to say,USA,Software Engineer,Engineering,17,30,Hybrid,5.15,...,62,No,4.54,Yes,9,100K+,6.91,12,9.76,0
7,1008,32,Male,Canada,Customer Support,Marketing,4,39,No,5.25,...,77,Yes,4.47,No,3,80K-100K,2.28,22,7.38,0
8,1009,45,Prefer not to say,Canada,Marketing Manager,Sales,5,49,Hybrid,4.07,...,112,No,3.57,No,3,80K-100K,7.87,3,4.33,0
9,1010,57,Prefer not to say,Brazil,Software Engineer,Engineering,6,59,Hybrid,9.59,...,40,Yes,9.99,No,2,60K-80K,2.16,19,4.98,1


In [77]:
#ordinalcolums
ordinal_cols = ['RemoteWork', 'HasMentalHealthSupport', 'HasTherapyAccess']
ordinal_map = {'No': 0, 'Hybrid': 1, 'Yes': 2}
df[ordinal_cols] = df[ordinal_cols].replace(ordinal_map)


  df[ordinal_cols] = df[ordinal_cols].replace(ordinal_map)


In [78]:
target = 'ProductivityScore'
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop(target)
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
df[numeric_cols].head()

Unnamed: 0,EmployeeID,Age,YearsAtCompany,WorkHoursPerWeek,RemoteWork,BurnoutLevel,JobSatisfaction,StressLevel,SleepHours,PhysicalActivityHrs,CommuteTime,HasMentalHealthSupport,ManagerSupportScore,HasTherapyAccess,MentalHealthDaysOff,WorkLifeBalanceScore,TeamSize,CareerGrowthScore,BurnoutRisk
0,-1.731474,0.835099,0.646445,0.293989,-1.230716,-0.831171,-0.145528,1.519164,0.319776,1.003164,1.659954,-0.979543,-0.882776,1.00736,1.210845,1.29482,-1.36331,1.433632,-0.696526
1,-1.730319,-0.436487,-1.508004,1.707398,0.000821,0.730817,-1.325175,0.051725,0.458507,1.387705,-1.471872,1.020885,-0.401445,1.00736,-0.190866,-1.018834,1.405769,1.145294,1.435697
2,-1.729164,-1.07228,0.480718,1.707398,0.000821,0.618137,0.667888,0.070933,-0.928806,1.632414,0.453195,-0.979543,-0.697945,-0.992693,0.50999,0.702955,-1.292308,0.950471,1.435697
3,-1.728009,0.108478,0.812172,-1.590556,1.232358,-0.516442,-0.646685,0.339835,1.498992,0.269039,-0.46624,1.020885,-0.362939,1.00736,1.561273,-1.591483,-1.0083,1.316738,-0.696526
4,-1.726855,-0.073177,-0.67937,-1.237204,1.232358,1.076631,-1.194103,-0.808762,-1.622463,-0.604919,-0.035255,1.020885,0.025976,1.00736,0.50999,-1.645288,-0.511286,1.308945,1.435697


In [79]:
df['Stress_WorkHrs'] = df['StressLevel'] * df['WorkHoursPerWeek']
df['Job_Sleep'] = df['JobSatisfaction'] / (df['SleepHours'] )
print(df[['Stress_WorkHrs', 'Job_Sleep']].head())

   Stress_WorkHrs  Job_Sleep
0        0.446618  -0.455095
1        0.088316  -2.890194
2        0.121110  -0.719082
3       -0.540527  -0.431413
4        1.000604   0.735982


In [80]:
import seaborn as sns
X = df.drop(columns=[target])
y = df[target]

numeric_df = df.select_dtypes(include=['number'])
correlation = numeric_df.corr()[target].abs().sort_values(ascending=False)
print(correlation)


ProductivityScore         1.000000
WorkHoursPerWeek          0.041243
JobSatisfaction           0.024895
BurnoutLevel              0.020342
Age                       0.019368
WorkLifeBalanceScore      0.017796
EmployeeID                0.017721
CommuteTime               0.017524
HasMentalHealthSupport    0.013641
BurnoutRisk               0.011809
HasTherapyAccess          0.010928
StressLevel               0.010176
Job_Sleep                 0.009421
ManagerSupportScore       0.009326
RemoteWork                0.008737
TeamSize                  0.007280
YearsAtCompany            0.005963
SleepHours                0.004881
Stress_WorkHrs            0.003364
CareerGrowthScore         0.002359
MentalHealthDaysOff       0.001483
PhysicalActivityHrs       0.000137
Name: ProductivityScore, dtype: float64


In [81]:
X_numeric = X.select_dtypes(include=['number'])
mutual_info = pd.Series(mutual_info_regression(X_numeric, y), index=X_numeric.columns)
mutual_info = mutual_info.sort_values(ascending=False)
print(mutual_info)

TeamSize                  0.025253
Age                       0.010429
HasTherapyAccess          0.008818
JobSatisfaction           0.008579
WorkLifeBalanceScore      0.008156
EmployeeID                0.005576
StressLevel               0.000756
YearsAtCompany            0.000000
WorkHoursPerWeek          0.000000
RemoteWork                0.000000
PhysicalActivityHrs       0.000000
BurnoutLevel              0.000000
SleepHours                0.000000
ManagerSupportScore       0.000000
HasMentalHealthSupport    0.000000
CommuteTime               0.000000
MentalHealthDaysOff       0.000000
CareerGrowthScore         0.000000
BurnoutRisk               0.000000
Stress_WorkHrs            0.000000
Job_Sleep                 0.000000
dtype: float64


In [82]:
top_corr_features = correlation.drop(target).head(15).index
top_mi_features = mutual_info.head(15).index
print(top_corr_features)
print(top_mi_features)
selected_features = list(set(top_corr_features).union(set(top_mi_features)))
print(selected_features)

Index(['WorkHoursPerWeek', 'JobSatisfaction', 'BurnoutLevel', 'Age',
       'WorkLifeBalanceScore', 'EmployeeID', 'CommuteTime',
       'HasMentalHealthSupport', 'BurnoutRisk', 'HasTherapyAccess',
       'StressLevel', 'Job_Sleep', 'ManagerSupportScore', 'RemoteWork',
       'TeamSize'],
      dtype='object')
Index(['TeamSize', 'Age', 'HasTherapyAccess', 'JobSatisfaction',
       'WorkLifeBalanceScore', 'EmployeeID', 'StressLevel', 'YearsAtCompany',
       'WorkHoursPerWeek', 'RemoteWork', 'PhysicalActivityHrs', 'BurnoutLevel',
       'SleepHours', 'ManagerSupportScore', 'HasMentalHealthSupport'],
      dtype='object')
['Age', 'JobSatisfaction', 'WorkLifeBalanceScore', 'ManagerSupportScore', 'BurnoutLevel', 'WorkHoursPerWeek', 'StressLevel', 'EmployeeID', 'BurnoutRisk', 'Job_Sleep', 'YearsAtCompany', 'RemoteWork', 'TeamSize', 'HasMentalHealthSupport', 'PhysicalActivityHrs', 'CommuteTime', 'HasTherapyAccess', 'SleepHours']


In [83]:
X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=42)

#l
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print("Linear Regression:   MSE =", mse_lr, ", R² =", r2_lr)


Linear Regression:   MSE = 6.526054801226059 , R² = -0.01113062773139517


In [84]:

# Ridge
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print("Ridge Regression:    MSE =", mse_ridge, ", R² =", r2_ridge)


Ridge Regression:    MSE = 6.5260088515979895 , R² = -0.011123508410811889


In [85]:

# Lasso
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print("Lasso Regression:    MSE =", mse_lasso, ", R² =", r2_lasso)

Lasso Regression:    MSE = 6.452447473078464 , R² = 0.0002739078084421642
