In [1]:
from google.colab import files
uploades=files.upload()

Saving mental_health_workplace_survey.csv to mental_health_workplace_survey.csv


We imported our data set.

In [25]:
import pandas as pd
import matplotlib.pyplot as mp
import seaborn as sb
import numpy as np
from sklearn.preprocessing import StandardScaler # for standardisation as we are dealing with regression
from sklearn.preprocessing import OneHotEncoder

df=pd.read_csv("mental_health_workplace_survey.csv")
print(df.head())
print("Let's check if there are any missing values")
print(df.isnull().sum())
print("there are: ",df.isnull().sum().sum(),"missing values")

   EmployeeID  Age      Gender    Country            JobRole Department  \
0        1001   50        Male         UK    Sales Associate         HR   
1        1002   36        Male    Germany  Software Engineer         IT   
2        1003   29  Non-binary      India           IT Admin         IT   
3        1004   42        Male  Australia      HR Specialist         IT   
4        1005   40        Male     Brazil   Customer Support    Support   

   YearsAtCompany  WorkHoursPerWeek RemoteWork  BurnoutLevel  ...  \
0              14                47         No          3.37  ...   
1               1                59     Hybrid          7.39  ...   
2              13                59     Hybrid          7.10  ...   
3              15                31        Yes          4.18  ...   
4               6                34        Yes          8.28  ...   

   CommuteTime  HasMentalHealthSupport  ManagerSupportScore  HasTherapyAccess  \
0          117                      No               

In [26]:
x=df.drop(columns=['StressLevel']) # since stress level is our target we need to remove it from scaling, encoding
y=df['StressLevel']
num_cols=x.select_dtypes(include=['int64','float64']).columns #this is a standard code line for finding numerical columns
print(num_cols)
scaler=StandardScaler()
x[num_cols]=scaler.fit_transform(x[num_cols]) # fit caluclates the statistics of the data and transform applies scaling using that data
print(df.head())

Index(['EmployeeID', 'Age', 'YearsAtCompany', 'WorkHoursPerWeek',
       'BurnoutLevel', 'JobSatisfaction', 'ProductivityScore', 'SleepHours',
       'PhysicalActivityHrs', 'CommuteTime', 'ManagerSupportScore',
       'MentalHealthDaysOff', 'WorkLifeBalanceScore', 'TeamSize',
       'CareerGrowthScore', 'BurnoutRisk'],
      dtype='object')
   EmployeeID  Age      Gender    Country            JobRole Department  \
0        1001   50        Male         UK    Sales Associate         HR   
1        1002   36        Male    Germany  Software Engineer         IT   
2        1003   29  Non-binary      India           IT Admin         IT   
3        1004   42        Male  Australia      HR Specialist         IT   
4        1005   40        Male     Brazil   Customer Support    Support   

   YearsAtCompany  WorkHoursPerWeek RemoteWork  BurnoutLevel  ...  \
0              14                47         No          3.37  ...   
1               1                59     Hybrid          7.39  ...   

In [27]:
categorical_cols = x.select_dtypes(include=['object', 'category']).columns

encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(x[categorical_cols])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols), index=x.index) #it makes sure that dataframe have these encoded data with new names at the same index
print(encoded_df.head())
#here we are adding num cols and encoded categorical cols , axis=1 adds columns i.e, to add along columns
X_processed = pd.concat([x[num_cols], encoded_df], axis=1)

   Gender_Female  Gender_Male  Gender_Non-binary  Gender_Prefer not to say  \
0            0.0          1.0                0.0                       0.0   
1            0.0          1.0                0.0                       0.0   
2            0.0          0.0                1.0                       0.0   
3            0.0          1.0                0.0                       0.0   
4            0.0          1.0                0.0                       0.0   

   Country_Australia  Country_Brazil  Country_Canada  Country_Germany  \
0                0.0             0.0             0.0              0.0   
1                0.0             0.0             0.0              1.0   
2                0.0             0.0             0.0              0.0   
3                1.0             0.0             0.0              0.0   
4                0.0             1.0             0.0              0.0   

   Country_India  Country_UK  ...  RemoteWork_Yes  HasMentalHealthSupport_No  \
0           

In [28]:
from sklearn.feature_selection import mutual_info_regression

mutual_info = mutual_info_regression(X_processed, y)
mutual_info = pd.Series(mutual_info, index=X_processed.columns).sort_values(ascending=False)
print('The mutuaal info score of features w.r.t target are: \n', mutual_info)
#let's set a threshold value and take top features.
threshold = 0.006
selected = mutual_info[mutual_info >= threshold].index
X_selected = X_processed[selected]
print('The selected features are : \n',X_selected)

The mutuaal info score of features w.r.t target are: 
 Country_Brazil                0.013619
SleepHours                    0.012124
Department_IT                 0.011222
BurnoutRisk                   0.010178
CareerGrowthScore             0.009555
ManagerSupportScore           0.008514
RemoteWork_Hybrid             0.008449
JobRole_HR Specialist         0.007826
Gender_Male                   0.007194
YearsAtCompany                0.006456
Country_Germany               0.005768
JobRole_Marketing Manager     0.005608
SalaryRange_40K-60K           0.005099
Department_HR                 0.004751
JobRole_Software Engineer     0.004534
JobRole_Data Scientist        0.004233
RemoteWork_Yes                0.004013
Department_Engineering        0.003047
Gender_Female                 0.003020
SalaryRange_80K-100K          0.002599
SalaryRange_100K+             0.002473
Department_Marketing          0.002355
Country_India                 0.002033
SalaryRange_<40K              0.001963
Country_U

In [29]:
#used for create 2 interaction features
df['WorkHoursPerWeek_vs_SleepHours']=df['WorkHoursPerWeek']/(df['SleepHours']+1e-5)
df['WorkLifeBalance_burnoutlevel']=df['WorkLifeBalanceScore']*df['BurnoutLevel']
interaction_feature=['WorkHoursPerWeek_vs_SleepHours','WorkLifeBalance_burnoutlevel']
df[interaction_feature]=StandardScaler().fit_transform(df[interaction_feature])
X_processed=pd.concat([X_processed,df[interaction_feature]], axis=1)
print(X_processed.head())

   EmployeeID       Age  YearsAtCompany  WorkHoursPerWeek  BurnoutLevel  \
0   -1.731474  0.835099        0.646445          0.293989     -0.831171   
1   -1.730319 -0.436487       -1.508004          1.707398      0.730817   
2   -1.729164 -1.072280        0.480718          1.707398      0.618137   
3   -1.728009  0.108478        0.812172         -1.590556     -0.516442   
4   -1.726855 -0.073177       -0.679370         -1.237204      1.076631   

   JobSatisfaction  ProductivityScore  SleepHours  PhysicalActivityHrs  \
0        -0.145528          -0.521468    0.319776             1.003164   
1        -1.325175          -0.682562    0.458507             1.387705   
2         0.667888           1.258235   -0.928806             1.632414   
3        -0.646685          -0.318183    1.498992             0.269039   
4        -1.194103          -1.303924   -1.622463            -0.604919   

   CommuteTime  ...  HasMentalHealthSupport_Yes  HasTherapyAccess_No  \
0     1.659954  ...             

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import r2_score,mean_squared_error
x=X_processed
y=df['StressLevel']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
Lreg = LinearRegression()
Lreg.fit(x_train, y_train)
x_train_pred = Lreg.predict(x_train)
pred1=r2_score(y_train, x_train_pred)
print('pred1= ',pred1)
x_test_pred = Lreg.predict(x_test)
pred2=r2_score(y_test, x_test_pred)
print('pred2= ',pred2)
#let's try lasso regression
lasreg=Lasso(alpha=0.01)
lasreg.fit(x_train,y_train)
x_train_pred=lasreg.predict(x_train)
pred1_L=r2_score(y_train,x_train_pred)
print('pred1_L= ',pred1_L)
x_test_pred=lasreg.predict(x_test)
pred2_L=r2_score(y_test,x_test_pred)
print('pred2_L= ',pred2_L)
#ridge regression
ridreg = Ridge(alpha=0.1)
ridreg.fit(x_train,y_train)
x_test_pred_rd = ridreg.predict(x_test)
pred1_R=r2_score(y_test, x_test_pred_rd)
print('pred1_R= ',pred1_R)
x_train_pred_rd = ridreg.predict(x_train)
pred2_R=r2_score(y_train, x_train_pred_rd)
print('pred2_R= ',pred2_R)

pred1=  0.028096949441416497
pred2=  -0.06553028703913766
pred1_L=  0.025429787525854453
pred2_L=  -0.048670935954792105
pred1_R=  -0.06551301277004673
pred2_R=  0.028096948096650753


So, out of all these models Ridge is the best. Because, it is the only one out of all to have R2 value of test set positive.