In [1]:
from google.colab import files
uploades=files.upload()

Saving mental_health_workplace_survey.csv to mental_health_workplace_survey.csv


We imported our data set.

In [2]:
import pandas as pd
import matplotlib.pyplot as mp
import seaborn as sb
import numpy as np
from sklearn.preprocessing import StandardScaler # for standardisation as we are dealing with regression
from sklearn.preprocessing import OneHotEncoder

df=pd.read_csv("mental_health_workplace_survey.csv")
print(df.head())
print("Let's check if there are any missing values")
print(df.isnull().sum())
print("there are: ",df.isnull().sum().sum(),"missing values")

   EmployeeID  Age      Gender    Country            JobRole Department  \
0        1001   50        Male         UK    Sales Associate         HR   
1        1002   36        Male    Germany  Software Engineer         IT   
2        1003   29  Non-binary      India           IT Admin         IT   
3        1004   42        Male  Australia      HR Specialist         IT   
4        1005   40        Male     Brazil   Customer Support    Support   

   YearsAtCompany  WorkHoursPerWeek RemoteWork  BurnoutLevel  ...  \
0              14                47         No          3.37  ...   
1               1                59     Hybrid          7.39  ...   
2              13                59     Hybrid          7.10  ...   
3              15                31        Yes          4.18  ...   
4               6                34        Yes          8.28  ...   

   CommuteTime  HasMentalHealthSupport  ManagerSupportScore  HasTherapyAccess  \
0          117                      No               

In [3]:
x=df.drop(columns=['StressLevel']) # since stress level is our target we need to remove it from scaling, encoding
y=df['StressLevel']
num_cols=x.select_dtypes(include=['int64','float64']).columns #this is a standard code line for finding numerical columns
print(num_cols)
scaler=StandardScaler()
df[num_cols]=scaler.fit_transform(df[num_cols]) # fit caluclates the statistics of the data and transform applies scaling using that data
print(df.head())

Index(['EmployeeID', 'Age', 'YearsAtCompany', 'WorkHoursPerWeek',
       'BurnoutLevel', 'JobSatisfaction', 'ProductivityScore', 'SleepHours',
       'PhysicalActivityHrs', 'CommuteTime', 'ManagerSupportScore',
       'MentalHealthDaysOff', 'WorkLifeBalanceScore', 'TeamSize',
       'CareerGrowthScore', 'BurnoutRisk'],
      dtype='object')
   EmployeeID       Age      Gender    Country            JobRole Department  \
0   -1.731474  0.835099        Male         UK    Sales Associate         HR   
1   -1.730319 -0.436487        Male    Germany  Software Engineer         IT   
2   -1.729164 -1.072280  Non-binary      India           IT Admin         IT   
3   -1.728009  0.108478        Male  Australia      HR Specialist         IT   
4   -1.726855 -0.073177        Male     Brazil   Customer Support    Support   

   YearsAtCompany  WorkHoursPerWeek RemoteWork  BurnoutLevel  ...  \
0        0.646445          0.293989         No     -0.831171  ...   
1       -1.508004          1.707398   

In [4]:
categorical_cols = x.select_dtypes(include=['object', 'category']).columns

encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(x[categorical_cols])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols), index=x.index) #it makes sure that dataframe have these encoded data with new names at the same index
print(encoded_df.head())
#here we are adding num cols and encoded categorical cols , axis=1 adds columns i.e, to add along columns
X_processed = pd.concat([x[num_cols], encoded_df], axis=1)

   Gender_Female  Gender_Male  Gender_Non-binary  Gender_Prefer not to say  \
0            0.0          1.0                0.0                       0.0   
1            0.0          1.0                0.0                       0.0   
2            0.0          0.0                1.0                       0.0   
3            0.0          1.0                0.0                       0.0   
4            0.0          1.0                0.0                       0.0   

   Country_Australia  Country_Brazil  Country_Canada  Country_Germany  \
0                0.0             0.0             0.0              0.0   
1                0.0             0.0             0.0              1.0   
2                0.0             0.0             0.0              0.0   
3                1.0             0.0             0.0              0.0   
4                0.0             1.0             0.0              0.0   

   Country_India  Country_UK  ...  RemoteWork_Yes  HasMentalHealthSupport_No  \
0           

In [5]:
from sklearn.feature_selection import mutual_info_regression

mutual_info = mutual_info_regression(X_processed, y)
mutual_info = pd.Series(mutual_info, index=X_processed.columns).sort_values(ascending=False)
print('The mutuaal info score of features w.r.t target are: \n', mutual_info)
#let's set a threshold value and take top features.
threshold = 0.006
selected = mutual_info[mutual_info >= threshold].index
X_selected = X_processed[selected]
print('The selected features are : \n',X_selected)

The mutuaal info score of features w.r.t target are: 
 Department_IT                 0.014305
RemoteWork_Yes                0.013879
RemoteWork_Hybrid             0.012015
CareerGrowthScore             0.010335
SleepHours                    0.010035
SalaryRange_40K-60K           0.009509
Country_Germany               0.008685
SalaryRange_60K-80K           0.008281
SalaryRange_100K+             0.008009
ManagerSupportScore           0.007937
Department_Marketing          0.007649
Department_Engineering        0.007226
JobRole_Software Engineer     0.005665
Country_UK                    0.002814
Department_Support            0.002791
JobRole_IT Admin              0.002690
Country_Brazil                0.002539
Country_Australia             0.001831
HasTherapyAccess_Yes          0.001472
ProductivityScore             0.001448
Department_HR                 0.001002
Country_India                 0.000571
JobRole_HR Specialist         0.000502
Gender_Male                   0.000000
Gender_Fe

In [12]:
#used for feature selection
df['WorkHoursPerWeek_vs_SleepHours']=df['WorkHoursPerWeek']/(df['SleepHours']+1e-5)
interaction_feat_1=['WorkHoursPerWeek_vs_SleepHours']
df[interaction_feat_1]=StandardScaler().fit_transform(df[interaction_feat_1])
X_processed=pd.concat([X_processed,df[interaction_feat_1]], axis=1)
print(X_processed.head())

   EmployeeID  Age  YearsAtCompany  WorkHoursPerWeek  BurnoutLevel  \
0        1001   50              14                47          3.37   
1        1002   36               1                59          7.39   
2        1003   29              13                59          7.10   
3        1004   42              15                31          4.18   
4        1005   40               6                34          8.28   

   JobSatisfaction  ProductivityScore  SleepHours  PhysicalActivityHrs  \
0             5.06               4.16         7.0                  7.9   
1             2.00               3.74         7.2                  9.0   
2             7.17               8.80         5.2                  9.7   
3             3.76               4.69         8.7                  5.8   
4             2.34               2.12         4.2                  3.3   

   CommuteTime  ...  HasTherapyAccess_Yes  SalaryRange_100K+  \
0          117  ...                   1.0                0.0   
1     