In [68]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df=pd.read_csv('mental_health_workplace_survey.csv')
df.head()

Unnamed: 0,EmployeeID,Age,Gender,Country,JobRole,Department,YearsAtCompany,WorkHoursPerWeek,RemoteWork,BurnoutLevel,...,CommuteTime,HasMentalHealthSupport,ManagerSupportScore,HasTherapyAccess,MentalHealthDaysOff,SalaryRange,WorkLifeBalanceScore,TeamSize,CareerGrowthScore,BurnoutRisk
0,1001,50,Male,UK,Sales Associate,HR,14,47,No,3.37,...,117,No,3.15,Yes,8,40K-60K,8.82,6,9.2,0
1,1002,36,Male,Germany,Software Engineer,IT,1,59,Hybrid,7.39,...,8,Yes,4.4,Yes,4,80K-100K,2.8,45,8.46,1
2,1003,29,Non-binary,India,IT Admin,IT,13,59,Hybrid,7.1,...,75,No,3.63,No,6,80K-100K,7.28,7,7.96,1
3,1004,42,Male,Australia,HR Specialist,IT,15,31,Yes,4.18,...,43,Yes,4.5,Yes,9,60K-80K,1.31,11,8.9,0
4,1005,40,Male,Brazil,Customer Support,Support,6,34,Yes,8.28,...,58,Yes,5.51,Yes,6,<40K,1.17,18,8.88,1


In [69]:
from sklearn.metrics import accuracy_score

print("Before dropping features")
y=df['BurnoutRisk']
x=df.drop(['BurnoutRisk','BurnoutLevel'], axis=1)
X=pd.get_dummies(x)

scaler=StandardScaler()
numeric_cols=X.select_dtypes(include=['int64', 'float64']).columns
X[numeric_cols]=scaler.fit_transform(X[numeric_cols])
x_train,x_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=42)

decision=DecisionTreeClassifier(random_state=42)
decision.fit(x_train,y_train)
accuracy=accuracy_score(y_test, decision.predict(x_test))
print(f"Decision tree accuracy:{accuracy}")

randomforest=RandomForestClassifier(random_state=42)
randomforest.fit(x_train,y_train)
accuracy=accuracy_score(y_test,randomforest.predict(x_test))
print(f"Random forest accuracy:{accuracy}")

knn=KNeighborsClassifier()
knn.fit(x_train,y_train)
accuracy=accuracy_score(y_test,knn.predict(x_test))
print(f"kNN accuracy(best):{accuracy}")

Before dropping features
Decision tree accuracy:0.5283333333333333
Random forest accuracy:0.6666666666666666
kNN accuracy(best):0.5966666666666667


In [70]:
importance= randomforest.feature_importances_
importancedf=pd.Series(importance,index=X.columns)
importancedf=importancedf.sort_values(ascending=False)
print(importancedf.head(3))

top3=importancedf.head(3).index.tolist()
print("Top 3 features",top3)

ProductivityScore      0.058471
ManagerSupportScore    0.057015
StressLevel            0.056762
dtype: float64
Top 3 features ['ProductivityScore', 'ManagerSupportScore', 'StressLevel']


In [71]:
print("After dropping features by random forest importance")
X=X[top3]
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

decision.fit(x_train,y_train)
accuracy=accuracy_score(y_test,decision.predict(x_test))
print(f"Decision Tree accuracy when 3 feature:{accuracy}")
randomforest.fit(x_train,y_train)
accuracy=accuracy_score(y_test,randomforest.predict(x_test))    
print(f"Random Forest accuracy when 3 feature:{accuracy}")
knn.fit(x_train,y_train)
accuracy=accuracy_score(y_test,knn.predict(x_test))
print(f"kNN when 3 feature:{accuracy}")



After dropping features by random forest importance
Decision Tree accuracy when 3 feature:0.5366666666666666
Random Forest accuracy when 3 feature:0.6283333333333333
kNN when 3 feature:0.5783333333333334
