In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score

In [18]:
df = pd.read_csv("/content/mental_health_workplace_survey.csv")

df.drop(columns=["EmployeeID", "Country", "JobRole"], inplace=True)

We are dropping unnesessary columns like employee id country and job role as they dont have any meaning and are of high cardinality

In [19]:
for col in df.select_dtypes(include='object'):
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
df.dropna(inplace=True)

In [20]:
X = df.drop("BurnoutRisk", axis=1)
y = df["BurnoutRisk"]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [21]:
# Decision Tree
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)
print("Decision Tree - Accuracy:", accuracy_score(y_test, dt_preds))

Decision Tree - Accuracy: 1.0


In [22]:
# Random Forest
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)
print("Random Forest - Accuracy:", accuracy_score(y_test, rf_preds))


Random Forest - Accuracy: 1.0


In [23]:
#KNN
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_preds = knn.predict(X_test)
print("k-NN - Accuracy:", accuracy_score(y_test, knn_preds))

k-NN - Accuracy: 0.7173333333333334


In [24]:
mi = mutual_info_classif(X, y, random_state=0)
mi_scores = pd.Series(mi, index=X.columns).sort_values(ascending=False)
top3 = mi_scores.head(3).index
print("Top 3 Features:")
print( mi_scores.head(3))

Top 3 Features:
BurnoutLevel    0.631643
Age             0.018457
SleepHours      0.010094
dtype: float64


In [25]:
X_train3 = X_train[top3]
X_test3 = X_test[top3]

In [26]:
#dec tree
dt.fit(X_train3, y_train)
dt_preds3 = dt.predict(X_test3)
print("Decision Tree (Top 3) - Accuracy:", accuracy_score(y_test, dt_preds3))


Decision Tree (Top 3) - Accuracy: 1.0


In [27]:
# random for
rf.fit(X_train3, y_train)
rf_preds3 = rf.predict(X_test3)
print("Random Forest (Top 3) - Accuracy:", accuracy_score(y_test, rf_preds3))


Random Forest (Top 3) - Accuracy: 1.0


In [28]:
#knn
knn.fit(X_train3, y_train)
knn_preds3 = knn.predict(X_test3)
print("k-NN (Top 3) - Accuracy:", accuracy_score(y_test, knn_preds3))


k-NN (Top 3) - Accuracy: 0.9706666666666667


In [29]:
print('comparison')
print('decision tree ',accuracy_score(y_test, dt_preds), accuracy_score(y_test, dt_preds3))
print('random forest ',accuracy_score(y_test, rf_preds), accuracy_score(y_test, rf_preds3))
print('knn ',accuracy_score(y_test, knn_preds), accuracy_score(y_test, knn_preds3))

comparison
decision tree  1.0 1.0
random forest  1.0 1.0
knn  0.7173333333333334 0.9706666666666667
