In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.feature_selection import mutual_info_classif

# Burnout-risk model comparison.
#
# Loads the workplace survey CSV, label-encodes its object-typed columns, and
# compares a decision tree, a random forest and k-NN on a stratified 80/20
# split. It does this twice: once with every feature, and once with only the
# three features the random forest ranks as most important.


def _make_models():
    """Return a fresh, unfitted instance of every classifier being compared."""
    return {
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "k-NN": KNeighborsClassifier(),
    }


def _fit_and_report(models, X_tr, X_te, X_tr_scaled, X_te_scaled, y_tr, y_te,
                    label_prefix=""):
    """Fit each model, print its classification report, return the predictions.

    k-NN is fitted and evaluated on the standardised matrices; the other
    models use the unscaled frames.

    Args:
        models: mapping of display name -> unfitted estimator (fitted in place).
        X_tr, X_te: unscaled train / test features.
        X_tr_scaled, X_te_scaled: standardised train / test features.
        y_tr, y_te: train / test targets.
        label_prefix: text inserted before the model name in the header line.

    Returns:
        dict mapping model name -> predictions on the test set.
    """
    predictions = {}
    for name, model in models.items():
        use_scaled = name == "k-NN"
        model.fit(X_tr_scaled if use_scaled else X_tr, y_tr)
        preds = model.predict(X_te_scaled if use_scaled else X_te)
        print(f"\n--- {label_prefix}{name} ---")
        print(classification_report(y_te, preds))
        predictions[name] = preds
    return predictions


# Load data
# NOTE(review): absolute, machine-specific path; the script only runs where
# this exact file exists.
DATA_PATH = r"C:\Users\sreek\OneDrive\Desktop\ACM\mental_health_workplace_survey.csv"
df = pd.read_csv(DATA_PATH)

# Drop ID column
df = df.drop(columns=["EmployeeID"])

# Encode categorical variables (every object-dtype column, one encoder each;
# the fitted encoders are kept so the integer codes can be mapped back later)
label_encoders = {}
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Define features and target
X = df.drop(columns=["BurnoutRisk"])
y = df["BurnoutRisk"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale data for k-NN (statistics are learned from the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train models with all features
# NOTE(review): the recorded run of this cell shows both tree models at 1.00
# on every metric while k-NN reaches 0.84. Perfect held-out scores often mean
# some feature directly encodes BurnoutRisk; verify before trusting the
# importances below.
models = _make_models()

print("=== Full Feature Models ===")
_fit_and_report(
    models, X_train, X_test, X_train_scaled, X_test_scaled, y_train, y_test
)

# Feature selection using Random Forest
rf_model = models["Random Forest"]
importances = pd.Series(
    rf_model.feature_importances_, index=X.columns
).sort_values(ascending=False)
top_3_features = importances.head(3).index.tolist()
print("\nTop 3 features:", top_3_features)

# Reduce to top 3 features.
# The reduced experiment reuses the rows of the split made above instead of
# re-splitting with a repeated seed, so both experiments are guaranteed to be
# evaluated on the same test rows.
X_reduced = X[top_3_features]
Xr_train, Xr_test = X_train[top_3_features], X_test[top_3_features]
yr_train, yr_test = y_train, y_test

# Scale reduced features for k-NN with a dedicated scaler, so `scaler` stays
# fitted on the full feature set.
reduced_scaler = StandardScaler()
Xr_train_scaled = reduced_scaler.fit_transform(Xr_train)
Xr_test_scaled = reduced_scaler.transform(Xr_test)

# Train models with top 3 features on fresh estimators: refitting `models`
# would silently turn `rf_model` into a 3-feature model and make its
# importances inconsistent with `X.columns`.
reduced_models = _make_models()

print("\n=== Reduced Feature Models (Top 3) ===")
_fit_and_report(
    reduced_models,
    Xr_train,
    Xr_test,
    Xr_train_scaled,
    Xr_test_scaled,
    yr_train,
    yr_test,
    label_prefix="Reduced ",
)


=== Full Feature Models ===

--- Decision Tree ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       404
           1       1.00      1.00      1.00       196

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600


--- Random Forest ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       404
           1       1.00      1.00      1.00       196

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600


--- k-NN ---
              precision    recall  f1-score   support

           0       0.86      0.92      0.89       404
           1       0.81      0.68      0.74       196

    accuracy                           0.84       600
   macro avg       0.83      0.80      0