In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# 1-Load and preprocess data
# Read the survey CSV, then separate the candidate predictors from the target.
df = pd.read_csv("/content/mental_health_workplace_survey.csv")

# NOTE: the order of this list fixes the column order of X, and therefore the
# order of the encoded features the downstream model sees. Keep it stable.
all_features = [
    'WorkHoursPerWeek', 'JobSatisfaction', 'SleepHours', 'Age',
    'YearsAtCompany', 'SalaryRange', 'TeamSize', 'StressLevel',
    'ProductivityScore', 'BurnoutLevel', 'PhysicalActivityHrs', 'CommuteTime',
    'HasMentalHealthSupport', 'ManagerSupportScore', 'HasTherapyAccess',
    'MentalHealthDaysOff', 'WorkLifeBalanceScore', 'CareerGrowthScore',
    'Gender', 'Country', 'JobRole', 'Department', 'RemoteWork',
]
y = df['BurnoutRisk']
X = df[all_features]

# Define categorical and numerical columns
# Anything with a numeric dtype is scaled; every other column is one-hot encoded.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
cat_cols = list(X.select_dtypes(exclude=[np.number]).columns)

# Create preprocessor for all features
# The transformer names ('cat', 'num') and their order (categorical first) are
# relied on later when encoded feature names are reconstructed.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), cat_cols)
numeric_step = ('num', StandardScaler(), num_cols)
preprocessor = ColumnTransformer(transformers=[categorical_step, numeric_step])
# 2-Selecting top 3 features using Random Forest
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', RandomForestClassifier(random_state=46))
])

# Rank features on the training rows only, so the rows later used to report
# test accuracy never influence which features are chosen. The split arguments
# deliberately mirror the train/test split used for the final model
# (test_size=0.3, random_state=46, stratify=y); with identical arguments the
# same rows are held out. Keep the two calls in sync.
X_select, _, y_select, _ = train_test_split(
    X, y, test_size=0.3, random_state=46, stratify=y
)
rf_pipeline.fit(X_select, y_select)

# Get feature importances
# Encoded feature order follows the ColumnTransformer: one-hot columns first,
# then the scaled numeric columns. With no categorical columns the encoder is
# never fitted, so skip the name lookup in that case.
fitted_preprocessor = rf_pipeline.named_steps['preprocessor']
if cat_cols:
    ohe_feature_names = fitted_preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols)
else:
    ohe_feature_names = []
all_feature_names = list(ohe_feature_names) + num_cols
importances = pd.Series(rf_pipeline.named_steps['rf'].feature_importances_, index=all_feature_names)
top3_features = importances.nlargest(3).index.tolist()


def _original_column(encoded_name):
    """Return the source column of an encoded feature name, or None if unknown."""
    if encoded_name in num_cols:
        return encoded_name
    # One-hot names look like "<column>_<category>". Prefer the longest matching
    # column so that a column whose name is a prefix of another cannot win.
    candidates = [col for col in cat_cols if encoded_name.startswith(f"{col}_")]
    return max(candidates, key=len) if candidates else None


# Map back to original features
# Walk the encoded features in descending importance until three DISTINCT
# original columns are collected. Looking only at the top 3 encoded features
# can yield fewer than three columns when several one-hot dummies of the same
# categorical rank at the top.
original_top3 = []
for feature in importances.nlargest(len(importances)).index:
    source_col = _original_column(feature)
    if source_col is not None and source_col not in original_top3:
        original_top3.append(source_col)
    if len(original_top3) == 3:
        break

print("Top 3 Selected Features (after preprocessing):", top3_features)
print("Corresponding Original Top 3 Features:", original_top3)

# 3-Train minimal Logistic Regression model on top 3 features
# NOTE(review): if any selected column is derived from, or directly defines,
# BurnoutRisk, the scores below will be inflated - confirm against the data
# dictionary before trusting them.
X_top3 = df[original_top3]

# Split data
# Stratified 70/30 hold-out with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_top3,
    y,
    test_size=0.3,
    stratify=y,
    random_state=46,
)

# Create preprocessor for top 3 features
# Same recipe as the full-feature preprocessor, restricted to the chosen columns.
top3_num_cols = list(X_top3.select_dtypes(include=[np.number]).columns)
top3_cat_cols = list(X_top3.select_dtypes(exclude=[np.number]).columns)
top3_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), top3_cat_cols),
        ('num', StandardScaler(), top3_num_cols),
    ]
)

# Create and train Logistic Regression pipeline
# Preprocessing lives inside the pipeline, so it is fitted on X_train only.
log_reg_pipeline = Pipeline(
    steps=[
        ('preprocessor', top3_preprocessor),
        ('classifier', LogisticRegression(max_iter=1500, random_state=46)),
    ]
)
log_reg_pipeline.fit(X_train, y_train)

# Evaluate model
y_pred = log_reg_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Cross-validation
# Cross-validate the whole pipeline rather than pre-transformed data: the
# encoder and scaler are then re-fitted inside each training fold, so no
# statistics from a validation fold leak into training. cross_val_score works
# on clones of the estimator, so the already-fitted log_reg_pipeline (and the
# top3_preprocessor object it holds) is left untouched.
cv_scores = cross_val_score(log_reg_pipeline, X_top3, y, cv=5, scoring='accuracy')

# Kept so any later cell that reads X_top3_processed still finds it. Uses
# transform() only, so the preprocessor fitted on the training split is not
# re-fitted on the full dataset.
X_top3_processed = log_reg_pipeline.named_steps['preprocessor'].transform(X_top3)

# 4-results
# Convert the fractional scores to percentages once, then print the summary.
test_accuracy_pct = accuracy * 100
cv_mean_pct = cv_scores.mean() * 100
cv_std_pct = cv_scores.std() * 100

print(" Minimal Burnout Model")
print(" Original Features:", original_top3)
print(f"Accuracy on Test Set: {test_accuracy_pct:.3f}%")
print(f"Cross-Validation Accuracy: {cv_mean_pct:.2f}% (+/- {cv_std_pct:.2f}%)")
print("The top 3 features are the best  predictors of burnout risk, enabling a simple, interpretable model.")

Top 3 Selected Features (after preprocessing): ['BurnoutLevel', 'ProductivityScore', 'StressLevel']
Corresponding Original Top 3 Features: ['BurnoutLevel', 'ProductivityScore', 'StressLevel']
 Minimal Burnout Model
 Original Features: ['BurnoutLevel', 'ProductivityScore', 'StressLevel']
Accuracy on Test Set: 99.889%
Cross-Validation Accuracy: 99.80% (+/- 0.19%)
The top 3 features are the best  predictors of burnout risk, enabling a simple, interpretable model.
