<a href="https://colab.research.google.com/github/PunitRaveendran/ACM/blob/main/Feature%20Showdown.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# -------------------------------
# 3-Feature Showdown – Burnout Prediction
# -------------------------------

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# -----------------------------------
# 1. Load your burnout dataset
# -----------------------------------
df = pd.read_csv("/content/mental_health_workplace_survey.csv")

# Corrected feature names based on the dataframe
all_features = ['WorkHoursPerWeek', 'JobSatisfaction', 'SleepHours',
                'Age', 'YearsAtCompany', 'SalaryRange', 'TeamSize',
                'StressLevel', 'ProductivityScore', 'BurnoutLevel',
                'PhysicalActivityHrs', 'CommuteTime', 'HasMentalHealthSupport',
                'ManagerSupportScore', 'HasTherapyAccess', 'MentalHealthDaysOff',
                'WorkLifeBalanceScore', 'CareerGrowthScore', 'Gender', 'Country',
                'JobRole', 'Department', 'RemoteWork']

# Corrected target variable name
y = df['BurnoutRisk']
X_all = df[all_features]

# Identify categorical and numerical columns for preprocessing
cat_cols = X_all.select_dtypes(exclude=[np.number]).columns.tolist()
num_cols = X_all.select_dtypes(include=[np.number]).columns.tolist()

# Preprocessing pipeline for all features
preprocessor_all = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), cat_cols),
        ('num', StandardScaler(), num_cols)
    ],
    remainder='passthrough')

X_all_processed = preprocessor_all.fit_transform(X_all)


# -----------------------------------
# 2. Selecting top 3 features using Random Forest feature importance
# -----------------------------------
rf = RandomForestClassifier(random_state=46)
rf.fit(X_all_processed, y)

# Get feature names after one-hot encoding
ohe_feature_names = preprocessor_all.named_transformers_['cat'].get_feature_names_out(cat_cols)
all_processed_feature_names = list(ohe_feature_names) + num_cols

importances = pd.Series(rf.feature_importances_, index=all_processed_feature_names)
importances = importances.sort_values(ascending=False)

top3_features_processed = importances.head(3).index.tolist()
print("Top 3 Selected Features (after preprocessing):", top3_features_processed)


# To map back to original features, we can inspect the selected processed feature names
# and see which original columns they correspond to.
original_top3_features = []
for feature in top3_features_processed:
    if feature in num_cols and feature not in original_top3_features:
        original_top3_features.append(feature)
    else:
        for col in cat_cols:
            if feature.startswith(col + '_') and col not in original_top3_features:
                original_top3_features.append(col)
                break # Stop searching once the original column is found


print("Corresponding Original Top 3 Features:", original_top3_features)


# -----------------------------------
# 3. Train minimal model (Logistic Regression) on these 3 features
# -----------------------------------
X_top3_orig = df[original_top3_features]

# Create a new preprocessor for the top 3 original features
top3_cat_cols_orig = X_top3_orig.select_dtypes(exclude=[np.number]).columns.tolist()
top3_num_cols_orig = X_top3_orig.select_dtypes(include=[np.number]).columns.tolist()

preprocessor_top3 = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), top3_cat_cols_orig),
        ('num', StandardScaler(), top3_num_cols_orig)
    ],
    remainder='passthrough')


X_train, X_test, y_train, y_test = train_test_split(X_top3_orig, y, test_size=0.3, random_state=46, stratify=y)


# Create pipeline for Logistic Regression with top 3 features
log_reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor_top3), ('classifier', LogisticRegression(max_iter=1500, random_state=46))])

log_reg_pipeline.fit(X_train, y_train)

y_pred = log_reg_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# -----------------------------------
# 4. Print results
# -----------------------------------
print("\n--- Minimal Burnout Model ---")
print("Chosen Original Features:", original_top3_features)
print("Accuracy on Test Set: {:.3f}%".format(accuracy * 100))

#Cross-validation for robustness
# Apply preprocessing before cross-validation
X_top3_processed_cv = preprocessor_top3.fit_transform(X_top3_orig)
cv_scores = cross_val_score(LogisticRegression(max_iter=1500, random_state=46), X_top3_processed_cv, y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy: {:.2f}% (+/- {:.2f}%)".format(cv_scores.mean()*100, cv_scores.std()*100))

# -----------------------------------
# 5. Reasoning
# -----------------------------------
print("The selected top 3 features are likely  to be strong indicators of burnout risk based on Random Forest's conclusion of their predictive power.")
print("Training a minimal model on these features helps understand the core drivers of burnout in the dataset and potentially uilding a simpler,more  interpretabble model")


Top 3 Selected Features (after preprocessing): ['BurnoutLevel', 'ProductivityScore', 'StressLevel']
Corresponding Original Top 3 Features: ['BurnoutLevel', 'ProductivityScore', 'StressLevel']

--- Minimal Burnout Model ---
Chosen Original Features: ['BurnoutLevel', 'ProductivityScore', 'StressLevel']
Accuracy on Test Set: 99.889%
Cross-Validation Accuracy: 99.80% (+/- 0.19%)
The selected top 3 features are likely  to be strong indicators of burnout risk based on Random Forest's conclusion of their predictive power.
Training a minimal model on these features helps understand the core drivers of burnout in the dataset and potentially uilding a simpler,more  interpretabble model
