In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

import pandas as pd

# Load the dataset
data = pd.read_csv('WorkNMentalHealth.csv')

# Peek at the first few rows to understand the structure.
# NOTE: this is not the last expression of the cell, so Jupyter does not render it.
data.head()

# Drop the Employee_ID column since it's not relevant for prediction
data_clean = data.drop('Employee_ID', axis=1)

# Handle missing values (if any): mode for categorical columns, median for numeric ones.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# data_clean[column]: that chained in-place form triggers a FutureWarning in recent
# pandas and, under Copy-on-Write, leaves data_clean untouched.
categorical_columns = data_clean.select_dtypes(include=['object']).columns
numeric_columns = data_clean.select_dtypes(include=['number']).columns
for column in categorical_columns:
    data_clean[column] = data_clean[column].fillna(data_clean[column].mode()[0])
for column in numeric_columns:
    data_clean[column] = data_clean[column].fillna(data_clean[column].median())

# Encode categorical variables using LabelEncoder.
# Every object-dtype column is encoded, which includes the target 'Stress_Level'
# if it is stored as text. The fitted encoders are kept so codes can be mapped
# back to the original labels later via label_encoders[column].inverse_transform.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    data_clean[column] = le.fit_transform(data_clean[column])
    label_encoders[column] = le

# Separate features and target variable
X = data_clean.drop('Stress_Level', axis=1)
y = data_clean['Stress_Level']

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
classification_rep = classification_report(y_test, y_pred)

# Get feature importance, sorted from most to least important
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)


# Keep only the top 10 important features for simplicity
top_features = feature_importance_df.head(10)

# Show the classification report and top features.
# The report is a plain string, so it is printed; the DataFrame is left as the
# cell's last expression so Jupyter renders it as a table.
print(classification_rep)
top_features

Unnamed: 0,Feature,Importance
6,Hours_Worked_Per_Week,0.104873
0,Age,0.102625
4,Years_of_Experience,0.101018
7,Number_of_Virtual_Meetings,0.084885
3,Industry,0.063249
2,Job_Role,0.063068
17,Region,0.057619
14,Company_Support_for_Remote_Work,0.055194
12,Social_Isolation_Rating,0.052952
8,Work_Life_Balance_Rating,0.050956


In [14]:
# Second experiment: retrain the Random Forest on a reduced feature set.

# Remove the listed columns from the full feature matrix X.
X_reduced = X.drop(
    columns=[
        'Job_Role',
        'Industry',
        'Region',
        'Years_of_Experience',
        'Age',
        'Gender',
        'Number_of_Virtual_Meetings',
        'Company_Support_for_Remote_Work',
        'Productivity_Change',
    ]
)

# Same 80/20 split and seed as the earlier split, now on the reduced features.
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a fresh Random Forest on the reduced training data (fit returns the estimator).
rf_model_reduced = RandomForestClassifier(random_state=42).fit(
    X_train_reduced, y_train_reduced
)

# Predict on the reduced test set and build the text classification report.
y_pred_reduced = rf_model_reduced.predict(X_test_reduced)
classification_rep_reduced = classification_report(y_test_reduced, y_pred_reduced)

# Pair each remaining feature with its importance and rank in descending order.
feature_importances_reduced = rf_model_reduced.feature_importances_
feature_importance_df_reduced = (
    pd.DataFrame(
        {
            'Feature': X_reduced.columns,
            'Importance': feature_importances_reduced,
        }
    )
    .sort_values(by='Importance', ascending=False)
)

# Cell output: the (up to) 10 highest-ranked features of the reduced model.
feature_importance_df_reduced.head(10)


Unnamed: 0,Feature,Importance
1,Hours_Worked_Per_Week,0.34158
5,Social_Isolation_Rating,0.127975
2,Work_Life_Balance_Rating,0.112315
0,Work_Location,0.085844
8,Sleep_Quality,0.081028
6,Satisfaction_with_Remote_Work,0.07796
3,Mental_Health_Condition,0.07669
4,Access_to_Mental_Health_Resources,0.050987
7,Physical_Activity,0.045621


In [16]:
from xgboost import XGBClassifier

# Configure the XGBoost classifier: 100 boosting rounds, fixed seed,
# multiclass log-loss as the evaluation metric.
xgb_model_optimized = XGBClassifier(
    n_estimators=100,
    use_label_encoder=False,
    random_state=42,
    eval_metric="mlogloss",
)

# Fit on the reduced-feature training split produced earlier in the notebook.
xgb_model_optimized.fit(X_train_reduced, y_train_reduced)

# Pair each reduced feature with its XGBoost importance and rank in descending order.
xgb_feature_importances_optimized = xgb_model_optimized.feature_importances_
xgb_feature_importance_df_optimized = (
    pd.DataFrame(
        {
            'Feature': X_reduced.columns,
            'Importance': xgb_feature_importances_optimized,
        }
    )
    .sort_values(by='Importance', ascending=False)
)

# Cell output: the (up to) 10 highest-ranked features according to XGBoost.
xgb_feature_importance_df_optimized.head(10)



Unnamed: 0,Feature,Importance
8,Sleep_Quality,0.116019
1,Hours_Worked_Per_Week,0.113813
7,Physical_Activity,0.111718
0,Work_Location,0.111501
4,Access_to_Mental_Health_Resources,0.111265
3,Mental_Health_Condition,0.110862
6,Satisfaction_with_Remote_Work,0.109522
2,Work_Life_Balance_Rating,0.108102
5,Social_Isolation_Rating,0.107198
