In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

import pandas as pd

# Load the dataset
data = pd.read_csv('WorkNMentalHealth.csv')

# Display the first few rows of the dataset to understand its structure
data.head()

# Drop the Employee_ID column since it's not relevant for prediction
data_clean = data.drop('Employee_ID', axis=1)

# Handle missing values (if any) by filling with mode for categorical and median for numeric
for column in data_clean.select_dtypes(include=['object']).columns:
    data_clean[column].fillna(data_clean[column].mode()[0], inplace=True)
for column in data_clean.select_dtypes(include=['number']).columns:
    data_clean[column].fillna(data_clean[column].median(), inplace=True)

# Encode categorical variables using LabelEncoder
label_encoders = {}
for column in data_clean.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data_clean[column] = le.fit_transform(data_clean[column])
    label_encoders[column] = le

# Separate features and target variable
X = data_clean.drop('Stress_Level', axis=1)
y = data_clean['Stress_Level']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
classification_rep = classification_report(y_test, y_pred)

# Get feature importance
feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)


# Displaying the top 10 important features for simplicity
top_features = feature_importance_df.head(10)

# Show the classification report and top features
top_features

Unnamed: 0,Feature,Importance
6,Hours_Worked_Per_Week,0.104873
0,Age,0.102625
4,Years_of_Experience,0.101018
7,Number_of_Virtual_Meetings,0.084885
3,Industry,0.063249
2,Job_Role,0.063068
17,Region,0.057619
14,Company_Support_for_Remote_Work,0.055194
12,Social_Isolation_Rating,0.052952
8,Work_Life_Balance_Rating,0.050956


In [4]:
# Dropping the specified columns and retraining the model

X_reduced = X.drop(['Job_Role', 'Industry', 'Region', 'Years_of_Experience', 'Age', 'Gender', 'Number_of_Virtual_Meetings', 'Company_Support_for_Remote_Work', 'Productivity_Change',], axis=1)

# Splitting the data again with the reduced feature set
X_train_reduced, X_test_reduced, y_train_reduced, y_test_reduced = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

# Re-initializing and training the Random Forest classifier
rf_model_reduced = RandomForestClassifier(random_state=42)
rf_model_reduced.fit(X_train_reduced, y_train_reduced)

# Making predictions with the reduced feature set
y_pred_reduced = rf_model_reduced.predict(X_test_reduced)

# Evaluating the model with the reduced feature set
classification_rep_reduced = classification_report(y_test_reduced, y_pred_reduced)

# Getting feature importance for the reduced model
feature_importances_reduced = rf_model_reduced.feature_importances_
feature_importance_df_reduced = pd.DataFrame({'Feature': X_reduced.columns, 'Importance': feature_importances_reduced})
feature_importance_df_reduced = feature_importance_df_reduced.sort_values(by='Importance', ascending=False)

# Display the top 10 important features from the reduced model
feature_importance_df_reduced.head(10)


Unnamed: 0,Feature,Importance
1,Hours_Worked_Per_Week,0.34158
5,Social_Isolation_Rating,0.127975
2,Work_Life_Balance_Rating,0.112315
0,Work_Location,0.085844
8,Sleep_Quality,0.081028
6,Satisfaction_with_Remote_Work,0.07796
3,Mental_Health_Condition,0.07669
4,Access_to_Mental_Health_Resources,0.050987
7,Physical_Activity,0.045621


In [6]:
from xgboost import XGBClassifier
import warnings

# Adjust XGBoost settings to limit boosting rounds for faster computation
xgb_model_optimized = XGBClassifier(n_estimators=100, use_label_encoder=False, random_state=42, eval_metric="mlogloss")

# Train the optimized XGBoost model
xgb_model_optimized.fit(X_train_reduced, y_train_reduced)

# Get feature importance for the optimized XGBoost model
xgb_feature_importances_optimized = xgb_model_optimized.feature_importances_
xgb_feature_importance_df_optimized = pd.DataFrame({'Feature': X_reduced.columns, 'Importance': xgb_feature_importances_optimized})
xgb_feature_importance_df_optimized = xgb_feature_importance_df_optimized.sort_values(by='Importance', ascending=False)

# Display the top 10 important features for XGBoost
xgb_feature_importance_df_optimized.head(10)


Unnamed: 0,Feature,Importance
8,Sleep_Quality,0.116019
1,Hours_Worked_Per_Week,0.113813
7,Physical_Activity,0.111718
0,Work_Location,0.111501
4,Access_to_Mental_Health_Resources,0.111265
3,Mental_Health_Condition,0.110862
6,Satisfaction_with_Remote_Work,0.109522
2,Work_Life_Balance_Rating,0.108102
5,Social_Isolation_Rating,0.107198


In [1]:
# Updated feature importance values with additional features
expanded_feature_importances = {
    'Hours_Worked_Per_Day': 0.341580,
    'Sleep_Quality': 0.081028,
    'Social_Isolation_Rating': 0.127975,
    'Physical_Activity': 0.045621,
    'Satisfaction_with_Remote_Work': 0.077960,
    'Mental_Health_Condition': 0.076690
}

# Total importance of the updated features
expanded_total_importance = sum(expanded_feature_importances.values())

# Function to calculate stress level based on expanded features
def calculate_expanded_stress_level(feature_values):
    stress_level = 0
    for feature, importance in expanded_feature_importances.items():
        stress_level += feature_values[feature] * (importance / expanded_total_importance)
    return stress_level

# Simulated user inputs for the expanded set of features
simulated_feature_values_expanded = {
    'Hours_Worked_Per_Day': 8.0,                 # User worked 8 hours today
    'Sleep_Quality': 6.0,                        # User rates their sleep as 6/10
    'Social_Isolation_Rating': 4.0,              # User feels moderately isolated
    'Physical_Activity': 7.0,                    # User engaged in physical activity
    'Satisfaction_with_Remote_Work': 5.0,        # User is somewhat satisfied with remote work
    'Mental_Health_Condition': 2.0               # User feels minor mental health issues
}

# Calculate the stress level using the expanded feature set
expanded_stress_level = calculate_expanded_stress_level(simulated_feature_values_expanded)

# Display the estimated stress level for the expanded model
expanded_stress_level

6.11734771340367

In [2]:
# Function to update weights based on feedback using proportional adjustment
def update_weights(feature_values, estimated_stress, actual_stress, feature_importances, total_importance):
    error = actual_stress - estimated_stress
    learning_rate = 0.1  # Determines how aggressively the weights are updated

    # Update each feature's importance based on its contribution to the error
    updated_importances = {}
    for feature, importance in feature_importances.items():
        contribution = feature_values[feature] * (importance / total_importance)
        updated_importance = importance + (learning_rate * error * contribution)
        updated_importances[feature] = max(updated_importance, 0)  # Ensure non-negative importance

    # Normalize the updated importances so that they sum to the same total importance
    updated_total_importance = sum(updated_importances.values())
    normalized_importances = {feature: imp * (total_importance / updated_total_importance) for feature, imp in updated_importances.items()}

    return normalized_importances

# Simulated feature values for Day 1
feature_values_day1 = {
    'Hours_Worked_Per_Day': 8.0,
    'Sleep_Quality': 6.0,
    'Social_Isolation_Rating': 4.0,
    'Physical_Activity': 7.0,
    'Satisfaction_with_Remote_Work': 5.0,
    'Mental_Health_Condition': 2.0
}

# Initial calculation of stress level for Day 1
estimated_stress_day1 = calculate_expanded_stress_level(feature_values_day1)

# Simulated user feedback (actual stress level for Day 1)
actual_stress_day1 = 7.0  # User felt more stressed than the estimated value

# Update the feature importance based on feedback
updated_feature_importances = update_weights(
    feature_values_day1,
    estimated_stress_day1,
    actual_stress_day1,
    expanded_feature_importances,
    expanded_total_importance
)

# Calculate stress for Day 2 with updated weights (simulate new day with same feature values)
expanded_total_importance_day2 = sum(updated_feature_importances.values())
estimated_stress_day2 = calculate_expanded_stress_level(feature_values_day1)

# Display the updated weights and stress level for Day 2
updated_feature_importances, estimated_stress_day2

({'Hours_Worked_Per_Day': 0.3855536183655537,
  'Sleep_Quality': 0.08037781132061808,
  'Social_Isolation_Rating': 0.1094462135854744,
  'Physical_Activity': 0.04837449380658646,
  'Satisfaction_with_Remote_Work': 0.072003518082759,
  'Mental_Health_Condition': 0.055098344839008495},
 6.11734771340367)

In [3]:
import numpy as np

# Define a simplified Kalman Filter class for stress level estimation
class KalmanFilterStress:
    def __init__(self, initial_stress, process_variance, measurement_variance, estimated_error):
        self.stress_level = initial_stress  # Initial predicted stress level
        self.process_variance = process_variance  # Variance of the stress process (prediction uncertainty)
        self.measurement_variance = measurement_variance  # Variance of the measurements (observation uncertainty)
        self.estimated_error = estimated_error  # Initial estimation error
        self.kalman_gain = 0  # Initial Kalman gain
    
    def predict(self, control_input, control_factor=1.0):
        """Predict the next stress level based on control input (feature values) and a control factor."""
        self.stress_level += control_factor * control_input
        self.estimated_error += self.process_variance  # Increase the error due to the uncertainty in the process
        
    def correct(self, measurement):
        """Update the stress level estimate based on a new measurement."""
        self.kalman_gain = self.estimated_error / (self.estimated_error + self.measurement_variance)
        self.stress_level = self.stress_level + self.kalman_gain * (measurement - self.stress_level)
        self.estimated_error = (1 - self.kalman_gain) * self.estimated_error  # Update the error estimate

# Initialize the Kalman Filter with some example values
kf = KalmanFilterStress(
    initial_stress=5.0,  # Start with a moderate stress level of 5
    process_variance=1.0,  # Variance in the prediction process
    measurement_variance=2.0,  # Variance in the measurement (observation)
    estimated_error=1.0  # Initial estimate error
)

# Simulated feature inputs for the current day (for the prediction phase)
feature_input = {
    'Hours_Worked_Per_Day': 9.0,      # User worked 9 hours today
    'Sleep_Quality': 5.0,             # User rates their sleep as 5/10
    'Social_Isolation_Rating': 6.0,    # User rates their isolation as 6/10
    'Physical_Activity': 4.0,         # User was mildly active
    'Satisfaction_with_Remote_Work': 7.0,  # Satisfied with work environment
    'Mental_Health_Condition': 2.0    # Feels slightly anxious today
}

# Simplified control input as a sum of weighted feature values (weighing according to importance)
control_input_value = sum([
    feature_input['Hours_Worked_Per_Day'] * 0.341580,
    feature_input['Sleep_Quality'] * 0.081028,
    feature_input['Social_Isolation_Rating'] * 0.127975,
    feature_input['Physical_Activity'] * 0.045621,
    feature_input['Satisfaction_with_Remote_Work'] * 0.077960,
    feature_input['Mental_Health_Condition'] * 0.076690
])

# Predict the next stress level based on today's feature inputs
kf.predict(control_input=control_input_value)

# Simulated "measurement" or observable behaviour (e.g., productivity or physiological sensor data)
observed_stress_measurement = 6.5  # Example observed stress level from other indicators

# Correct the predicted stress level using the observation
kf.correct(observed_stress_measurement)

# Output the updated stress level after prediction and correction
kf.stress_level

8.314397

In [4]:
# Import necessary libraries
import numpy as np

# Feature importance values (same as before)
feature_importances_ema = {
    'Hours_Worked_Per_Day': 0.341580,
    'Sleep_Quality': 0.081028,
    'Social_Isolation_Rating': 0.127975,
    'Physical_Activity': 0.045621,
    'Satisfaction_with_Remote_Work': 0.077960,
    'Mental_Health_Condition': 0.076690
}

# Total importance of the selected features
total_importance_ema = sum(feature_importances_ema.values())

# Function to calculate stress level for a single day based on current feature values
def calculate_daily_stress_level(feature_values, feature_importances, total_importance):
    stress_level = 0
    for feature, importance in feature_importances.items():
        stress_level += feature_values[feature] * (importance / total_importance)
    return stress_level

# Exponential Moving Average (EMA) based stress prediction
def ema_stress_prediction(current_features, previous_stress, alpha=0.3):
    # Calculate the current day's stress based on feature values
    current_stress = calculate_daily_stress_level(current_features, feature_importances_ema, total_importance_ema)
    
    # Apply EMA formula: S_t = α * X_t + (1 - α) * S_{t-1}
    updated_stress = alpha * current_stress + (1 - alpha) * previous_stress
    return updated_stress

# Simulated feature values for three days (day 1, day 2, day 3)
day_1_features = {
    'Hours_Worked_Per_Day': 8.0,
    'Sleep_Quality': 6.0,
    'Social_Isolation_Rating': 4.0,
    'Physical_Activity': 7.0,
    'Satisfaction_with_Remote_Work': 5.0,
    'Mental_Health_Condition': 2.0
}

day_2_features = {
    'Hours_Worked_Per_Day': 9.0,
    'Sleep_Quality': 5.0,
    'Social_Isolation_Rating': 6.0,
    'Physical_Activity': 6.0,
    'Satisfaction_with_Remote_Work': 6.0,
    'Mental_Health_Condition': 3.0
}

day_3_features = {
    'Hours_Worked_Per_Day': 7.0,
    'Sleep_Quality': 7.0,
    'Social_Isolation_Rating': 3.0,
    'Physical_Activity': 8.0,
    'Satisfaction_with_Remote_Work': 4.0,
    'Mental_Health_Condition': 1.0
}

# Initial stress level (assume initial value)
initial_stress = 5.0

# Calculate stress for Day 1 using EMA
stress_day_1 = ema_stress_prediction(day_1_features, initial_stress)
# Calculate stress for Day 2 using EMA (with Day 1's stress as previous stress)
stress_day_2 = ema_stress_prediction(day_2_features, stress_day_1)
# Calculate stress for Day 3 using EMA (with Day 2's stress as previous stress)
stress_day_3 = ema_stress_prediction(day_3_features, stress_day_2)

# Display the estimated stress levels for each day
stress_day_1, stress_day_2, stress_day_3


(5.335204314021101, 5.819775149363258, 5.7102508809968375)