In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import mutual_info_regression

# Load data
df = pd.read_csv(r"C:\Users\sreek\OneDrive\Desktop\ACM\mental_health_workplace_survey.csv")

# Display first few rows
print(df.head())

# Drop rows with missing target
df = df.dropna(subset=["StressLevel"])

# Drop rows with other missing values for simplicity
df = df.dropna()


   EmployeeID  Age      Gender    Country            JobRole Department  \
0        1001   50        Male         UK    Sales Associate         HR   
1        1002   36        Male    Germany  Software Engineer         IT   
2        1003   29  Non-binary      India           IT Admin         IT   
3        1004   42        Male  Australia      HR Specialist         IT   
4        1005   40        Male     Brazil   Customer Support    Support   

   YearsAtCompany  WorkHoursPerWeek RemoteWork  BurnoutLevel  ...  \
0              14                47         No          3.37  ...   
1               1                59     Hybrid          7.39  ...   
2              13                59     Hybrid          7.10  ...   
3              15                31        Yes          4.18  ...   
4               6                34        Yes          8.28  ...   

   CommuteTime  HasMentalHealthSupport  ManagerSupportScore  HasTherapyAccess  \
0          117                      No               

In [6]:
# Set target
target = "StressLevel"

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include='object').columns.tolist()
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove target from numerical list if it's there
if target in numerical_cols:
    numerical_cols.remove(target)


In [12]:
# Basic check for needed columns before interaction
required_cols = ['WorkHoursPerWeek', 'SleepHours', 'StressLevel']
for col in required_cols:
    if col not in df.columns:
        raise ValueError(f"Missing expected column: {col}")

# Create interaction features
df['Work_Sleep_Interaction'] = df['WorkHoursPerWeek'] * df['SleepHours']
df['Stress_Sleep'] = df['StressLevel'] * df['SleepHours']


In [13]:
# Add interaction features to numerical list
feature_candidates = numerical_cols + ['Work_Sleep_Interaction', 'Stress_Sleep']
X_temp = df[feature_candidates]
y = df[target]

# Compute mutual information scores
mi_scores = mutual_info_regression(X_temp, y)
mi_series = pd.Series(mi_scores, index=X_temp.columns).sort_values(ascending=False)

# Select top 5 features
top_features = mi_series.head(5).index.tolist()
X = df[top_features]


In [14]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [15]:
# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1)
}

# Store results
results = {}

# Train and evaluate
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}


In [16]:
# Print performance
for model, scores in results.items():
    print(f"{model}:")
    print(f"  MSE: {scores['MSE']:.4f}")
    print(f"  R²: {scores['R2']:.4f}\n")

# Determine best model
best_model = max(results.items(), key=lambda x: x[1]['R2'])
print(f"✅ Best Model: {best_model[0]} with R² = {best_model[1]['R2']:.4f}")


Linear Regression:
  MSE: 0.3109
  R²: 0.9531

Ridge Regression:
  MSE: 0.3110
  R²: 0.9530

Lasso Regression:
  MSE: 0.3573
  R²: 0.9460

✅ Best Model: Linear Regression with R² = 0.9531
