In [1]:
from google.colab import files
uploaded = files.upload()


Saving mental_health_workplace_survey.csv to mental_health_workplace_survey.csv


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import mutual_info_regression


In [4]:
df = pd.read_csv("mental_health_workplace_survey.csv")
df.head()


Unnamed: 0,EmployeeID,Age,Gender,Country,JobRole,Department,YearsAtCompany,WorkHoursPerWeek,RemoteWork,BurnoutLevel,...,CommuteTime,HasMentalHealthSupport,ManagerSupportScore,HasTherapyAccess,MentalHealthDaysOff,SalaryRange,WorkLifeBalanceScore,TeamSize,CareerGrowthScore,BurnoutRisk
0,1001,50,Male,UK,Sales Associate,HR,14,47,No,3.37,...,117,No,3.15,Yes,8,40K-60K,8.82,6,9.2,0
1,1002,36,Male,Germany,Software Engineer,IT,1,59,Hybrid,7.39,...,8,Yes,4.4,Yes,4,80K-100K,2.8,45,8.46,1
2,1003,29,Non-binary,India,IT Admin,IT,13,59,Hybrid,7.1,...,75,No,3.63,No,6,80K-100K,7.28,7,7.96,1
3,1004,42,Male,Australia,HR Specialist,IT,15,31,Yes,4.18,...,43,Yes,4.5,Yes,9,60K-80K,1.31,11,8.9,0
4,1005,40,Male,Brazil,Customer Support,Support,6,34,Yes,8.28,...,58,Yes,5.51,Yes,6,<40K,1.17,18,8.88,1


In [8]:
ordinal_cols = ['JobSatisfaction', 'WorkLifeBalanceScore', 'CareerGrowthScore']
encoder = OrdinalEncoder()
df[ordinal_cols] = encoder.fit_transform(df[ordinal_cols])



In [9]:
nominal_cols = [
    'Gender', 'Country', 'JobRole', 'Department',
    'RemoteWork', 'HasMentalHealthSupport', 'HasTherapyAccess', 'SalaryRange'
]

df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)


In [10]:
# Exclude target
numerical_cols = df.select_dtypes(include=np.number).columns.drop('StressLevel')

scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


In [11]:
# Correlation with target
corr = df.corr()['StressLevel'].abs().sort_values(ascending=False)
top_corr = corr[corr > 0.1].index.tolist()

# Mutual Information
X_all = df.drop('StressLevel', axis=1)
y = df['StressLevel']

mi = mutual_info_regression(X_all, y)
mi_series = pd.Series(mi, index=X_all.columns).sort_values(ascending=False)
top_mi = mi_series[mi_series > 0.01].index.tolist()

# Final selected features
selected_features = list(set(top_corr + top_mi))


In [12]:
# Interaction: StressLevel × WorkHoursPerWeek and × SleepHours
df['stress_work'] = df['StressLevel'] * df['WorkHoursPerWeek']
df['stress_sleep'] = df['StressLevel'] * df['SleepHours']

# Add to features
selected_features += ['stress_work', 'stress_sleep']


In [13]:
X = df[selected_features]
y = df['StressLevel']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
models = {
    'Linear': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1)
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}


In [15]:
for model, scores in results.items():
    print(f"{model} Regression:")
    print(f"   MSE: {scores['MSE']:.4f}")
    print(f"   R² : {scores['R2']:.4f}\n")

best = max(results, key=lambda x: results[x]['R2'])
print(f"✅ Best Model: {best} Regression")


Linear Regression:
   MSE: 0.0000
   R² : 1.0000

Ridge Regression:
   MSE: 0.0000
   R² : 1.0000

Lasso Regression:
   MSE: 0.0014
   R² : 0.9998

✅ Best Model: Linear Regression
