In [2]:
import pandas as pd
import seaborn as sns
import warnings
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, mutual_info_regression

warnings.filterwarnings("ignore")
sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)

file_path="./mental_health_workplace_survey.csv"
data=pd.read_csv(file_path)  

def remove_outliers_iqr(data, col):
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]

for col in ['WorkHoursPerWeek', 'SleepHours', 'StressLevel']:
    data = remove_outliers_iqr(data, col)

data['Stress_WorkHours'] = data['StressLevel'] * data['WorkHoursPerWeek']
data['Sleep_vs_Stress'] = data['SleepHours'] / (data['StressLevel'] + 1)

X = data.drop(["StressLevel","EmployeeID","BurnoutRisk"], axis=1)
y = data["StressLevel"]

X_train, X_valid, y_train, y_valid=train_test_split(X,y,test_size=0.2,random_state=0)
    
categorical_features = [col for col in X_train.columns if X_train[col].dtype == "object"]
for col in categorical_features:
    data[col].fillna(data[col].mode()[0], inplace=True)

numerical_features = [col for col in X_train.columns if col not in categorical_features]
for col in numerical_features:
    data[col].fillna(data[col].median(), inplace=True)

# Fill missing values (if not already done)
for col in categorical_features:
    data[col].fillna(data[col].mode()[0], inplace=True)

# Ordinal Encoding
ordinal_encoder = OrdinalEncoder()
X_train[categorical_features] = ordinal_encoder.fit_transform(X_train[categorical_features])
X_valid[categorical_features] = ordinal_encoder.transform(X_valid[categorical_features])

# Standardize the features cuz models like Ridge and Lasso Regression are sensitive to the size of the features.
#Helps in maintaining the contibution by each feature. Feature Scaling is not always required for Linear Regression models.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# Feature selection using Mutual Information
selector = SelectKBest(score_func=mutual_info_regression,k=24) #Choosing all 24 features cuz it's already a small enough dataset
selector.fit(X_train_scaled, y_train) #Decreasing it more will decrease the R²

selected_features_mask = selector.get_support()
selected_features = X_train.columns[selected_features_mask]

# Filter features
X_train_selected = X_train_scaled[:, selected_features_mask]
X_valid_selected = X_valid_scaled[:, selected_features_mask]

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso()
}

results = {}
for name, model in models.items():
    model.fit(X_train_selected, y_train) #X_train_scaled
    y_pred = model.predict(X_valid_selected) #X_valid_scaled
    mse = mean_squared_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)
    results[name] = {'MSE': mse, 'R²': r2}


print("Evaluation Results:")
results_data = pd.DataFrame(results).T
print(results_data)

best_model = results_data['R²'].idxmax()
print(f"Best Performing Model: {best_model}")
print(f"Reason: {best_model} achieved the highest R² is because the dataset isn't very high-dimensional leading to underfitting (Especially in lasso)")
print("Ridge and Linear are very close to each other, indicating the L2 penalty is very small.")

Evaluation Results:
                        MSE        R²
Linear Regression  0.204590  0.968755
Ridge Regression   0.204648  0.968746
Lasso Regression   1.941367  0.703512
Best Performing Model: Linear Regression
Reason: Linear Regression achieved the highest R² is because the dataset isn't very high-dimensional leading to underfitting (Especially in lasso)
Ridge and Linear are very close to each other, indicating the L2 penalty is very small.
