***EMPLOYEE BURNOUT ANALYSIS AND PREDICTION USING REGRESSION TECHNIQUES***

In [None]:
# Colab-only import: gives the notebook runtime access to Google Drive.
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path of the Excel dataset inside the mounted Google Drive.
dataset_path = '/content/drive/MyDrive/data/employee_burnout_analysis.xlsx'

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


import warnings
# Ignore every warning category for the rest of the session (presumably to keep cell output uncluttered).
# NOTE(review): this also hides deprecation and convergence warnings; consider narrowing the filter.
warnings.filterwarnings(action='ignore')


In [None]:

# Load the Excel file at dataset_path into a pandas DataFrame.
data = pd.read_excel(dataset_path)

In [None]:
# Display the loaded DataFrame as the cell output.
data


Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1,1.0,2.6,0.20
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3,7.0,6.9,0.52
...,...,...,...,...,...,...,...,...,...
22745,fffe31003500370039003100,2008-12-30,Female,Service,No,1,3.0,,0.41
22746,fffe33003000350031003800,2008-01-19,Female,Product,Yes,3,6.0,6.7,0.59
22747,fffe390032003000,2008-11-05,Male,Service,Yes,3,7.0,,0.72
22748,fffe33003300320036003900,2008-01-10,Female,Service,No,2,5.0,5.9,0.52


In [None]:
# Print column names, non-null counts and dtypes to see which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Employee ID           22750 non-null  object        
 1   Date of Joining       22750 non-null  datetime64[ns]
 2   Gender                22750 non-null  object        
 3   Company Type          22750 non-null  object        
 4   WFH Setup Available   22750 non-null  object        
 5   Designation           22750 non-null  int64         
 6   Resource Allocation   21369 non-null  float64       
 7   Mental Fatigue Score  20633 non-null  float64       
 8   Burn Rate             21626 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 1.6+ MB


***PREPROCESSING DATA***

In [None]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Turn the raw burnout table into scaled train/test features and targets.

    Steps, in order:
      1. Drop the 'Employee ID' identifier column.
      2. Drop rows whose 'Burn Rate' target is missing and reset the index.
      3. Replace 'Date of Joining' with numeric 'Join Month' and 'Join Day'.
      4. Encode the three two-valued categorical columns as 0/1.
      5. Split into train and test sets.
      6. Fill missing 'Resource Allocation' / 'Mental Fatigue Score' values
         with the means of the *training* split.
      7. Standardize the features with a scaler fitted on the training split.

    Imputation and scaling statistics are both learned from the training
    split only, so no information from the test rows leaks into training.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with the columns 'Employee ID', 'Date of Joining', 'Gender',
        'Company Type', 'WFH Setup Available', 'Resource Allocation',
        'Mental Fatigue Score' and 'Burn Rate'. The input is not modified.
    train_size : float, default 0.7
        Fraction of rows assigned to the training split.
    random_state : int, default 1
        Seed for the shuffled train/test split.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized feature matrices that keep the row index of the split.
    y_train, y_test : pandas.Series
        The matching 'Burn Rate' targets.
    """
    # `drop` returns a new frame, so the caller's DataFrame is left untouched.
    df = df.drop(columns='Employee ID')

    # Rows without a target cannot be used for supervised training or scoring.
    df = df.dropna(subset=['Burn Rate']).reset_index(drop=True)

    # Extract date features with the vectorized .dt accessor.
    joining_date = pd.to_datetime(df['Date of Joining'])
    df['Join Month'] = joining_date.dt.month
    df['Join Day'] = joining_date.dt.day
    df = df.drop(columns='Date of Joining')

    # Binary encoding of the two-valued categorical columns.
    binary_encodings = {
        'Gender': {'Female': 0, 'Male': 1},
        'Company Type': {'Product': 0, 'Service': 1},
        'WFH Setup Available': {'No': 0, 'Yes': 1},
    }
    for column, mapping in binary_encodings.items():
        df[column] = df[column].replace(mapping)

    # Split df into X and y
    y = df['Burn Rate']
    X = df.drop(columns='Burn Rate')

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Impute AFTER the split, using training means for both splits, so the
    # fill values are not influenced by test rows.
    impute_columns = ['Resource Allocation', 'Mental Fatigue Score']
    train_means = X_train[impute_columns].mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Scale X with statistics learned from the training split only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test


In [None]:
# Run the preprocessing pipeline on the raw data to obtain the scaled train/test features and targets.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Display the standardized training features.
X_train

Unnamed: 0,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Join Month,Join Day
8275,-0.954022,-1.379211,-1.087295,0.725025,0.768001,0.475128,0.433442,-0.649693
21284,1.048194,0.725052,-1.087295,1.604608,1.270205,1.131455,1.596251,-0.536187
16802,1.048194,0.725052,-1.087295,-0.154557,0.768001,0.420434,1.305549,0.371860
3271,1.048194,-1.379211,-1.087295,1.604608,2.274612,1.733089,0.142739,1.620424
5302,-0.954022,-1.379211,-1.087295,-0.154557,-0.236406,0.475128,0.724144,-0.422682
...,...,...,...,...,...,...,...,...
10955,-0.954022,0.725052,-1.087295,-0.154557,0.768001,0.803292,-1.020070,-1.444234
17289,-0.954022,0.725052,0.919713,0.725025,-0.236406,-0.509363,-0.147963,0.712377
5192,-0.954022,0.725052,0.919713,0.725025,0.265797,-1.165690,1.014847,0.031342
12172,1.048194,-1.379211,0.919713,-1.913723,-1.743017,-1.220384,0.433442,-1.671246


In [None]:
# Display the training targets (the 'Burn Rate' column).
y_train

8275     0.61
21284    0.81
16802    0.62
3271     0.73
5302     0.43
         ... 
10955    0.58
17289    0.39
5192     0.24
12172    0.18
235      0.00
Name: Burn Rate, Length: 15138, dtype: float64

***USING LINEAR REGRESSION MODEL***

In [None]:
# Fit a linear regression baseline on the training split.
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

In [None]:
# Evaluate the fitted linear model on the held-out test split using mean squared error.
y_pred = reg_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.005078047784380777


In [None]:
# Display the linear model's predictions for the test split.
y_pred

array([0.15725658, 0.52729648, 0.42790565, ..., 0.4167308 , 0.89698611,
       0.57097392])

In [None]:
# Display the fitted intercept of the linear model.
reg_model.intercept_

0.45177632448143745

In [None]:
# Evaluate the model on the test split (mean squared error).
# NOTE(review): this repeats the evaluation performed right after fitting and prints the same value;
# consider removing one of the two cells.
y_pred = reg_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.005078047784380777


In [None]:
# Inspect the fitted linear model: one coefficient per (standardized) feature
# column of X_train, plus the intercept.
coefficients, intercept = reg_model.coef_, reg_model.intercept_
print('Coefficients:', coefficients)
print('Intercept:', intercept)

Coefficients: [ 0.00326134  0.00034996 -0.00817894  0.01000201  0.06284658  0.12218851
 -0.00034051 -0.00070782]
Intercept: 0.45177632448143745


***USING DIFFERENT REGRESSION TECHNIQUES***

In [None]:
# Shared seed (same value as the train/test split) so that every stochastic
# estimator gives the same scores on Restart & Run All.
RANDOM_STATE = 1

# Candidate regressors. Keys are left-padded to a common width so that the
# per-model log lines printed below line up.
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    # The default alpha=1.0 is far too strong an L1 penalty for this
    # small-valued target: the recorded test R^2 was about 0, i.e. no better
    # than predicting the mean. A much smaller alpha keeps the model useful.
    " Linear Regression (L1 Regularization)": Lasso(alpha=0.001),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(random_state=RANDOM_STATE),
    "Support Vector Machine (Linear Kernel)": LinearSVR(random_state=RANDOM_STATE),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "                         Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "                               XGBoost": XGBRegressor(random_state=RANDOM_STATE),
    "                              LightGBM": LGBMRegressor(random_state=RANDOM_STATE),
}

# Fit every candidate on the same training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
                              LightGBM trained.


In [None]:
# Report each fitted model's R^2 on the held-out test split, to five decimals.
for name, model in models.items():
    print(f"{name} R^2 Score: {model.score(X_test, y_test):.5f}")

                     Linear Regression R^2 Score: 0.87075
 Linear Regression (L2 Regularization) R^2 Score: 0.87075
 Linear Regression (L1 Regularization) R^2 Score: -0.00001
                   K-Nearest Neighbors R^2 Score: 0.85603
                        Neural Network R^2 Score: 0.87250
Support Vector Machine (Linear Kernel) R^2 Score: 0.86812
   Support Vector Machine (RBF Kernel) R^2 Score: 0.88430
                         Decision Tree R^2 Score: 0.81684
                         Random Forest R^2 Score: 0.89772
                     Gradient Boosting R^2 Score: 0.90257
                               XGBoost R^2 Score: 0.90310
                              LightGBM R^2 Score: 0.90912
