<a href="https://colab.research.google.com/github/Praneeth-gamidi/Praneeth-gamidi/blob/main/EMPLOYEE_BURNOUT_PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Employee Burnout Prediction

In [None]:
#importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import pickle as pickle
import os


Loading Dataset

In [None]:
# Location of the raw workbook (an absolute path under /content); adjust it
# when the file lives somewhere else.
raw_data_path = '/content/employee_burnout_analysis-AI (1).xlsx'
data = pd.read_excel(raw_data_path)

In [None]:
# Peek at the first four rows to check that the sheet was parsed as expected.
data.head(4)


In [None]:
# Column names, dtypes and non-null counts of the raw frame.
data.info()


In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Total number of missing cells across the whole frame.
data.isnull().sum().values.sum()

In [None]:
# Last four rows, to compare against the head of the sheet.
data.tail(4)

In [None]:
# Summary statistics (describe() defaults to the numeric columns when any exist).
data.describe()

In [None]:
# Column names as a plain Python list.
data.columns.tolist()


# NUMBER OF CATEGORIES IN EACH COLUMN

In [None]:
# Count of distinct non-null values in each column.
data.nunique()

INFORMATION OF DATA

In [None]:
# Same summary as the earlier info() call; no cell in between modifies `data`.
data.info()

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [None]:
# (rows, columns) remaining after the incomplete rows were dropped.
data.shape

In [None]:
# Element-wise missing-value mask; after the dropna above every entry is False.
data.isnull()

In [None]:
# Total missing cells left in the frame (0 once dropna has been applied).
data.isnull().sum().values.sum()

**EXPLORATORY DATA ANALYSIS**

There are NaN values in our target ("Burn Rate") and also in the Resource Allocation and Mental Fatigue Score columns. Since we are going to perform supervised linear regression, the target variable is required. Therefore, the 1124 rows with NaN values must be dropped from our dataframe.

In [None]:
# Correlation of every numeric feature with the target. The target's own entry
# (always 1.0) is removed by label rather than by position, so the result does
# not depend on 'Burn Rate' being the last numeric column.
data.corr(numeric_only=True)['Burn Rate'].drop('Burn Rate')

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for the numeric columns of the cleaned frame.
pair_grid = sns.pairplot(data=data)
plt.show()

In [None]:
# Repeats the dropna from the earlier cell; when the notebook is run top to
# bottom no rows with missing values remain here, so this changes nothing.
data = data.dropna()

In [None]:
# Shape after the repeated dropna (same as the shape shown after the first one).
data.shape

In [None]:
# Column dtypes before any column is dropped or encoded.
data.dtypes

In [None]:
# Remove the 'Employee ID' column from the working frame.
data = data.drop(columns=['Employee ID'])

CHECKING THE CORRELATION OF DATE OF JOINING AND TARGET VARIABLE.

In [None]:
# Preview the frame after 'Employee ID' was removed.
data.head(5)

In [None]:
# Missing values per column (isna is an alias of isnull).
data.isna().sum()

In [None]:
# Distinct values in each remaining column.
data.nunique()

In [None]:
# Display the frame itself; pandas truncates the view for long frames.
data

In [None]:
# First five rows (the default for head()).
data.head()

In [None]:
# Check the dtype of 'Date of Joining' before it is converted in the next cell.
data.dtypes


In [None]:
# Work on a copy so the 'Date of Joining' column of `data` is left untouched.
data_month = data.copy()
data_month["Date of Joining"] = data_month['Date of Joining'].astype("datetime64[ns]")
joining_dates = data_month["Date of Joining"]

# Range of joining dates, taken from the converted column so both bounds are
# real timestamps.
print(f"Min date: {joining_dates.min()}")
print(f"Max date: {joining_dates.max()}")

# Number of employees hired in each calendar month (1-12).
hires_per_month = joining_dates.groupby(joining_dates.dt.month).count()
hires_per_month.plot(
    kind="bar",
    xlabel='Month',
    ylabel='Hired employees',
    title='Hired employees per month',
);  # trailing semicolon keeps the Axes repr out of the cell output

In [None]:
# Express the joining date as the number of days elapsed since 1 Jan 2008 so it
# can take part in a numeric correlation. A scalar Timestamp broadcasts over
# the whole column, so no per-row array of reference dates has to be built.
data_2008 = pd.Timestamp("2008-01-01")
data["Days"] = data['Date of Joining'].astype("datetime64[ns]").sub(data_2008).dt.days
data.Days

In [None]:
# Correlation of each numeric column (now including "Days") with the target.
numeric_data = data.select_dtypes(include='number')
correlation = numeric_data.corr().loc[:, 'Burn Rate']
print(correlation)

In [None]:
# Same correlation column, shown as rich cell output (target's own entry included).
data.corr(numeric_only=True)['Burn Rate']

In [None]:
# Remove the joining date and the 'Days' feature derived from it.
data = data.drop(columns=['Date of Joining', 'Days'])

In [None]:
# Preview the frame after the date-derived columns were dropped.
data.head()

NOW ANALYZING THE CATEGORICAL VALUES

In [None]:
# See which remaining columns are object-typed; those are plotted as categorical below.
data.dtypes

In [None]:
# One count plot per categorical (object-dtype) column, sharing the y-axis.
# seaborn and matplotlib are already imported in the notebook's first cell.
cat_columns = data.select_dtypes(object).columns

# squeeze=False makes subplots() always return a 2-D array of Axes; taking its
# single row gives a 1-D array that can be indexed even when only one
# categorical column is present.
fig, axes_grid = plt.subplots(
    nrows=1, ncols=len(cat_columns), sharey=True, figsize=(10, 5), squeeze=False
)
ax = axes_grid[0]

for i, c in enumerate(cat_columns):
    sns.countplot(x=c, data=data, ax=ax[i])

plt.show()


ONE-HOT ENCODING FOR CATEGORICAL FEATURES

In [None]:
# One-hot encode the categorical features. drop_first=True omits the first
# level of each feature, so a k-level feature becomes k-1 dummy columns.
categorical_features = ['Company Type', 'WFH Setup Available', 'Gender']
missing_features = [col for col in categorical_features if col not in data.columns]

if not missing_features:
    data = pd.get_dummies(data, columns=categorical_features, drop_first=True)
    encoded_columns = data.columns
else:
    # Reached, for example, when this cell is re-run after the encoding has
    # already replaced the original columns with their dummies.
    print("Error:One or more of the specified columns are not present in the DataFrame.")
    print(f"Missing columns: {missing_features}")
    print(data.columns)

**Preprocessing**

In [None]:
# Separate the regression target from the feature matrix.
target_column = 'Burn Rate'
X = data.drop(columns=[target_column])
y = data[target_column]

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, shuffle=True, random_state=1
)

# Standardise the features with statistics learned from the training split
# only, then rebuild DataFrames so index and column names are preserved.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    scaler.transform(X_train), index=X_train.index, columns=X_train.columns
)
X_test = pd.DataFrame(
    scaler.transform(X_test), index=X_test.index, columns=X_test.columns
)

In [None]:
# Persist the fitted scaler with pickle. `os` and `pickle` are already
# imported in the notebook's first cell.
scaler_filename = '../models/scaler.pkl'

# Make sure the target directory exists before writing into it.
models_dir = os.path.dirname(scaler_filename)
os.makedirs(models_dir, exist_ok=True)

with open(scaler_filename, mode='wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Scaled training features (a DataFrame that keeps the original index and column names).
X_train

In [None]:
# Training target values; the scaler was applied to the features only.
y_train

In [None]:
# Write the processed training split to CSV files. `os` is already imported
# in the notebook's first cell.
path = '../data/processed/'
os.makedirs(path, exist_ok=True)  # no error if the directory already exists

# index=False: the row labels are not written to the files.
for frame, file_name in (
    (X_train, 'X_train_processed.csv'),
    (y_train, 'y_train_processed.csv'),
):
    frame.to_csv(path + file_name, index=False)

**MODELLING**

**Linear Regression**

In [None]:
# Create the linear regression estimator and fit it on the scaled training
# split. fit() returns the estimator itself, so it can be bound in one step.
linear_regression_model = LinearRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
linear_regression_model

In [None]:
# Linear Regression model performance on the held-out test split.

print("Linear Regression Model Performance Metrics:\n")
# Predict the target for the test features.
y_pred = linear_regression_model.predict(X_test)

# Mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Root mean squared error, derived from the MSE. The `squared=False` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root directly works on old and new versions alike.
rmse = np.sqrt(mse)
print("Root Mean Squared Error:", rmse)

# Mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Coefficient of determination (R-squared)
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

# Prediction and **Recommendations**

In [None]:
# Predictions for the test split, as returned by predict() in the previous cell.
y_pred

In [None]:
def categorize_risk(y_pred, low_threshold=0.3, high_threshold=0.6):
    """Map a single burn-rate value to a coarse risk label.

    Parameters
    ----------
    y_pred : float
        One burn-rate value (a prediction or an observed value).
    low_threshold : float, default 0.3
        Values strictly below this are labelled 'Low'.
    high_threshold : float, default 0.6
        Values that are not 'Low' and strictly below this are labelled
        'Moderate'.

    Returns
    -------
    str
        'Low', 'Moderate' or 'High'.

    Notes
    -----
    NaN compares False against both thresholds and therefore falls through
    to 'High'.
    """
    if y_pred < low_threshold:
        return 'Low'
    if y_pred < high_threshold:
        return 'Moderate'
    return 'High'

In [None]:
# Risk label for every predicted burn rate, in prediction order.
risk_categories = list(map(categorize_risk, y_pred))


In [None]:
# Print each prediction next to its risk label.
for predicted, category in zip(y_pred, risk_categories):
    print(f"Predicted Burn Rate: {predicted:.2f}, Risk Category: {category}")

In [None]:
# Label every row by its observed 'Burn Rate' value (the column in `data`,
# not the model's predictions).
data['Risk Category'] = data['Burn Rate'].map(categorize_risk)

In [None]:
# Show a short, richly rendered preview (including the new 'Risk Category'
# column) instead of a plain-text dump of the whole frame.
data.head()