In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sklearn.preprocessing import StandardScaler

import pickle
from urllib.parse import urlparse

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import catboost as cb
from lightgbm import LGBMRegressor

In [3]:
from mlflow.models import ModelSignature
from mlflow.types.schema import Schema, ColSpec
from mlflow.models import infer_signature

# Setup MLFLOW

In [24]:
import mlflow
import mlflow.sklearn
mlflow.set_experiment('Employee-Burn-Out-New-Exp')

2024/09/15 20:09:44 INFO mlflow.tracking.fluent: Experiment with name 'Employee-Burn-Out-New-Exp' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/Windows.10/Employee-Burn-Out/mlruns/845213557949273536', creation_time=1726420184733, experiment_id='845213557949273536', last_update_time=1726420184733, lifecycle_stage='active', name='Employee-Burn-Out-New-Exp', tags={}>

# Read Data

In [6]:
train=pd.read_csv(r'D:/Users/user/mlops/work_burn_out/data/train.csv')
test=pd.read_csv(r'D:/Users/user/mlops/work_burn_out/data/test.csv')

# Actions made to data

In [7]:
class Actions:
    def __init__(self,data):
        self.data=data        
    def preprocess(self):
        self.data.rename(columns={'WFH Setup Available': 'Remote','Resource Allocation':'Resource'}, inplace=True)
        self.data.drop_duplicates(inplace=True)
        self.data=self.data.drop('Employee ID',axis=1)
        self.data=self.data.dropna()
        return self.data
    def encoding(self,data):
        self.data=data
        self.data['Gender'] = self.data['Gender'].replace({'Male': 1,'Female': 0})
        self.data['Company Type'] = self.data['Company Type'].replace({'Service': 1, 'Product': 0})
        self.data['Remote'] = self.data['Remote'].replace({'Yes': 1, 'No': 0})
        return self.data

# Preprocessing
**1. remove duplicates**

**2. drop Employee ID**

**3. rename columns**



In [8]:
actions_to_train_data=Actions(train)
actions_to_test_data=Actions(test)

train=actions_to_train_data.preprocess()
test=actions_to_test_data.preprocess()

# Encoding
# **Categorical data:**
Date of Joining, Gender, Company Type, WFH Setup Available
# **Numerical data:**
Designation, Resource Allocation, Mental Fatigue Score, Burn Rate

In [10]:
train=actions_to_train_data.encoding(train)
test=actions_to_test_data.encoding(test)

In [None]:
print(train['Date of Joining'].min(),'\n'+str(train['Date of Joining'].max()))

2008-01-01 
2008-12-31


In [11]:
#difference between the time employee arrived and the time of the survay which is 2020 kis 4 years
date=np.ones((len(train['Date of Joining']),1))
train['Work Period']=date*12

In [12]:
#no need for the date of joining column any more
train=train.drop('Date of Joining',axis=1)

# **Training**

In [13]:
X=train.drop('Burn Rate',axis=1)
Y=train['Burn Rate']

In [14]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

**xgboost**

In [19]:
feature_names = ['Gender', 'Company Type', 'Remote', 'Designation', 'Resource', 
                 'Mental Fatigue Score', 'Work Period']


xgboost_model=XGBRegressor(learning_rate=0.1, n_estimators=1000,feature_names=feature_names)
xgboost_model.fit(x_train,y_train)

Parameters: { "feature_names" } are not used.



**catboost**

In [None]:

catboost_model = cb.CatBoostRegressor(iterations=200,
            learning_rate=0.1,
            depth=2)

catboost_model.fit(x_train,y_train)

**lightgbm**

In [None]:
lightgbm_model=LGBMRegressor(num_leaves=31,learning_rate=0.05,n_estimators=500)
lightgbm_model.fit(x_train, y_train)

# **mean square error**

# XGBoost

In [22]:
mean_squared_error(y_test, xgboost_model.predict(x_test))

0.0029822811372870748

# catboost

In [17]:
mean_squared_error(y_test, catboost_model.predict(x_test))

0.0029095589725981726

# LightGBM

In [23]:
mean_squared_error(y_test, lightgbm_model.predict(x_test))

0.002804352340351687

# Save model

In [19]:
pickle.dump(catboost_model , open('burn-out-model.pk1' , 'wb'))

In [21]:
model = pickle.load(open('burn-out-model.pk1' , 'rb'))

# Burn out rate prediction

In [43]:
x_test.iloc[0]

Gender                   1.0
Company Type             1.0
Remote                   1.0
Designation              1.0
Resource                 2.0
Mental Fatigue Score     3.9
Work Period             12.0
Name: 8815, dtype: float64

In [84]:
pred=randomforest_model.predict([x_test.iloc[0]])



In [85]:
print('employee',8815,'predicted burn out rate is',round(pred[0]*100,2),'%')

employee 8815 predicted burn out rate is 32.32 %


In [87]:
print('employee',8815,'actual burn out rate is',round(y_test.iloc[0]*100,2),'%')

employee 8815 actual burn out rate is 18.0 %


# Test the insights

In [None]:
train['Resource'].value_counts()

Resource
4.0     3396
5.0     3364
3.0     2761
6.0     2590
2.0     1794
7.0     1697
1.0     1551
8.0      907
9.0      396
10.0     134
Name: count, dtype: int64

In [None]:
'''an employee:
1. is a male=1
2. the Company Type is service=1
3. work is not remote=0
4. Designation is high=4.0
5. have high resource=8
6. Mental Fatigue Score =8.0'''
gender=1
company_type=1
remote=0
designation=4.0
resource=8
mental_fatigue_score=8.0
work_period=4.0

data=[gender,company_type,remote,designation,resource,mental_fatigue_score,work_period]
print('Burn out rate is:',round(model.predict(data)*100,2),'%')

Burn out rate is: 72.6 %


In [None]:
#for a female
data=[0,1,0,4.0,8.0,8.0,4.0]
print('Burn out rate is:',round(model.predict(data)*100,2),'%')

Burn out rate is: 72.47 %


In [None]:
#remote work
data=[1,1,1,4.0,8.0,8.0,4.0]
print('Burn out rate is:',round(model.predict(data)*100,2),'%')

Burn out rate is: 70.43 %


In [None]:
#remote work , designation low, resource high
data=[1,1,1,2.0,8.0,8.0,4.0]
print('Burn out rate is:',round(model.predict(data)*100,2),'%')

Burn out rate is: 77.47 %


In [None]:
#female, remote work , designation low, resource high
data=[0,1,1,2.0,8.0,8.0,4.0]
print('Burn out rate is:',round(model.predict(data)*100,2),'%')

Burn out rate is: 76.75 %


In [None]:
#female, remote work , designation low, resource low
data=[0,1,1,2.0,2.0,8.0,4.0]
print('Burn out rate is:',round(model.predict(data)*100,2),'%')

Burn out rate is: 48.07 %


In [None]:
#female, remote work , designation low, resource low,low mental fatigue
data=[0,1,1,2.0,2.0,2.0,4.0]
print('Burn out rate is:',round(model.predict(data)*100,2),'%')

Burn out rate is: 12.91 %


In [None]:
#female, remote work , designation low, resource low
data=[0,1,1,1.0,1.0,1.0,4.0]
print('Burn out rate is:',round(model.predict(data)*100,2),'%')

Burn out rate is: 3.31 %


In [None]:
test_data=np.array([[1,1,0,4.0,8,8.0,4.0],[0,1,0,4.0,8.0,8.0,4.0],[1,1,1,4.0,8.0,8.0,4.0],
  [1,1,1,2.0,8.0,8.0,4.0],[0,1,1,2.0,8.0,8.0,4.0],[0,1,1,2.0,2.0,8.0,4.0],
  [0,1,1,2.0,2.0,2.0,4.0],[0,1,1,1.0,1.0,1.0,4.0]])

In [None]:
test=pd.DataFrame(test_data,columns=['Gender','Company Type','Remote','Designation','Resource','Mental Fatigue Score','Work Period'])

In [None]:
predictions=model.predict(test)

In [None]:
test['predictions']=predictions*100

In [None]:
test.head(8)

  return method()
  return method()


Unnamed: 0,Gender,Company Type,Remote,Designation,Resource,Mental Fatigue Score,Work Period,predictions
0,1.0,1.0,0.0,4.0,8.0,8.0,4.0,72.603221
1,0.0,1.0,0.0,4.0,8.0,8.0,4.0,72.47193
2,1.0,1.0,1.0,4.0,8.0,8.0,4.0,70.432557
3,1.0,1.0,1.0,2.0,8.0,8.0,4.0,77.472639
4,0.0,1.0,1.0,2.0,8.0,8.0,4.0,76.748176
5,0.0,1.0,1.0,2.0,2.0,8.0,4.0,48.071661
6,0.0,1.0,1.0,2.0,2.0,2.0,4.0,12.909549
7,0.0,1.0,1.0,1.0,1.0,1.0,4.0,3.313869


# MLflow

In [15]:
def calculate_rmse(actual_values, predicted_values):
    """
    Calculate the Root Mean Square Error (RMSE) between actual_values and predicted_values.
    
    Parameters:
    - actual_values: Array-like object containing the actual values.
    - predicted_values: Array-like object containing the predicted values.
    
    Returns:
    - rmse: The Root Mean Square Error as a float.
    """
    # Ensure inputs are NumPy arrays
    actual_values = np.array(actual_values)
    predicted_values = np.array(predicted_values)
    
    # Calculate the difference between actual and predicted values
    differences = actual_values - predicted_values
    
    # Square the differences
    squared_differences = differences ** 2
    
    # Calculate the mean of the squared differences
    mean_squared_difference = np.mean(squared_differences)
    
    # Take the square root of the mean squared difference to get RMSE
    rmse = np.sqrt(mean_squared_difference)
    
    return rmse

In [25]:
models={'1':{'name':'xgboost','model':XGBRegressor(learning_rate=0.1, n_estimators=1000)},
        '2':{'name':'catboost','model':cb.CatBoostRegressor(iterations=200,learning_rate=0.1,depth=2)},
        '3':{'name':'lightgbm','model':LGBMRegressor(num_leaves=31,learning_rate=0.05,n_estimators=500)}
        }
for key in ['1','2','3']:
    with mlflow.start_run():
     model=models[key]['model']
     model.fit(x_train,y_train)
     predict=model.predict(x_test)
     rmse=np.sqrt(calculate_rmse(y_test,predict))
     mlflow.log_metric("rmse",rmse)
     signature=infer_signature(x_train,predict)
     tracking_url_type_store=urlparse(mlflow.get_tracking_uri()).scheme
     if tracking_url_type_store!="file":
        mlflow.sklearn.log_model(model,"model",registered_model_name=models[key]['name'], signature=signature)
     else:
        mlflow.sklearn.log_model(model,"model", signature=signature)
   



0:	learn: 0.1846865	total: 4.65ms	remaining: 926ms
1:	learn: 0.1724098	total: 7.17ms	remaining: 710ms
2:	learn: 0.1609908	total: 9.28ms	remaining: 609ms
3:	learn: 0.1511941	total: 14.9ms	remaining: 728ms
4:	learn: 0.1417847	total: 19.3ms	remaining: 753ms
5:	learn: 0.1331957	total: 23.9ms	remaining: 773ms
6:	learn: 0.1262241	total: 29.8ms	remaining: 821ms
7:	learn: 0.1190350	total: 34.5ms	remaining: 827ms
8:	learn: 0.1128067	total: 64.4ms	remaining: 1.37s
9:	learn: 0.1076561	total: 68.9ms	remaining: 1.31s
10:	learn: 0.1023663	total: 73.8ms	remaining: 1.27s
11:	learn: 0.0976709	total: 80.5ms	remaining: 1.26s
12:	learn: 0.0932804	total: 85.3ms	remaining: 1.23s
13:	learn: 0.0895690	total: 89.7ms	remaining: 1.19s
14:	learn: 0.0861403	total: 96.8ms	remaining: 1.19s
15:	learn: 0.0832705	total: 121ms	remaining: 1.39s
16:	learn: 0.0802041	total: 127ms	remaining: 1.37s
17:	learn: 0.0775330	total: 159ms	remaining: 1.6s
18:	learn: 0.0750922	total: 162ms	remaining: 1.54s
19:	learn: 0.0730637	total:



In [None]:
 with mlflow.start_run():
        model = cb.CatBoostRegressor(loss_function='RMSE')
        model.fit(x_train,y_train)
        predict=model.predict(y_test)
        rmse=np.sqrt(calculate_rmse(y_test,predict))
        mlflow.log_metric("rmse",rmse)
        signature=infer_signature(x_train,predict)

        tracking_url_type_store=urlparse(mlflow.get_tracking_uri()).scheme
        if tracking_url_type_store!="file":
            mlflow.sklearn.log_model(model,"model",registered_model_name="Catboost", signature=signature)
        else:
            mlflow.sklearn.log_model(model,"model", signature=signature)
