In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from sklearn.preprocessing import StandardScaler

import pickle
from urllib.parse import urlparse

from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import catboost as cb
import lightgbm as lgbm

In [28]:
from mlflow.models import ModelSignature
from mlflow.types.schema import Schema, ColSpec
from mlflow.models import infer_signature
from mlflow import MlflowClient

# Setup MLFLOW

In [18]:
import mlflow
import mlflow.sklearn
import mlflow.catboost 
import mlflow.xgboost
import mlflow.lightgbm 
mlflow.set_experiment('Employee-Burn-Out-New-Exp')

<Experiment: artifact_location='file:///c:/Users/Windows.10/Employee-Burn-Out/mlruns/845213557949273536', creation_time=1726420184733, experiment_id='845213557949273536', last_update_time=1726420184733, lifecycle_stage='active', name='Employee-Burn-Out-New-Exp', tags={}>

# Read Data

In [5]:
train=pd.read_csv(r'D:/Users/user/mlops/work_burn_out/data/train.csv')
test=pd.read_csv(r'D:/Users/user/mlops/work_burn_out/data/test.csv')

# Actions made to data

In [6]:
class Actions:
    def __init__(self,data):
        self.data=data        
    def preprocess(self):
        self.data.rename(columns={'WFH Setup Available': 'Remote','Resource Allocation':'Resource'}, inplace=True)
        self.data.drop_duplicates(inplace=True)
        self.data=self.data.drop('Employee ID',axis=1)
        self.data=self.data.dropna()
        return self.data
    def encoding(self,data):
        self.data=data
        self.data['Gender'] = self.data['Gender'].replace({'Male': 1,'Female': 0})
        self.data['Company Type'] = self.data['Company Type'].replace({'Service': 1, 'Product': 0})
        self.data['Remote'] = self.data['Remote'].replace({'Yes': 1, 'No': 0})
        return self.data

# Preprocessing
**1. remove duplicates**

**2. drop Employee ID**

**3. rename columns**



In [7]:
actions_to_train_data=Actions(train)
actions_to_test_data=Actions(test)

train=actions_to_train_data.preprocess()
test=actions_to_test_data.preprocess()

# Encoding
# **Categorical data:**
Date of Joining, Gender, Company Type, WFH Setup Available
# **Numerical data:**
Designation, Resource Allocation, Mental Fatigue Score, Burn Rate

In [8]:
train=actions_to_train_data.encoding(train)
test=actions_to_test_data.encoding(test)

In [None]:
print(train['Date of Joining'].min(),'\n'+str(train['Date of Joining'].max()))

2008-01-01 
2008-12-31


In [9]:
#difference between the time employee arrived and the time of the survay which is 2020 kis 4 years
date=np.ones((len(train['Date of Joining']),1))
train['Work Period']=date*12

In [10]:
#no need for the date of joining column any more
train=train.drop('Date of Joining',axis=1)

# **Training**

In [11]:
X=train.drop('Burn Rate',axis=1)
Y=train['Burn Rate']

In [12]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

# MLflow

In [23]:
class LossFunctions:
    def init(self, actual_values, predicted_values):
        self.actual_values=np.array(actual_values)
        self.predicted_values=np.array(predicted_values)
        self.differences=self.actual_values-self.predicted_values

    def calculate_mae(self):
        return np.absolute(np.mean(self.differences))

    def calculate_mse(self):
        # Square the differences
        squared_differences = self.differences ** 2
        return np.mean(squared_differences)

    def calculate_rmse(self):            
        # Take the square root of the mean squared difference to get RMSE
        rmse = np.sqrt(self.calculate_mse())    
        return rmse

In [26]:
losses=LossFunctions()
models={1:{'model':xgb.XGBRegressor(learning_rate=0.1, n_estimators=1000)},
        2:{'model':cb.CatBoostRegressor(iterations=200,learning_rate=0.1,depth=2)},
        3:{'model':lgbm.LGBMRegressor(num_leaves=31,learning_rate=0.05,n_estimators=500)}
        }
model_names={1:'xgboost',
             2:'catboost',
             3:'lightgbm'}
for key in [1,2,3]:
    with mlflow.start_run():
     model=models[key]['model']
     model.fit(x_train,y_train)
     predict=model.predict(x_test)
     losses.init(y_test,predict)
     mlflow.log_metric("rmse",losses.calculate_rmse())
     mlflow.log_metric("mse",losses.calculate_mse())
     mlflow.log_metric("mae",losses.calculate_mae())
     signature=infer_signature(x_train,predict)
     tracking_url_type_store=urlparse(mlflow.get_tracking_uri()).scheme
     mlflow.sklearn.log_model(model,"model",registered_model_name=model_names[key], signature=signature)

Registered model 'xgboost' already exists. Creating a new version of this model...
Created version '2' of model 'xgboost'.


0:	learn: 0.1846865	total: 35.5ms	remaining: 7.07s
1:	learn: 0.1724098	total: 38ms	remaining: 3.76s
2:	learn: 0.1609908	total: 40.4ms	remaining: 2.65s
3:	learn: 0.1511941	total: 42.7ms	remaining: 2.09s
4:	learn: 0.1417847	total: 45.1ms	remaining: 1.76s
5:	learn: 0.1331957	total: 47.6ms	remaining: 1.54s
6:	learn: 0.1262241	total: 50.1ms	remaining: 1.38s
7:	learn: 0.1190350	total: 52.3ms	remaining: 1.25s
8:	learn: 0.1128067	total: 54.7ms	remaining: 1.16s
9:	learn: 0.1076561	total: 57ms	remaining: 1.08s
10:	learn: 0.1023663	total: 59.4ms	remaining: 1.02s
11:	learn: 0.0976709	total: 61.8ms	remaining: 968ms
12:	learn: 0.0932804	total: 65.1ms	remaining: 936ms
13:	learn: 0.0895690	total: 69.7ms	remaining: 927ms
14:	learn: 0.0861403	total: 75.6ms	remaining: 932ms
15:	learn: 0.0832705	total: 79.3ms	remaining: 912ms
16:	learn: 0.0802041	total: 83.9ms	remaining: 903ms
17:	learn: 0.0775330	total: 87.5ms	remaining: 884ms
18:	learn: 0.0750922	total: 91ms	remaining: 867ms
19:	learn: 0.0730637	total: 

Registered model 'catboost' already exists. Creating a new version of this model...
Created version '12' of model 'catboost'.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000308 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 124
[LightGBM] [Info] Number of data points in the train set: 14872, number of used features: 6
[LightGBM] [Info] Start training from score 0.452021


Registered model 'lightgbm' already exists. Creating a new version of this model...
Created version '2' of model 'lightgbm'.


# Retrive best model

In [40]:
client = mlflow.MlflowClient()

latest_model_version = client.get_latest_versions(name="catboost")

  latest_model_version = client.get_latest_versions(name="catboost")


In [45]:


# Load the model
loaded_model = mlflow.pyfunc.load_model(f"runs:/CatBoost/{'Employee-Burn-Out-New-Exp'}/model.pkl")

KeyError: 'experiment_id'