In [1]:
! pip install -q pandas numpy scikit-learn xgboost


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# IMPORT

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# LOAD DATA

In [3]:
df = pd.read_csv('../KNN_Imputation/train_imputed_stacking.csv')
df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,8387,2008-08-13,Male,Service,No,2,5,6.70,0.51
1,1817,2008-12-04,Male,Service,Yes,3,5,5.94,0.45
2,4386,2008-02-23,Male,Service,No,3,6,7.00,0.62
3,9317,2008-08-07,Female,Service,No,2,3,2.70,
4,4575,2008-10-01,Female,Service,No,2,5,6.00,0.45
...,...,...,...,...,...,...,...,...,...
8081,5734,2008-11-02,Female,Service,No,4,7,8.70,0.79
8082,5191,2008-12-17,Female,Service,Yes,2,4,5.30,0.32
8083,5390,2008-03-24,Female,Service,No,2,4,6.60,0.57
8084,860,2008-08-30,Female,Service,No,1,2,5.60,0.40


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8086 entries, 0 to 8085
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           8086 non-null   int64  
 1   Date of Joining       8086 non-null   object 
 2   Gender                8086 non-null   object 
 3   Company Type          8086 non-null   object 
 4   WFH Setup Available   8086 non-null   object 
 5   Designation           8086 non-null   int64  
 6   Resource Allocation   8086 non-null   int64  
 7   Mental Fatigue Score  8086 non-null   float64
 8   Burn Rate             7702 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 568.7+ KB


# EDA & Preprocessing

In [5]:
df.isnull().sum()

Employee ID               0
Date of Joining           0
Gender                    0
Company Type              0
WFH Setup Available       0
Designation               0
Resource Allocation       0
Mental Fatigue Score      0
Burn Rate               384
dtype: int64

In [6]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7702 entries, 0 to 8085
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           7702 non-null   int64  
 1   Date of Joining       7702 non-null   object 
 2   Gender                7702 non-null   object 
 3   Company Type          7702 non-null   object 
 4   WFH Setup Available   7702 non-null   object 
 5   Designation           7702 non-null   int64  
 6   Resource Allocation   7702 non-null   int64  
 7   Mental Fatigue Score  7702 non-null   float64
 8   Burn Rate             7702 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 601.7+ KB


In [7]:
# datetime
df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])

reference_date = pd.to_datetime(pd.read_csv('../train.csv')['Date of Joining']).min()
df['Days_with_company'] = (df['Date of Joining'] - reference_date).dt.days

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Days_with_company'] = (df['Date of Joining'] - reference_date).dt.days


In [8]:
df_encoded = pd.get_dummies(df, columns=['Gender', 'Company Type', 'WFH Setup Available'], drop_first=True)

df_X = df_encoded[[
        'Designation', 'Resource Allocation',
        'Mental Fatigue Score', 'Days_with_company', 'Gender_Male',
        'Company Type_Service', 'WFH Setup Available_Yes'
    ]]
df_y = df_encoded['Burn Rate']

In [9]:
df_X

Unnamed: 0,Designation,Resource Allocation,Mental Fatigue Score,Days_with_company,Gender_Male,Company Type_Service,WFH Setup Available_Yes
0,2,5,6.70,225,True,True,False
1,3,5,5.94,338,True,True,True
2,3,6,7.00,53,True,True,False
4,2,5,6.00,274,False,True,False
5,2,3,4.60,14,True,True,True
...,...,...,...,...,...,...,...
8081,4,7,8.70,306,False,True,False
8082,2,4,5.30,351,False,True,True
8083,2,4,6.60,83,False,True,False
8084,1,2,5.60,242,False,True,False


In [10]:
df_y

0       0.51
1       0.45
2       0.62
4       0.45
5       0.27
        ... 
8081    0.79
8082    0.32
8083    0.57
8084    0.40
8085    0.63
Name: Burn Rate, Length: 7702, dtype: float64

# Split train test

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

# XGBoost

In [12]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [13]:
# param_grid_xgb = {
#     'learning_rate': [0.1, 0.05],
#     'n_estimators': [400, 600, 800],
#     'max_depth': [3, 5, 7, 9, 12],  
#     'gamma': [0, 0.1],      
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0],
#     'objective': ['reg:absoluteerror', 'reg:squarederror']
# }

param_grid_xgb = {
    # 'booster': ['gbtree', 'gblinear'],
    'learning_rate': [0.01, 0.001],
    'n_estimators': [800, 1000, 1200],
    'max_depth': [5],  
    # 'gamma': [0, 0.1],      
    'subsample': [0.8, 1.0],
    # 'colsample_bytree': [0.8, 1.0],
    'objective': ['reg:absoluteerror']
}

In [14]:
xgb_model = xgb.XGBRegressor(
    random_state = 42, 
    n_job = -3
)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    cv=5,
    scoring='neg_mean_absolute_error', 
    n_jobs=-3, 
    verbose=2 
)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


Parameters: { "n_job" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000, 'objective': 'reg:absoluteerror', 'subsample': 0.8}


In [15]:
best_mae = -grid_search.best_score_
print(best_mae)

0.04886037496463496


In [16]:
xgb_model = grid_search.best_estimator_

In [17]:
import joblib
joblib.dump(xgb_model, 'stacking_xgb_model.joblib')
# xgb_model = joblib.load('../XGBoost/final_xgb_model.joblib')

['stacking_xgb_model.joblib']

In [18]:
y_pred = xgb_model.predict(X_test)

In [19]:
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

In [20]:
mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2) Score: {r2}")

Mean Absolute Error (MAE): 0.0496925428934902
R-squared (R2) Score: 0.88896873794843
