In [1]:
! pip install -q pandas numpy scikit-learn lightgbm


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# IMPORT

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# LOAD DATA

In [3]:
df = pd.read_csv('../KNN_Imputation/train_imputed_stacking.csv')
df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,8387,2008-08-13,Male,Service,No,2,5,6.70,0.51
1,1817,2008-12-04,Male,Service,Yes,3,5,5.94,0.45
2,4386,2008-02-23,Male,Service,No,3,6,7.00,0.62
3,9317,2008-08-07,Female,Service,No,2,3,2.70,
4,4575,2008-10-01,Female,Service,No,2,5,6.00,0.45
...,...,...,...,...,...,...,...,...,...
8081,5734,2008-11-02,Female,Service,No,4,7,8.70,0.79
8082,5191,2008-12-17,Female,Service,Yes,2,4,5.30,0.32
8083,5390,2008-03-24,Female,Service,No,2,4,6.60,0.57
8084,860,2008-08-30,Female,Service,No,1,2,5.60,0.40


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8086 entries, 0 to 8085
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           8086 non-null   int64  
 1   Date of Joining       8086 non-null   object 
 2   Gender                8086 non-null   object 
 3   Company Type          8086 non-null   object 
 4   WFH Setup Available   8086 non-null   object 
 5   Designation           8086 non-null   int64  
 6   Resource Allocation   8086 non-null   int64  
 7   Mental Fatigue Score  8086 non-null   float64
 8   Burn Rate             7702 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 568.7+ KB


# EDA & Preprocessing

In [5]:
df.isnull().sum()

Employee ID               0
Date of Joining           0
Gender                    0
Company Type              0
WFH Setup Available       0
Designation               0
Resource Allocation       0
Mental Fatigue Score      0
Burn Rate               384
dtype: int64

In [6]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7702 entries, 0 to 8085
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           7702 non-null   int64  
 1   Date of Joining       7702 non-null   object 
 2   Gender                7702 non-null   object 
 3   Company Type          7702 non-null   object 
 4   WFH Setup Available   7702 non-null   object 
 5   Designation           7702 non-null   int64  
 6   Resource Allocation   7702 non-null   int64  
 7   Mental Fatigue Score  7702 non-null   float64
 8   Burn Rate             7702 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 601.7+ KB


In [7]:
# datetime
df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])

reference_date = pd.to_datetime(pd.read_csv('../train.csv')['Date of Joining']).min()
df['Days_with_company'] = (df['Date of Joining'] - reference_date).dt.days

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Days_with_company'] = (df['Date of Joining'] - reference_date).dt.days


In [8]:
# lightGBM不用one hot
categorical_features = ['Gender', 'Company Type', 'WFH Setup Available']


for col in categorical_features:
    df[col] = df[col].astype('category')

df_X = df[[
        'Designation', 'Resource Allocation',
        'Mental Fatigue Score', 'Days_with_company',
        'Gender', 'Company Type', 'WFH Setup Available'  # <-- 使用原始欄位名
    ]]
df_y = df['Burn Rate']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')


In [9]:
df_X

Unnamed: 0,Designation,Resource Allocation,Mental Fatigue Score,Days_with_company,Gender,Company Type,WFH Setup Available
0,2,5,6.70,225,Male,Service,No
1,3,5,5.94,338,Male,Service,Yes
2,3,6,7.00,53,Male,Service,No
4,2,5,6.00,274,Female,Service,No
5,2,3,4.60,14,Male,Service,Yes
...,...,...,...,...,...,...,...
8081,4,7,8.70,306,Female,Service,No
8082,2,4,5.30,351,Female,Service,Yes
8083,2,4,6.60,83,Female,Service,No
8084,1,2,5.60,242,Female,Service,No


In [10]:
df_y

0       0.51
1       0.45
2       0.62
4       0.45
5       0.27
        ... 
8081    0.79
8082    0.32
8083    0.57
8084    0.40
8085    0.63
Name: Burn Rate, Length: 7702, dtype: float64

# Split train test

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

# lightGBM

In [12]:
import lightgbm as lgb
# 先用early stop看estimator
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

model = lgb.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    random_state=42
)

model.fit(
    X_train_part,
    y_train_part,
    eval_set=[(X_val, y_val)],
    eval_metric='mae',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True)]
)

optimal_n_estimators = model.best_iteration_
print(f"n_estimators: {optimal_n_estimators}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 467
[LightGBM] [Info] Number of data points in the train set: 4620, number of used features: 7
[LightGBM] [Info] Start training from score 0.452526
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[90]	valid_0's l1: 0.0503561	valid_0's l2: 0.00427579
n_estimators: 90


In [13]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

In [14]:
param_grid_lgbm = {
    'learning_rate': [0.1, 0.05, 0.01],             
    'n_estimators': [80, 200, 400, 600, 800],    
}

In [15]:
lgbm_model = lgb.LGBMRegressor(random_state=42, n_jobs=-3)
grid_search = GridSearchCV(
    estimator=lgbm_model,
    param_grid=param_grid_lgbm,
    cv=5, # 5 fold
    scoring='neg_mean_absolute_error', 
    n_jobs=-3, 
    verbose=2 
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 476
[LightGBM] [Info] Number of data points in the train set: 6161, number of used features: 7
[LightGBM] [Info] Start training from score 0.451638


0,1,2
,estimator,LGBMRegressor...ndom_state=42)
,param_grid,"{'learning_rate': [0.1, 0.05, ...], 'n_estimators': [80, 200, ...]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-3
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,80
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [16]:
print(grid_search.best_params_)
print("Best MAE: ", -grid_search.best_score_) 

{'learning_rate': 0.05, 'n_estimators': 80}
Best MAE:  0.04889930681043185


In [17]:
lgbm_model = grid_search.best_estimator_

In [18]:
import joblib
joblib.dump(lgbm_model, 'stacking_lgbm_model.joblib')

['stacking_lgbm_model.joblib']

In [19]:
y_pred = lgbm_model.predict(X_test)

In [20]:
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

In [21]:
mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2) Score: {r2}")

Mean Absolute Error (MAE): 0.04954702612739721
R-squared (R2) Score: 0.891399694250681
