In [53]:
! pip install -q pandas numpy scikit-learn lightgbm


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# IMPORT

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# LOAD DATA

In [6]:
df = pd.read_csv('../KNN_Imputation/train_imputed.csv')
df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,0,2008-09-30,Female,Service,No,2,3,3.80,0.16
1,1,2008-11-30,Male,Service,Yes,1,2,5.00,0.36
2,2,2008-03-10,Female,Product,Yes,2,4,5.80,0.49
3,3,2008-11-03,Male,Service,Yes,1,1,2.60,0.20
4,4,2008-07-24,Female,Service,No,3,7,6.90,0.52
...,...,...,...,...,...,...,...,...,...
10103,10103,2008-07-04,Female,Product,No,3,6,5.90,0.51
10104,10104,2008-02-29,Male,Service,No,3,6,6.30,0.62
10105,10105,2008-01-25,Male,Service,No,3,5,5.70,0.47
10106,10106,2008-12-26,Male,Service,Yes,2,3,5.50,0.40


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10108 entries, 0 to 10107
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           10108 non-null  int64  
 1   Date of Joining       10108 non-null  object 
 2   Gender                10108 non-null  object 
 3   Company Type          10108 non-null  object 
 4   WFH Setup Available   10108 non-null  object 
 5   Designation           10108 non-null  int64  
 6   Resource Allocation   10108 non-null  int64  
 7   Mental Fatigue Score  10108 non-null  float64
 8   Burn Rate             9624 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 710.8+ KB


# EDA & Preprocessing

In [8]:
df.isnull().sum()

Employee ID               0
Date of Joining           0
Gender                    0
Company Type              0
WFH Setup Available       0
Designation               0
Resource Allocation       0
Mental Fatigue Score      0
Burn Rate               484
dtype: int64

In [9]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9624 entries, 0 to 10107
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           9624 non-null   int64  
 1   Date of Joining       9624 non-null   object 
 2   Gender                9624 non-null   object 
 3   Company Type          9624 non-null   object 
 4   WFH Setup Available   9624 non-null   object 
 5   Designation           9624 non-null   int64  
 6   Resource Allocation   9624 non-null   int64  
 7   Mental Fatigue Score  9624 non-null   float64
 8   Burn Rate             9624 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 751.9+ KB


In [10]:
# datetime
df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])

reference_date = df['Date of Joining'].max()
df['Days_with_company'] = (reference_date - df['Date of Joining']).dt.days

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Days_with_company'] = (reference_date - df['Date of Joining']).dt.days


In [11]:
# lightGBM不用one hot
categorical_features = ['Gender', 'Company Type', 'WFH Setup Available']


for col in categorical_features:
    df[col] = df[col].astype('category')

df_X = df[[
        'Designation', 'Resource Allocation',
        'Mental Fatigue Score', 'Days_with_company',
        'Gender', 'Company Type', 'WFH Setup Available'  # <-- 使用原始欄位名
    ]]
df_y = df['Burn Rate']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('category')


In [12]:
df_X

Unnamed: 0,Designation,Resource Allocation,Mental Fatigue Score,Days_with_company,Gender,Company Type,WFH Setup Available
0,2,3,3.80,92,Female,Service,No
1,1,2,5.00,31,Male,Service,Yes
2,2,4,5.80,296,Female,Product,Yes
3,1,1,2.60,58,Male,Service,Yes
4,3,7,6.90,160,Female,Service,No
...,...,...,...,...,...,...,...
10103,3,6,5.90,180,Female,Product,No
10104,3,6,6.30,306,Male,Service,No
10105,3,5,5.70,341,Male,Service,No
10106,2,3,5.50,5,Male,Service,Yes


In [13]:
df_y

0        0.16
1        0.36
2        0.49
3        0.20
4        0.52
         ... 
10103    0.51
10104    0.62
10105    0.47
10106    0.40
10107    0.74
Name: Burn Rate, Length: 9624, dtype: float64

# Split train test

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

# lightGBM

In [35]:
import lightgbm as lgb
# 先用early stop看estimator
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

model = lgb.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    random_state=42
)

model.fit(
    X_train_part,
    y_train_part,
    eval_set=[(X_val, y_val)],
    eval_metric='mae',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=True)]
)

optimal_n_estimators = model.best_iteration_
print(f"n_estimators: {optimal_n_estimators}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 470
[LightGBM] [Info] Number of data points in the train set: 5774, number of used features: 7
[LightGBM] [Info] Start training from score 0.456297
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[84]	valid_0's l1: 0.048166	valid_0's l2: 0.00380978
n_estimators: 84


In [97]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

In [98]:
param_grid_lgbm = {
    'learning_rate': [0.1, 0.05, 0.01],             
    'n_estimators': [80, 200, 400, 600, 800],    
}

In [99]:
lgbm_model = lgb.LGBMRegressor(random_state=42, n_jobs=-3)
grid_search = GridSearchCV(
    estimator=lgbm_model,
    param_grid=param_grid_lgbm,
    cv=5, # 5 fold
    scoring='neg_mean_absolute_error', 
    n_jobs=-3, 
    verbose=2 
)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


0,1,2
,estimator,LGBMRegressor...ndom_state=42)
,param_grid,"{'learning_rate': [0.1, 0.05, ...], 'n_estimators': [80, 200, ...]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-3
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,80
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [100]:
print(grid_search.best_params_)
print("Best MAE: ", -grid_search.best_score_) 

{'learning_rate': 0.05, 'n_estimators': 80}
Best MAE:  0.04854788328740608


In [103]:
lgbm_model = grid_search.best_estimator_

In [104]:
import joblib
joblib.dump(lgbm_model, '../lightGBM/final_lgbm_model.joblib')

['../lightGBM/final_lgbm_model.joblib']

In [105]:
y_pred = lgbm_model.predict(X_test)

In [106]:
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

In [107]:
mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2) Score: {r2}")

Mean Absolute Error (MAE): 0.04820176910553892
R-squared (R2) Score: 0.9014433036841141


# test data

In [108]:
test_df = pd.read_csv('../KNN_Imputation/test_imputed.csv')

In [109]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1980 entries, 0 to 1979
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           1980 non-null   int64  
 1   Date of Joining       1980 non-null   object 
 2   Gender                1980 non-null   object 
 3   Company Type          1980 non-null   object 
 4   WFH Setup Available   1980 non-null   object 
 5   Designation           1980 non-null   int64  
 6   Resource Allocation   1980 non-null   int64  
 7   Mental Fatigue Score  1980 non-null   float64
dtypes: float64(1), int64(3), object(4)
memory usage: 123.9+ KB


In [110]:
# test_df['Resource Allocation'] = test_df['Resource Allocation'].fillna(df_X['Resource Allocation'].mode()[0])
# test_df['Mental Fatigue Score'] = test_df['Mental Fatigue Score'].fillna(df_X['Mental Fatigue Score'].mean())
# test_df.info()

In [111]:
# datetime
test_df['Date of Joining'] = pd.to_datetime(test_df['Date of Joining'])

reference_date = test_df['Date of Joining'].max()
test_df['Days_with_company'] = (reference_date - test_df['Date of Joining']).dt.days

In [112]:
for col in categorical_features: 
    test_df[col] = test_df[col].astype('category')

df_X_test = test_df[[
        'Designation', 'Resource Allocation',
        'Mental Fatigue Score', 'Days_with_company',
        'Gender', 'Company Type', 'WFH Setup Available' # <-- 使用原始欄位名
    ]]

In [113]:
y_pred = lgbm_model.predict(df_X_test)

In [114]:
submission_df = pd.DataFrame({
    'Employee ID': test_df['Employee ID'],
    'Burn Rate': y_pred
})

In [115]:
submission_df

Unnamed: 0,Employee ID,Burn Rate
0,0,0.015895
1,1,0.724962
2,2,0.754256
3,3,0.403526
4,4,0.485286
...,...,...
1975,1975,0.197022
1976,1976,0.457417
1977,1977,0.350411
1978,1978,0.585874


In [116]:
submission_df.to_csv('lightGBM_rough_kfoldGridsearch_KNNimputed_v0.1.csv', index=False)