In [51]:
! pip install -q pandas numpy scikit-learn


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# IMPORT

In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split

# LOAD DATA

In [53]:
df = pd.read_csv('../KNN_Imputation/train_imputed.csv')
df

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,0,2008-09-30,Female,Service,No,2,3,3.80,0.16
1,1,2008-11-30,Male,Service,Yes,1,2,5.00,0.36
2,2,2008-03-10,Female,Product,Yes,2,4,5.80,0.49
3,3,2008-11-03,Male,Service,Yes,1,1,2.60,0.20
4,4,2008-07-24,Female,Service,No,3,7,6.90,0.52
...,...,...,...,...,...,...,...,...,...
10103,10103,2008-07-04,Female,Product,No,3,6,5.90,0.51
10104,10104,2008-02-29,Male,Service,No,3,6,6.30,0.62
10105,10105,2008-01-25,Male,Service,No,3,5,5.70,0.47
10106,10106,2008-12-26,Male,Service,Yes,2,3,5.50,0.40


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10108 entries, 0 to 10107
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           10108 non-null  int64  
 1   Date of Joining       10108 non-null  object 
 2   Gender                10108 non-null  object 
 3   Company Type          10108 non-null  object 
 4   WFH Setup Available   10108 non-null  object 
 5   Designation           10108 non-null  int64  
 6   Resource Allocation   10108 non-null  int64  
 7   Mental Fatigue Score  10108 non-null  float64
 8   Burn Rate             9624 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 710.8+ KB


# EDA & Preprocessing

In [55]:
df.isnull().sum()

Employee ID               0
Date of Joining           0
Gender                    0
Company Type              0
WFH Setup Available       0
Designation               0
Resource Allocation       0
Mental Fatigue Score      0
Burn Rate               484
dtype: int64

In [56]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9624 entries, 0 to 10107
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           9624 non-null   int64  
 1   Date of Joining       9624 non-null   object 
 2   Gender                9624 non-null   object 
 3   Company Type          9624 non-null   object 
 4   WFH Setup Available   9624 non-null   object 
 5   Designation           9624 non-null   int64  
 6   Resource Allocation   9624 non-null   int64  
 7   Mental Fatigue Score  9624 non-null   float64
 8   Burn Rate             9624 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 751.9+ KB


In [57]:
# datetime
df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])

# reference_date = df['Date of Joining'].min()
reference_date = pd.to_datetime(pd.read_csv('../train.csv')['Date of Joining']).min()
df['Days_with_company'] = (df['Date of Joining'] - reference_date).dt.days

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date of Joining'] = pd.to_datetime(df['Date of Joining'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Days_with_company'] = (df['Date of Joining'] - reference_date).dt.days


In [58]:
df_encoded = pd.get_dummies(df, columns=['Gender', 'Company Type', 'WFH Setup Available'], drop_first=True)

df_X = df_encoded[[
        'Designation', 'Resource Allocation',
        'Mental Fatigue Score', 'Days_with_company', 'Gender_Male',
        'Company Type_Service', 'WFH Setup Available_Yes'
    ]]
df_y = df_encoded['Burn Rate']

In [59]:
df_X

Unnamed: 0,Designation,Resource Allocation,Mental Fatigue Score,Days_with_company,Gender_Male,Company Type_Service,WFH Setup Available_Yes
0,2,3,3.80,273,False,True,False
1,1,2,5.00,334,True,True,True
2,2,4,5.80,69,False,False,True
3,1,1,2.60,307,True,True,True
4,3,7,6.90,205,False,True,False
...,...,...,...,...,...,...,...
10103,3,6,5.90,185,False,False,False
10104,3,6,6.30,59,True,True,False
10105,3,5,5.70,24,True,True,False
10106,2,3,5.50,360,True,True,True


In [60]:
df_y

0        0.16
1        0.36
2        0.49
3        0.20
4        0.52
         ... 
10103    0.51
10104    0.62
10105    0.47
10106    0.40
10107    0.74
Name: Burn Rate, Length: 9624, dtype: float64

# Split train test

In [61]:
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)

# RandomForestRegressor

In [62]:
n_estimators = 500
# param_grid = {
#     'n_estimators': [50, 100, 300],
#     'max_features': ['sqrt', 0.3, 0.5],
#     'max_depth': [5, 10, 20, None],
#     'min_samples_leaf': [1, 2, 4]
# }
param_grid = {
    'n_estimators': [500, 500],
    'max_features': [0.5],
    'max_depth': [10],
    'min_samples_leaf': [1]
}

In [63]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# 初始化模型
rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        criterion='absolute_error',
        random_state=42,
        n_jobs=-3
    )

grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-3,
    verbose=2
)

grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
# 使用訓練資料進行訓練
# rf_model.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
{'max_depth': 10, 'max_features': 0.5, 'min_samples_leaf': 1, 'n_estimators': 500}


In [64]:
best_mae = -grid_search.best_score_
print(best_mae)

0.048396588277090015


In [65]:
final_model = grid_search.best_estimator_

In [66]:
import joblib
joblib.dump(final_model, '../RFR/final_rf_model.joblib')
rf_model = joblib.load('../RFR/final_rf_model.joblib')

In [67]:
y_pred = rf_model.predict(X_test)

In [68]:
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

In [69]:
mae = mean_absolute_error(y_pred=y_pred, y_true=y_test)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2) Score: {r2}")

Mean Absolute Error (MAE): 0.04763369350649351
R-squared (R2) Score: 0.9020207441775061


# test data

In [71]:
test_df = pd.read_csv('../KNN_Imputation/test_imputed.csv')

In [72]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1980 entries, 0 to 1979
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           1980 non-null   int64  
 1   Date of Joining       1980 non-null   object 
 2   Gender                1980 non-null   object 
 3   Company Type          1980 non-null   object 
 4   WFH Setup Available   1980 non-null   object 
 5   Designation           1980 non-null   int64  
 6   Resource Allocation   1980 non-null   int64  
 7   Mental Fatigue Score  1980 non-null   float64
dtypes: float64(1), int64(3), object(4)
memory usage: 123.9+ KB


In [73]:
test_df['Resource Allocation'] = test_df['Resource Allocation'].fillna(df_X['Resource Allocation'].mode()[0])
test_df['Mental Fatigue Score'] = test_df['Mental Fatigue Score'].fillna(df_X['Mental Fatigue Score'].mean())
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1980 entries, 0 to 1979
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           1980 non-null   int64  
 1   Date of Joining       1980 non-null   object 
 2   Gender                1980 non-null   object 
 3   Company Type          1980 non-null   object 
 4   WFH Setup Available   1980 non-null   object 
 5   Designation           1980 non-null   int64  
 6   Resource Allocation   1980 non-null   int64  
 7   Mental Fatigue Score  1980 non-null   float64
dtypes: float64(1), int64(3), object(4)
memory usage: 123.9+ KB


In [74]:
# datetime
test_df['Date of Joining'] = pd.to_datetime(test_df['Date of Joining'])

reference_date = pd.to_datetime(pd.read_csv('../train.csv')['Date of Joining']).min()
test_df['Days_with_company'] = (test_df['Date of Joining'] - reference_date).dt.days

In [75]:
df_encoded = pd.get_dummies(test_df, columns=['Gender', 'Company Type', 'WFH Setup Available'], drop_first=True)

df_X = df_encoded[[
        'Designation', 'Resource Allocation',
        'Mental Fatigue Score', 'Days_with_company', 'Gender_Male',
        'Company Type_Service', 'WFH Setup Available_Yes'
    ]]

In [76]:
y_pred = rf_model.predict(df_X)

In [77]:
submission_df = pd.DataFrame({
    'Employee ID': test_df['Employee ID'],
    'Burn Rate': y_pred
})

In [78]:
submission_df

Unnamed: 0,Employee ID,Burn Rate
0,0,0.00581
1,1,0.72384
2,2,0.75365
3,3,0.39803
4,4,0.45976
...,...,...
1975,1975,0.18883
1976,1976,0.45302
1977,1977,0.34601
1978,1978,0.60039


In [79]:
submission_df.to_csv('random_forest_regressor_rough_Gridsearch_KNNimputed_v0.2.csv', index=False)