# Imports

In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
import sys
from pathlib import Path

# Register the notebook's parent directory on the import path (at most once).
# Presumably this is what lets the lightGBM / MLP / RFR / XGBoost packages
# imported in the next cell resolve -- confirm against the repo layout.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

In [3]:
from lightGBM import lgbm_inference
from MLP import MLP_inference
from RFR import RFR_inference
from XGBoost import xgb_inference

# Holdout data

In [4]:
# Load the hold-out split that the stacking meta-model is later fitted on,
# then print the column / dtype / non-null summary to spot missing values.
holdout_path = '../KNN_Imputation/validation_imputed_stacking.csv'
holdout_df = pd.read_csv(holdout_path)
holdout_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2022 entries, 0 to 2021
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           2022 non-null   int64  
 1   Date of Joining       2022 non-null   object 
 2   Gender                2022 non-null   object 
 3   Company Type          2022 non-null   object 
 4   WFH Setup Available   2022 non-null   object 
 5   Designation           2022 non-null   int64  
 6   Resource Allocation   2022 non-null   int64  
 7   Mental Fatigue Score  2022 non-null   float64
 8   Burn Rate             1922 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 142.3+ KB


In [5]:
# Keep only the rows in which every column is populated. In the summary
# printed by the previous cell 'Burn Rate' is the only column with nulls
# (1922 of 2022 non-null), so this removes the rows that have no target.
rows_complete = holdout_df.notna().all(axis=1)
holdout_df = holdout_df.loc[rows_complete]
holdout_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1922 entries, 0 to 2021
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           1922 non-null   int64  
 1   Date of Joining       1922 non-null   object 
 2   Gender                1922 non-null   object 
 3   Company Type          1922 non-null   object 
 4   WFH Setup Available   1922 non-null   object 
 5   Designation           1922 non-null   int64  
 6   Resource Allocation   1922 non-null   int64  
 7   Mental Fatigue Score  1922 non-null   float64
 8   Burn Rate             1922 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 150.2+ KB


In [6]:
# Separate the hold-out frame into the label series and the remaining
# columns, which are what the base models receive.
target_col = 'Burn Rate'
y_holdout = holdout_df[target_col]
X_holdout = holdout_df.drop(columns=[target_col])

In [7]:
# Score the hold-out features with each base model. Every `inference` call
# receives the same X_holdout, and the calls run in the listed order:
# lgbm_inference, MLP_inference, RFR_inference, xgb_inference.
pred_lgbm, pred_mlp, pred_rf, pred_xgb = (
    base_model.inference(X_holdout)
    for base_model in (lgbm_inference, MLP_inference, RFR_inference, xgb_inference)
)

In [8]:
# Build the meta-model training inputs by column-stacking the base-model
# hold-out predictions in the order lgbm, mlp, rf, xgb (one column each if
# every prediction array is 1-D -- TODO confirm). Targets are the hold-out labels.
base_predictions = [pred_lgbm, pred_mlp, pred_rf, pred_xgb]
X_meta = np.column_stack(base_predictions)
y_meta = y_holdout

# Meta model

In [9]:
from sklearn.linear_model import Ridge

# Ridge regression serves as the stacking meta-learner. alpha=1.0 is
# scikit-learn's default regularisation strength, written out for visibility.
meta_model = Ridge(alpha=1.0)

In [10]:
# Fit the meta-model on the stacked hold-out predictions, then persist it to
# disk. The dump call stays as the last expression so the cell still displays
# the list of file names that joblib returns.
meta_model.fit(X=X_meta, y=y_meta)
joblib.dump(value=meta_model, filename='meta_model.joblib')

['meta_model.joblib']

# Test set predictions

In [11]:
# Read the test split that the final stacked predictions are produced for.
test_path = "../KNN_Imputation/test_imputed.csv"
test_df = pd.read_csv(test_path)

In [12]:
# Score the test frame with each base model, in the same order used for the
# hold-out predictions (lgbm, MLP, RFR, xgb).
# NOTE(review): unlike the hold-out path, no target column is dropped before
# inference here -- confirm test_df carries the same columns as X_holdout.
test_pred_lgbm, test_pred_mlp, test_pred_rf, test_pred_xgb = (
    base_model.inference(test_df)
    for base_model in (lgbm_inference, MLP_inference, RFR_inference, xgb_inference)
)

In [13]:
# Column-stack the base-model test predictions in the order lgbm, mlp, rf,
# xgb -- the same order used to build X_meta, which the meta-model was fitted on.
test_base_predictions = [test_pred_lgbm, test_pred_mlp, test_pred_rf, test_pred_xgb]
test_X_meta = np.column_stack(test_base_predictions)

In [14]:
# Blend the base-model test predictions into the final estimate using the
# fitted meta-model.
# NOTE(review): a linear meta-model does not constrain its output range; if
# the target is bounded, consider clipping before submission -- confirm.
predictions = meta_model.predict(X=test_X_meta)

In [16]:
# Pair each test row's 'Employee ID' with its stacked prediction, then show
# the resulting frame as the cell output.
submission_columns = {
    'Employee ID': test_df['Employee ID'],
    'Burn Rate': predictions,
}
submission_df = pd.DataFrame(submission_columns)
submission_df

Unnamed: 0,Employee ID,Burn Rate
0,0,0.002920
1,1,0.735057
2,2,0.773233
3,3,0.415981
4,4,0.464116
...,...,...
1975,1975,0.187367
1976,1976,0.449295
1977,1977,0.346039
1978,1978,0.569244


In [17]:
# Write the submission file next to the notebook; index=False keeps the
# DataFrame's row index out of the CSV.
submission_path = 'stacking_v0.csv'
submission_df.to_csv(submission_path, index=False)