##  Model Training 

In this notebook, I train two gradient-boosted models (CatBoost and XGBoost) to predict calorie burn, then stack and blend their predictions.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# 1. Read Data
# Load the training table, the test table and the submission template, in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ("train_with_te.csv", "test_with_te.csv", "sample_submission.csv")
)

# 2. Encode
def clean_object_columns(df, reference=None):
    """Label-encode every object-dtype column of ``df`` as integer codes.

    Args:
        df: DataFrame to encode. It is left untouched; a copy is returned.
        reference: Optional second DataFrame. For each object column of ``df``
            that also exists in ``reference``, the encoder is fitted on the
            string values of both frames, so a given category receives the
            same integer code whichever frame it is encoded in. When omitted,
            each column is fitted on ``df`` alone.

    Returns:
        A copy of ``df`` in which each object column has been replaced by its
        ``LabelEncoder`` codes (values are cast to ``str`` first, so missing
        values are encoded as their string form). Other columns are unchanged.
    """
    encoded = df.copy()
    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        values = df[col].astype(str)
        vocabulary = values
        if reference is not None and col in reference.columns:
            vocabulary = pd.concat(
                [values, reference[col].astype(str)], ignore_index=True
            )
        encoded[col] = LabelEncoder().fit(vocabulary).transform(values)
    return encoded


# Fitting one encoder per frame would let the same category map to different
# integers in train and test. Both right-hand calls run before either name is
# rebound (and the function copies), so each call sees the other frame's raw
# values.
train, test = (
    clean_object_columns(train, reference=test),
    clean_object_columns(test, reference=train),
)

# 3. Build the feature matrices and the target
drop_cols = ['Calories']
if 'id' in train.columns:
    drop_cols.append('id')

X = train.drop(columns=drop_cols)
# The models are trained on log1p(Calories).
y = np.log1p(train['Calories'])

# Select the test features by the training column list so both matrices have
# the same columns in the same order; this also drops 'id' and any other
# test-only column. Fail early if a training feature is absent from test.
missing_in_test = [col for col in X.columns if col not in test.columns]
if missing_in_test:
    raise ValueError(f"test data is missing training features: {missing_in_test}")
X_test = test[X.columns]



# 4. Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cat_preds = np.zeros(len(X_test))   # fold-averaged CatBoost test predictions (log1p space)
xgb_preds = np.zeros(len(X_test))   # fold-averaged XGBoost test predictions (log1p space)
oof_preds = np.zeros(len(X))        # 50/50 blended out-of-fold predictions (log1p space)

# 5. Start CV
for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    print(f"\n Fold {fold}")
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # --- CatBoost ---
    cat = CatBoostRegressor(
        iterations=3000,
        learning_rate=0.05,
        depth=8,
        loss_function='RMSE',
        random_seed=42,
        verbose=0
    )
    cat.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=50)
    oof_preds[val_idx] += cat.predict(X_val) * 0.5
    # Divide by the actual fold count rather than a hard-coded 5 so the
    # average stays correct if n_splits is changed.
    cat_preds += cat.predict(X_test) / kf.n_splits

    # --- XGBoost ---
    # early_stopping_rounds mirrors the CatBoost setting: the eval_set was
    # previously only used for logging, so all 3000 trees were always fitted.
    # It is passed to the constructor (xgboost >= 1.6 API).
    xgb = XGBRegressor(
        n_estimators=3000,
        learning_rate=0.05,
        max_depth=10,
        subsample=0.9,
        colsample_bytree=0.7,
        gamma=0.01,
        random_state=42,
        early_stopping_rounds=50,
        verbosity=0
    )
    # verbose=False silences the per-round "validation_0-rmse" lines, which
    # `verbosity=0` does not control.
    xgb.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    oof_preds[val_idx] += xgb.predict(X_val) * 0.5
    xgb_preds += xgb.predict(X_test) / kf.n_splits

    # Score the blended fold predictions back on the original Calories scale.
    fold_rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(oof_preds[val_idx])))
    print(f" Fold {fold} RMSLE: {fold_rmsle:.5f}")

# 6. Overall out-of-fold score, computed on the original (expm1) scale
oof_true = np.expm1(y)
oof_blend = np.expm1(oof_preds)
final_rmsle = np.sqrt(mean_squared_log_error(oof_true, oof_blend))
print(f"\n Final OOF RMSLE: {final_rmsle:.5f}")

# 7. Submission: equal-weight blend of the two models' test predictions
final_preds = 0.5 * cat_preds + 0.5 * xgb_preds
# Back-transform from log1p space, then bound the result to [1, 314]
# (presumably the plausible target range - TODO confirm).
submission['Calories'] = np.expm1(final_preds).clip(1, 314)
submission.to_csv('submission_Target.csv', index=False)
print("\n submission_Target.csv ")










✅ 確認所有欄位 dtype：
Body_Temp     float64
Temp_Level      int64
Duration      float64
Sex             int64
Weight        float64
Intensity       int64
Age             int64
Age_Group       int64
Heart_Rate    float64
BMI           float64
Height        float64
HRxTime       float64
dtype: object

🔁 Fold 1
[0]	validation_0-rmse:0.90093
[1]	validation_0-rmse:0.85691
[2]	validation_0-rmse:0.81455
[3]	validation_0-rmse:0.77428
[4]	validation_0-rmse:0.73612
[5]	validation_0-rmse:0.69980
[6]	validation_0-rmse:0.66531
[7]	validation_0-rmse:0.63288
[8]	validation_0-rmse:0.60228
[9]	validation_0-rmse:0.57277
[10]	validation_0-rmse:0.54501
[11]	validation_0-rmse:0.51833
[12]	validation_0-rmse:0.49340
[13]	validation_0-rmse:0.46926
[14]	validation_0-rmse:0.44646
[15]	validation_0-rmse:0.42482
[16]	validation_0-rmse:0.40461
[17]	validation_0-rmse:0.38542
[18]	validation_0-rmse:0.36677
[19]	validation_0-rmse:0.34919
[20]	validation_0-rmse:0.33241
[21]	validation_0-rmse:0.31649
[22]	validation_0-rmse:

# Step 2 Stacking

In [None]:
# Level-1 predictions used as features for the stacking meta-model.
# NOTE(review): this refits CatBoost and XGBoost on the same `kf` folds that
# the blending cell above already trains on; caching per-model out-of-fold
# predictions there would avoid the second round of training.

# Plain arrays are filled by position: kf.split yields positional indices,
# which only coincide with DataFrame labels when the index is a default
# RangeIndex, so label-based `.loc[val_idx, ...]` assignment is avoided.
cat_oof = np.zeros(len(X))
xgb_oof = np.zeros(len(X))
cat_test = np.zeros(len(X_test))
xgb_test = np.zeros(len(X_test))

# KFold loop: out-of-fold predictions for train, fold-averaged predictions for test
for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train CatBoost
    cat = CatBoostRegressor(
        iterations=3000,
        learning_rate=0.05,
        depth=8,
        loss_function='RMSE',
        random_seed=42,
        verbose=0
    )
    cat.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=50)
    cat_oof[val_idx] = cat.predict(X_val)
    cat_test += cat.predict(X_test) / kf.n_splits

    # Train XGBoost
    # early_stopping_rounds mirrors the CatBoost setting: the eval_set was
    # previously only used for logging, so all 3000 trees were always fitted.
    # It is passed to the constructor (xgboost >= 1.6 API).
    xgb = XGBRegressor(
        n_estimators=3000,
        learning_rate=0.05,
        max_depth=10,
        subsample=0.9,
        colsample_bytree=0.7,
        gamma=0.01,
        random_state=42,
        early_stopping_rounds=50,
        verbosity=0
    )
    # verbose=False silences the per-round "validation_0-rmse" lines, which
    # `verbosity=0` does not control.
    xgb.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    xgb_oof[val_idx] = xgb.predict(X_val)
    xgb_test += xgb.predict(X_test) / kf.n_splits

# `y` is a Series, so stacking_train takes its index; the prediction arrays
# are aligned to it by position.
stacking_train = pd.DataFrame({
    'CatBoost_Pred': cat_oof,
    'XGBoost_Pred': xgb_oof,
    'Target': y
})

stacking_test = pd.DataFrame({
    'CatBoost_Pred': cat_test,
    'XGBoost_Pred': xgb_test
})

print("✅ Stacking Done")




[0]	validation_0-rmse:0.90093
[1]	validation_0-rmse:0.85691
[2]	validation_0-rmse:0.81455
[3]	validation_0-rmse:0.77428
[4]	validation_0-rmse:0.73612
[5]	validation_0-rmse:0.69980
[6]	validation_0-rmse:0.66531
[7]	validation_0-rmse:0.63288
[8]	validation_0-rmse:0.60228
[9]	validation_0-rmse:0.57277
[10]	validation_0-rmse:0.54501
[11]	validation_0-rmse:0.51833
[12]	validation_0-rmse:0.49340
[13]	validation_0-rmse:0.46926
[14]	validation_0-rmse:0.44646
[15]	validation_0-rmse:0.42482
[16]	validation_0-rmse:0.40461
[17]	validation_0-rmse:0.38542
[18]	validation_0-rmse:0.36677
[19]	validation_0-rmse:0.34919
[20]	validation_0-rmse:0.33241
[21]	validation_0-rmse:0.31649
[22]	validation_0-rmse:0.30147
[23]	validation_0-rmse:0.28716
[24]	validation_0-rmse:0.27373
[25]	validation_0-rmse:0.26091
[26]	validation_0-rmse:0.24896
[27]	validation_0-rmse:0.23766
[28]	validation_0-rmse:0.22675
[29]	validation_0-rmse:0.21641
[30]	validation_0-rmse:0.20673
[31]	validation_0-rmse:0.19748
[32]	validation_0-

# Step 3 Model Ensembling

In [None]:
from lightgbm import LGBMRegressor

# 1. Level-2 training data: the base models' predictions are the only features
meta_features = ['CatBoost_Pred', 'XGBoost_Pred']
X_stack = stacking_train[meta_features]
y_stack = stacking_train['Target']
X_stack_test = stacking_test[meta_features]

# 2. Fit the LightGBM meta-model on the out-of-fold predictions
meta_params = {
    'n_estimators': 1000,
    'learning_rate': 0.03,
    'max_depth': 4,
    'random_state': 42,
}
meta_model = LGBMRegressor(**meta_params)
meta_model.fit(X_stack, y_stack)

# 3. Predict on the test-set stack, undo the log1p transform, and bound the
#    result to [1, 314] (presumably the plausible target range - TODO confirm)
stack_preds = meta_model.predict(X_stack_test)
submission['Calories'] = np.expm1(stack_preds).clip(1, 314)

# 4. Write the stacked submission
submission.to_csv("submission_level2_stack.csv", index=False)
print(" Model Ensembling！submission_level2_stack.csv 已儲存")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 746234, number of used features: 2
[LightGBM] [Info] Start training from score 4.153119
✅ 融合模型完成！submission_level2_stack.csv 已儲存


# Model Ensemble

In [None]:
import pandas as pd
import numpy as np

# === Previously saved submissions to blend ===
submission_files = [
    "ensemble_submission_log_0.05678.csv",
    "submission_mean_V4_0.05676.csv",
    "ensemble_submission_3model_361_0.05678.csv",
    "submission_mean_V2_005677.csv",
    "ensemble_submission_log_60f_0.05679.csv",
    "ensemble_submission0.05679.csv",
    "submission_mean_V50.05675.csv",
    "ensemble_submission_3model_451_0.05678.csv",
    "submission_mean_V3_0.05675.csv",
    "ensemble_submission_3model_35605_0.05678.csv",
]

# === sample_submission: template that defines the expected rows ===
sample = pd.read_csv("sample_submission.csv")

# === Load every submission and verify it lines up with the template ===
# The predictions are combined row by row, so a file with a different length
# or a different id order would silently blend values from different samples.
submission_frames = []
for path in submission_files:
    frame = pd.read_csv(path)
    if len(frame) != len(sample):
        raise ValueError(
            f"{path} has {len(frame)} rows, expected {len(sample)}"
        )
    if (
        'id' in frame.columns
        and 'id' in sample.columns
        and not np.array_equal(frame['id'].to_numpy(), sample['id'].to_numpy())
    ):
        raise ValueError(f"{path} ids do not match sample_submission.csv")
    submission_frames.append(frame)

# Numbered names kept so any code that still refers to df1..df10 keeps working.
df1, df2, df3, df4, df5, df6, df7, df8, df9, df10 = submission_frames

# === Stack all submissions' Calories: one row per sample, one column per file ===
all_preds = np.stack(
    [frame['Calories'].to_numpy() for frame in submission_frames], axis=1
)

# === median ===
sample['Calories'] = np.median(all_preds, axis=1)
sample.to_csv("submission_median.csv", index=False)

# === mean ===
sample['Calories'] = np.mean(all_preds, axis=1)
sample.to_csv("submission_mean_V6.csv", index=False)