In [34]:
import numpy as np
import pandas as pd
import itertools
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_log_error

In [5]:
# Load data
train_df = pd.read_csv('playground-series-s5e5/train.csv')
test_df = pd.read_csv('playground-series-s5e5/test.csv')
train_df

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
...,...,...,...,...,...,...,...,...,...
749995,749995,male,28,193.0,97.0,30.0,114.0,40.9,230.0
749996,749996,female,64,165.0,63.0,18.0,92.0,40.5,96.0
749997,749997,male,60,162.0,67.0,29.0,113.0,40.9,221.0
749998,749998,male,45,182.0,91.0,17.0,102.0,40.3,109.0


In [35]:
def feature_engineering(df):
    df = df.copy()

    # Drop ID if present
    if 'id' in df.columns:
        df.drop(columns=['id'], inplace=True)

    # Encode 'Sex'
    df['Sex'] = df['Sex'].map({'female': 1, 'male': 2})

    # Age + Sex combined feature
    df['AgeSex'] = df['Age'].astype(str) + df['Sex'].astype(str)
    df['AgeSex'] = LabelEncoder().fit_transform(df['AgeSex']) + 1

    # BMI and per-minute metrics (no Calories)
    df['BMI'] = df['Weight'] / ((df['Height'] / 100) ** 2)
    df['HR_per_min'] = df['Heart_Rate'] / df['Duration']
    df['Temp_per_min'] = df['Body_Temp'] / df['Duration']

    # Age bucket (then one-hot encode)
    df['age_bucket'] = pd.cut(df['Age'], bins=[0, 20, 30, 40, 50, 60, 70, 80, 100],
                              labels=False, include_lowest=True)
    df = pd.get_dummies(df, columns=['age_bucket'], prefix='age_bin')

    # Log-transformed features
    for col in ['Weight', 'Height', 'Duration', 'Heart_Rate', 'BMI']:
        df[f'log_{col}'] = np.log1p(df[col])

    # Normalize core features (excluding categorical or engineered ones)
    features_to_normalize = ['Weight', 'Height', 'Body_Temp', 'Heart_Rate', 'Duration', 'Age', 'BMI']
    scaler = StandardScaler()
    df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])

    # Interaction features
    interaction_features = ['Weight', 'Height', 'Body_Temp', 'Heart_Rate', 'Duration', 'Age', 'Sex', 'AgeSex', 'BMI']
    for f1, f2 in itertools.combinations(interaction_features, 2):
        df[f'{f1}*{f2}'] = df[f1] * df[f2]
        df[f'{f1}/{f2}'] = df[f1] / (df[f2] + 1e-5)
        df[f'{f1}**{f2}'] = df[f1] * (df[f2] ** 2)
        df[f'{f2}**{f1}'] = df[f2] * (df[f1] ** 2)

    # Categorical encoding
    for col in ['Sex', 'Age', 'AgeSex']:
        df['CAT_' + col] = df[col].astype('category')

    # Replace inf and NaNs
    df.replace([np.inf, -np.inf], 0, inplace=True)
    for col in df.select_dtypes(exclude=['category']).columns:
        df[col].fillna(0, inplace=True)

    return df

In [36]:
train = feature_engineering(train_df)
test = feature_engineering(test_df)

  df[f'{f2}**{f1}'] = df[f2] * (df[f1] ** 2)
  df[f'{f1}*{f2}'] = df[f1] * df[f2]
  df[f'{f1}/{f2}'] = df[f1] / (df[f2] + 1e-5)
  df[f'{f1}**{f2}'] = df[f1] * (df[f2] ** 2)
  df[f'{f2}**{f1}'] = df[f2] * (df[f1] ** 2)
  df[f'{f1}*{f2}'] = df[f1] * df[f2]
  df[f'{f1}/{f2}'] = df[f1] / (df[f2] + 1e-5)
  df[f'{f1}**{f2}'] = df[f1] * (df[f2] ** 2)
  df[f'{f2}**{f1}'] = df[f2] * (df[f1] ** 2)
  df[f'{f1}*{f2}'] = df[f1] * df[f2]
  df[f'{f1}/{f2}'] = df[f1] / (df[f2] + 1e-5)
  df[f'{f1}**{f2}'] = df[f1] * (df[f2] ** 2)
  df[f'{f2}**{f1}'] = df[f2] * (df[f1] ** 2)
  df[f'{f1}*{f2}'] = df[f1] * df[f2]
  df[f'{f1}/{f2}'] = df[f1] / (df[f2] + 1e-5)
  df[f'{f1}**{f2}'] = df[f1] * (df[f2] ** 2)
  df[f'{f2}**{f1}'] = df[f2] * (df[f1] ** 2)
  df[f'{f1}*{f2}'] = df[f1] * df[f2]
  df[f'{f1}/{f2}'] = df[f1] / (df[f2] + 1e-5)
  df[f'{f1}**{f2}'] = df[f1] * (df[f2] ** 2)
  df[f'{f2}**{f1}'] = df[f2] * (df[f1] ** 2)
  df[f'{f1}*{f2}'] = df[f1] * df[f2]
  df[f'{f1}/{f2}'] = df[f1] / (df[f2] + 1e-5)
  df[f'

In [37]:
train

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories,AgeSex,BMI,...,Sex/BMI,Sex**BMI,BMI**Sex,AgeSex*BMI,AgeSex/BMI,AgeSex**BMI,BMI**AgeSex,CAT_Sex,CAT_Age,CAT_AgeSex
0,2,-0.357192,1.115235,0.490201,1.266324,0.583714,1.235772,150.0,34,-0.939009,...,-2.129927,1.763476,-3.756036,-31.926310,-36.208767,29.979095,-1085.494526,2,-0.357192,34
1,1,1.487943,-0.912137,-1.083172,-0.888309,-1.109436,-0.431163,34.0,89,-1.185799,...,-0.843321,1.406118,-1.185799,-105.536075,-75.055537,125.144531,-9392.710713,1,1.487943,89
2,1,0.631273,-1.068088,-0.797104,-1.008011,-1.215258,-0.302938,29.0,63,0.208818,...,4.788633,0.043605,0.208818,13.155526,301.683861,2.747109,828.798151,1,0.631273,63
3,2,-1.411555,1.349162,1.062337,1.146622,1.007002,0.851095,140.0,2,0.025968,...,76.988844,0.001349,0.103871,0.051936,76.988844,0.001349,0.103871,2,-1.411555,2
4,1,-0.225397,-0.678209,-1.011655,1.146622,0.689536,0.722869,146.0,37,-1.480887,...,-0.675276,2.193026,-1.480887,-54.792812,-24.985198,81.141953,-2027.334048,1,-0.225397,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,2,-0.884374,1.427138,1.562956,1.745131,1.959399,1.107547,230.0,18,1.102455,...,1.814116,2.430815,4.409821,19.844194,16.327046,21.877336,357.195494,2,-0.884374,18
749996,1,1.487943,-0.756185,-0.868621,0.308709,-0.368683,0.594643,96.0,89,-0.816723,...,-1.224420,0.667037,-0.816723,-72.688379,-108.973361,59.366297,-6469.265731,1,1.487943,89
749997,2,1.224352,-0.990113,-0.582554,1.625430,1.853577,1.107547,221.0,82,0.764124,...,2.617341,1.167772,3.056497,62.658186,107.310991,47.878637,5137.971212,2,1.224352,82
749998,2,0.235887,0.569404,1.133854,0.189007,0.689536,0.338192,109.0,52,2.049687,...,0.975754,8.402434,8.198748,106.583728,25.369602,218.463291,5542.353869,2,0.235887,52


In [38]:
test

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,AgeSex,BMI,HR_per_min,...,Sex/BMI,Sex**BMI,BMI**Sex,AgeSex*BMI,AgeSex/BMI,AgeSex**BMI,BMI**AgeSex,CAT_Sex,CAT_Age,CAT_AgeSex
0,2,0.233733,0.177381,0.418634,-1.007942,-0.897244,-0.303287,52,0.983561,12.428571,...,2.033408,1.934783,3.934242,51.145148,52.868603,50.304349,2659.547694,2,0.233733,52
1,2,-1.018101,1.971171,1.563168,0.549109,0.584215,0.595940,14,-0.077877,5.050000,...,-25.684873,0.012130,-0.311507,-1.090276,-179.794110,0.084907,-15.263862,2,-1.018101,14
2,1,-0.820443,1.035280,0.704768,0.070016,0.690034,0.467479,19,-0.210607,6.375000,...,-4.748403,0.044355,-0.210607,-4.001535,-90.219665,0.842752,-76.029164,1,-0.820443,19
3,1,-0.161583,-0.212574,-0.153633,0.549109,1.219126,0.724402,39,0.203583,5.350000,...,4.911755,0.041446,0.203583,7.939745,191.558463,1.616399,309.650043,1,-0.161583,39
4,1,-0.754557,-0.134583,-0.582834,0.070016,-0.156514,0.595940,21,-1.310668,5.875000,...,-0.762975,1.717851,-1.310668,-27.524033,-16.022484,36.074875,-578.004683,1,-0.754557,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,1,0.958478,-1.226455,-0.940500,-1.127716,-1.108881,-0.817131,73,0.103598,14.166667,...,9.651778,0.010733,0.103598,7.562644,704.579773,0.783474,552.072977,1,0.958478,73
249996,2,-0.622785,2.127152,1.849302,-1.487035,-1.214700,-2.101742,26,0.254501,28.000000,...,7.858218,0.129541,1.018003,6.617016,102.156834,1.684035,172.042424,2,-0.622785,26
249997,1,-0.688671,-0.836501,-0.797434,-0.169530,0.266760,0.082096,23,-0.378612,7.000000,...,-2.641294,0.143347,-0.378612,-8.708083,-60.749762,3.296987,-200.285910,1,-0.688671,23
249998,1,1.353794,-1.304446,-1.012034,1.147974,1.113308,0.852863,85,0.044620,4.240000,...,22.406209,0.001991,0.044620,3.792741,1904.527739,0.169234,322.383017,1,1.353794,85


In [39]:
# Separate features and target
X = train.drop(columns=['Calories'])
y = train['Calories']

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [62]:
# Identify categorical columns
cat_cols = X_train.select_dtypes(include='category').columns

# Ordinal encode
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
X_val[cat_cols] = encoder.transform(X_val[cat_cols])

# Log-transform the target for RMSLE compatibility
y_train_log = np.log1p(y_train)

# Create DMatrix with log-transformed labels
dtrain = xgb.DMatrix(X_train, label=y_train_log, enable_categorical=True)

params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'hist',
    'learning_rate': 0.03,
    'max_depth': 7,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.5,
    'reg_lambda': 1.0,
    'seed': 42,
    'verbosity': 1
}

cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=10000,
    nfold=5,
    early_stopping_rounds=50,
    metrics='rmse',
    seed=42,
    verbose_eval=50,
    as_pandas=True,
    stratified=False
)

print(cv_results.tail())

best_num_boost_rounds = len(cv_results)

# Train final model using best number of boosting rounds
model = xgb.train(
    params,
    dtrain,
    num_boost_round=best_num_boost_rounds,
    verbose_eval=50
)

# Predict on validation set
dval = xgb.DMatrix(X_val, enable_categorical=True)
y_val_pred_log = model.predict(dval)
y_val_pred = np.expm1(y_val_pred_log)

rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
print(f"RMSLE on validation data: {rmsle_val:.4f}")

[0]	train-rmse:0.93483+0.00032	test-rmse:0.93484+0.00125
[50]	train-rmse:0.21659+0.00009	test-rmse:0.21696+0.00028
[100]	train-rmse:0.07637+0.00015	test-rmse:0.07804+0.00085
[150]	train-rmse:0.05925+0.00014	test-rmse:0.06207+0.00117
[200]	train-rmse:0.05713+0.00012	test-rmse:0.06063+0.00122
[250]	train-rmse:0.05635+0.00009	test-rmse:0.06037+0.00122
[300]	train-rmse:0.05582+0.00008	test-rmse:0.06027+0.00121
[350]	train-rmse:0.05535+0.00008	test-rmse:0.06022+0.00120
[400]	train-rmse:0.05490+0.00008	test-rmse:0.06017+0.00120
[450]	train-rmse:0.05446+0.00006	test-rmse:0.06014+0.00120
[500]	train-rmse:0.05404+0.00005	test-rmse:0.06013+0.00120
[550]	train-rmse:0.05364+0.00005	test-rmse:0.06011+0.00119
[600]	train-rmse:0.05326+0.00004	test-rmse:0.06011+0.00119
[650]	train-rmse:0.05289+0.00004	test-rmse:0.06011+0.00119
[700]	train-rmse:0.05254+0.00005	test-rmse:0.06011+0.00119
[705]	train-rmse:0.05251+0.00005	test-rmse:0.06011+0.00119
     train-rmse-mean  train-rmse-std  test-rmse-mean  test-

In [63]:
import lightgbm as lgb

# Categorical column indices for LightGBM
cat_cols_idx = [X_train.columns.get_loc(col) for col in cat_cols]

# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train_log, categorical_feature=cat_cols_idx)
val_data = lgb.Dataset(X_val, label=np.log1p(y_val), categorical_feature=cat_cols_idx, reference=train_data)

params_lgb = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.03,
    'num_leaves': 31,
    'max_depth': 7,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.5,
    'lambda_l2': 1.0,
    'verbose': -1,
    'seed': 42
}

# Use callback for early stopping instead of early_stopping_rounds parameter
callbacks = [lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=50)]

lgb_model = lgb.train(
    params_lgb,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=10000,
    callbacks=callbacks
)

# Predict on validation, inverse log transform
y_val_pred_log = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
y_val_pred = np.expm1(y_val_pred_log)

rmsle_val = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
print(f"LightGBM RMSLE on validation data: {rmsle_val:.4f}")

Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.230229	valid_1's rmse: 0.22994
[100]	training's rmse: 0.0853315	valid_1's rmse: 0.0850399
[150]	training's rmse: 0.0653953	valid_1's rmse: 0.0654262
[200]	training's rmse: 0.0622809	valid_1's rmse: 0.0626621
[250]	training's rmse: 0.0612525	valid_1's rmse: 0.0619335
[300]	training's rmse: 0.0606237	valid_1's rmse: 0.061567
[350]	training's rmse: 0.0600859	valid_1's rmse: 0.0612629
[400]	training's rmse: 0.0596821	valid_1's rmse: 0.0610917
[450]	training's rmse: 0.0593072	valid_1's rmse: 0.0609282
[500]	training's rmse: 0.0589639	valid_1's rmse: 0.0608073
[550]	training's rmse: 0.0586268	valid_1's rmse: 0.060665
[600]	training's rmse: 0.0583459	valid_1's rmse: 0.0605668
[650]	training's rmse: 0.0581072	valid_1's rmse: 0.0604947
[700]	training's rmse: 0.057867	valid_1's rmse: 0.0604185
[750]	training's rmse: 0.0576477	valid_1's rmse: 0.0603819
[800]	training's rmse: 0.0574436	valid_1's rmse: 0.0603305
[8

In [64]:
# XGBoost predictions (log scale)
y_val_pred_xgb_log = model.predict(xgb.DMatrix(X_val))
y_val_pred_xgb = np.expm1(y_val_pred_xgb_log)

# LightGBM predictions (log scale)
y_val_pred_lgb_log = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
y_val_pred_lgb = np.expm1(y_val_pred_lgb_log)

In [65]:
# Simple average ensemble
y_val_pred_ensemble = (y_val_pred_xgb + y_val_pred_lgb) / 2

rmsle_ensemble = np.sqrt(mean_squared_log_error(y_val, y_val_pred_ensemble))
print(f"Ensembled RMSLE on validation data: {rmsle_ensemble:.4f}")

Ensembled RMSLE on validation data: 0.0599


In [69]:
# Ordinal encode categorical columns
test[cat_cols] = encoder.transform(test[cat_cols])

# Convert category columns to int to avoid XGBoost dtype error
test[cat_cols] = test[cat_cols].astype(int)

# Ensure all columns are numeric
for col in test.columns:
    if test[col].dtype.name == 'category':
        test[col] = test[col].astype(int)

# Predict with XGBoost
y_test_pred_xgb_log = model.predict(xgb.DMatrix(test))
y_test_pred_xgb = np.expm1(y_test_pred_xgb_log)

# Predict with LightGBM (it can handle categorical features passed as int)
y_test_pred_lgb_log = lgb_model.predict(test, num_iteration=lgb_model.best_iteration)
y_test_pred_lgb = np.expm1(y_test_pred_lgb_log)

# Simple average ensemble
y_test_pred_ensemble = (y_test_pred_xgb + y_test_pred_lgb) / 2

# Ensure predictions are non-negative
y_test_pred_ensemble = np.maximum(y_test_pred_ensemble, 0)

# Save submission
submission_df = pd.DataFrame({
    'id': test_df['id'],
    'Calories_Burned': y_test_pred_ensemble
})

submission_df

Unnamed: 0,id,Calories_Burned
0,750000,28.090729
1,750001,109.709420
2,750002,88.053038
3,750003,126.778894
4,750004,76.186421
...,...,...
249995,999995,25.058289
249996,999996,8.228186
249997,999997,73.765053
249998,999998,169.255554


In [70]:
submission_df.to_csv('submission1.csv', index=False)
print("✅ submission1.csv saved successfully.")

✅ submission1.csv saved successfully.
