In [10]:
import pandas as pd
import numpy as np
import time
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import logging
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from sklearn.preprocessing import PowerTransformer


# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

warnings.filterwarnings("ignore")

In [None]:
def load_data():
    train = pd.read_csv("/kaggle/input/playground-series-s5e5/train.csv")
    test = pd.read_csv("/kaggle/input/playground-series-s5e5/test.csv")
    submission = pd.read_csv("/kaggle/input/playground-series-s5e5/sample_submission.csv")
    return train, test, submission

In [None]:
def preprocess_data(train, test):
    train['Sex'] = train['Sex'].map({'male': 1, 'female': 0})
    test['Sex'] = test['Sex'].map({'male': 1, 'female': 0})
    train = train.drop_duplicates(subset=train.columns).reset_index(drop=True)
    train = train.groupby(['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp'])['Calories'].min().reset_index()
    return train, test

In [5]:
numerical_features = [
    "Age",
    "Height",
    "Weight",
    "Duration",
    "Heart_Rate",
    "Body_Temp",
    "Calories",
    "BMR",
    'Metabolic_Efficiency',
    'Cardio_Stress',
    'Thermic_Effect',
    'Power_Output',
    'BVI',
    'Age_Adj_Intensity',
    'Gender_Metabolic',
    'HR_Drift',
    'BCI',
    'Thermal_Work',
    'Temp_Binary',
    'HeartRate_Binary',
    'Sex'
]
test_df=pd.read_csv("/kaggle/input/playground-series-s5e5/test.csv")

#### 1. Feature Engineering : All Derived Columns

In [None]:
def add_features(df, train):
    df['BMI'] = df['Weight'] / (df['Height'] / 100) ** 2
    df['Intensity'] = df['Heart_Rate'] / df['Duration']

    df['Sex_Reversed'] = 1 - df['Sex']
    for dur in df['Duration'].unique():
        df[f'HR_Dur_{int(dur)}'] = np.where(df['Duration'] == dur, df['Heart_Rate'], 0)
        df[f'Temp_Dur_{int(dur)}'] = np.where(df['Duration'] == dur, df['Body_Temp'], 0)
    for age in df['Age'].unique():
        df[f'HR_Age_{int(age)}'] = np.where(df['Age'] == age, df['Heart_Rate'], 0)
        df[f'Temp_Age_{int(age)}'] = np.where(df['Age'] == age, df['Body_Temp'], 0)

    for f1 in ['Duration', 'Heart_Rate', 'Body_Temp']:
        for f2 in ['Sex', 'Sex_Reversed']:
            df[f'{f1}_x_{f2}'] = df[f1] * df[f2]

    for col in ['Height', 'Weight', 'Heart_Rate', 'Body_Temp']:
        for agg in ['min', 'max']:
            agg_val = train.groupby('Sex')[col].agg(agg).rename(f'Sex_{col}_{agg}')
            df = df.merge(agg_val, on='Sex', how='left')

    df.drop(columns=['Sex_Reversed'], inplace=True)
    return df

In [6]:
# 0. Basal Metabolic Index (BMR)
train_df['BMR'] = train_df['Weight'] / ((train_df['Height'] / 100) ** 2)
test_df['BMR'] = test_df['Weight'] / ((test_df['Height'] / 100) ** 2)

# 1. Metabolic Efficiency Index (Combines BMR with active calorie burn factors)
train_df['Metabolic_Efficiency'] = train_df['BMR'] * (train_df['Heart_Rate'] / train_df['BMR'].median())
test_df['Metabolic_Efficiency'] = test_df['BMR'] * (test_df['Heart_Rate'] / test_df['BMR'].median())

# 2. Cardiovascular Stress (Heart rate relative to age-adjusted max)
train_df['Cardio_Stress'] = (train_df['Heart_Rate'] / (220 - train_df['Age'])) * train_df['Duration']
test_df['Cardio_Stress'] = (test_df['Heart_Rate'] / (220 - test_df['Age'])) * test_df['Duration']

# 3. Thermic Effect Ratio (Body temp impact per weight unit)
train_df['Thermic_Effect'] = (train_df['Body_Temp'] * 100) / (train_df['Weight'] ** 0.5)
test_df['Thermic_Effect'] = (test_df['Body_Temp'] * 100) / (test_df['Weight'] ** 0.5)

# 4. Power Output Estimate (Weight-based energy expenditure)
train_df['Power_Output'] = train_df['Weight'] * train_df['Duration'] * (train_df['Heart_Rate'] / 1000)
test_df['Power_Output'] = test_df['Weight'] * test_df['Duration'] * (test_df['Heart_Rate'] / 1000)

# 5. Body Volume Index (Alternative to BMI using cube root)
train_df['BVI'] = train_df['Weight'] / ((train_df['Height']/100) ** 3)
test_df['BVI'] = test_df['Weight'] / ((test_df['Height']/100) ** 3)

# 6. Age-Adjusted Intensity (Duration scaled by life stage)
bins = [18, 25, 35, 45, 55, 65, 100]
train_df['Age_Adj_Intensity'] = train_df['Duration'] * pd.cut(train_df['Age'], bins).cat.codes
test_df['Age_Adj_Intensity'] = test_df['Duration'] * pd.cut(test_df['Age'], bins).cat.codes

# 7. Gender-Specific Metabolic Rate (Differentiated energy coefficients)
gender_coeff = {'male': 1.67, 'female': 1.55}  # Based on metabolic studies
train_df['Gender_Metabolic'] = train_df['Sex'].map(gender_coeff) * train_df['BMR']
test_df['Gender_Metabolic'] = test_df['Sex'].map(gender_coeff) * test_df['BMR']

# 8. Cardiovascular Drift (Heart rate change per time unit)
# Assuming data is sorted chronologically
train_df['HR_Drift'] = train_df.groupby('Age')['Heart_Rate'].diff() / train_df['Duration']
test_df['HR_Drift'] = test_df.groupby('Age')['Heart_Rate'].diff() / test_df['Duration']

# 9. Body Composition Index (Height-weight ratio with age decay)
train_df['BCI'] = (train_df['Weight'] * 1000) / (train_df['Height'] ** 1.5) * (1 / (train_df['Age'] ** 0.2))
test_df['BCI'] = (test_df['Weight'] * 1000) / (test_df['Height'] ** 1.5) * (1 / (test_df['Age'] ** 0.2))

# 10. Thermal Work Capacity (Combined temp and duration impact)
train_df['Thermal_Work'] = (train_df['Body_Temp'] ** 2) * np.log1p(train_df['Duration'])
test_df['Thermal_Work'] = (test_df['Body_Temp'] ** 2) * np.log1p(test_df['Duration'])

# Binary classification based on temperature for train_df and test_df
train_df['Temp_Binary'] = np.where(train_df['Body_Temp'] <= 39.5, 0, 1)
test_df['Temp_Binary'] = np.where(test_df['Body_Temp'] <= 39.5, 0, 1)

# Binary classification based on heart rate for train_df and test_df
train_df['HeartRate_Binary'] = np.where(train_df['Heart_Rate'] <= 99.5, 0, 1)
test_df['HeartRate_Binary'] = np.where(test_df['Heart_Rate'] <= 99.5, 0, 1)

# Derived sex column for train_df and test_df
train_df['Sex'] = train_df['Sex'].map({'male': 1, 'female': 0})
test_df['Sex'] = test_df['Sex'].map({'male': 1, 'female': 0})

In [7]:
# Step 1: Select numerical columns
numeric_cols = [col for col in numerical_features if col != "Calories"]

# Step 2: Calculate original skewness on train
original_skewness = train_df[numeric_cols].skew().sort_values(ascending=False)

# Step 3: Initialize transformed DataFrames
train_df_transformed = train_df.copy()
test_df_transformed = test_df.copy()

# Store transformers for each column
transformers = {}

# Step 4: Apply skewness correction based on train_df
for col in numeric_cols:
    if train_df[col].nunique() <= 1:
        continue
    
    if original_skewness[col] > 0.5:  # Right skew
        if (train_df[col] > 0).all():
            # Log transform
            train_df_transformed[col] = np.log1p(train_df[col])
            test_df_transformed[col] = np.log1p(test_df[col])
        else:
            # Yeo-Johnson (handles zero/neg)
            pt = PowerTransformer(method='yeo-johnson')
            train_df_transformed[col] = pt.fit_transform(train_df[[col]])
            test_df_transformed[col] = pt.transform(test_df[[col]])
            transformers[col] = pt
    elif original_skewness[col] < -0.5:  # Left skew
        pt = PowerTransformer(method='yeo-johnson')
        train_df_transformed[col] = pt.fit_transform(train_df[[col]])
        test_df_transformed[col] = pt.transform(test_df[[col]])
        transformers[col] = pt

# Step 5: Calculate skewness after transformation
transformed_skewness = train_df_transformed[numeric_cols].skew().sort_values(ascending=False)

# Step 6: Print comparison
skew_df = pd.DataFrame({
    'Original Skew': original_skewness,
    'Transformed Skew': transformed_skewness
}).sort_values(by='Original Skew', ascending=False)

print(skew_df)

                      Original Skew  Transformed Skew
Age_Adj_Intensity          1.300348         -0.162279
HeartRate_Binary           0.598154          0.598154
Power_Output               0.533623         -0.894505
Age                        0.436397          0.436397
BVI                        0.399564          0.399564
Cardio_Stress              0.330227          0.330227
Weight                     0.211194          0.211194
BCI                        0.196546          0.196546
Thermic_Effect             0.193205          0.193205
Metabolic_Efficiency       0.155355          0.155355
Height                     0.051777          0.051777
Gender_Metabolic           0.041921          0.041921
Duration                   0.026259          0.026259
Sex                        0.003845          0.003845
BMR                       -0.003291         -0.003291
Heart_Rate                -0.005668         -0.005668
Thermal_Work              -0.809934         -0.312075
Body_Temp                 -1

In [8]:
import pandas as pd
import numpy as np

# Make copies to avoid changing the original
cleaned_train_df =train_df_transformed.copy()
cleaned_test_df = test_df_transformed.copy()

# Select numerical columns only (excluding any non-numeric or non-relevant columns)
numeric_cols = [col for col in numerical_features if col != "Calories"]

# Remove outliers using IQR for both train_df and test_df
for col in numeric_cols:
    # Train set
    Q1_train = cleaned_train_df[col].quantile(0.25)
    Q3_train = cleaned_train_df[col].quantile(0.75)
    IQR_train = Q3_train - Q1_train
    lower_bound_train = Q1_train - 1.5 * IQR_train
    upper_bound_train = Q3_train + 1.5 * IQR_train
    cleaned_train_df = cleaned_train_df[(cleaned_train_df[col] >= lower_bound_train) & (cleaned_train_df[col] <= upper_bound_train)]


# Binary classification based on temperature for train_df and test_df
cleaned_test_df=test_df.copy()

  return op(a, b)
  return op(a, b)


In [None]:
def align_columns(train, test):
    common_cols = [col for col in test.columns if col in train.columns and col != 'Calories']
    X = train[common_cols]
    y = np.log1p(train['Calories'])
    X_test = test[common_cols]
    return X, y, X_test

In [None]:
def final_blend(xgb_preds, cat_preds, submission):
    final_preds = 0.49 * np.expm1(xgb_preds) + 0.51 * np.expm1(cat_preds)
    submission['Calories'] = np.clip(final_preds, 1, 314)
    submission.to_csv('submission.csv', index=False)
    logger.info("submission.csv saved")

In [None]:
def diagnostics(cat_oof, xgb_oof, submission):
    plt.hist(np.expm1(cat_oof), bins=50, alpha=0.6, label='CatBoost OOF')
    plt.hist(np.expm1(xgb_oof), bins=50, alpha=0.6, label='XGBoost OOF')
    plt.title("OOF Prediction Distribution")
    plt.xlabel("Calories")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()

    logger.info("\nFinal Submission Preview:")
    logger.info(submission.describe())
    logger.info(submission.head())

In [None]:
def main():
    train, test, submission = load_data()
    train, test = preprocess_data(train, test)
    train = add_features(train, train)
    test = add_features(test, train)
    X, y, X_test = align_columns(train, test)

    cat_preds, cat_oof = train_catboost(X, y, X_test)
    xgb_preds, xgb_oof = train_xgboost(X, y, X_test)

    final_blend(xgb_preds, cat_preds, submission)
    diagnostics(cat_oof, xgb_oof, submission)

if __name__ == "__main__":
    main()

In [9]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from tqdm import tqdm  # Import tqdm for progress tracking

X = cleaned_train_df.drop(columns=['Calories', 'id'])
y = cleaned_train_df['Calories']

# Step 2: Split data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize the XGBoost model
model = xgb.XGBRegressor(
    objective='reg:squarederror',  # Regression task
    colsample_bytree=0.3,          # Subsample ratio of columns
    learning_rate=0.1,             # Step size at each iteration
    max_depth=5,                   # Maximum depth of a tree
    alpha=10,                      # L2 regularization term
    n_estimators=1000,             # Number of trees
    random_state=42,
    verbose=200  # Set verbose to get more detailed output
)

# Step 4: Train the model with tqdm for progress tracking
for _ in tqdm(range(1), desc="Training Model", ncols=100):
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], 
              eval_metric="mae", early_stopping_rounds=50, verbose=200)

# Step 5: Prepare test data (remove 'id' column from test_df)
X_test_df = cleaned_test_df.drop(columns=['id'])  # Exclude id from features
y_test_pred = model.predict(X_test_df)  # Predictions for test_df

# Step 6: Prepare the submission file
submission = pd.DataFrame({
    'id': test_df['id'],  # 'id' column from test_df
    'Calories': y_test_pred.clip(0)   # Predictions for 'Calories'
})

# Step 7: Save the submission to a CSV file
submission.to_csv('submission_0.csv', index=False)
print("Submission file 'submission.csv' has been created.")

Training Model:   0%|                                                         | 0/1 [00:00<?, ?it/s]

[0]	validation_0-mae:42.93418


Parameters: { "verbose" } are not used.



[200]	validation_0-mae:2.83631
[400]	validation_0-mae:2.66417
[600]	validation_0-mae:2.60440
[800]	validation_0-mae:2.57861
[999]	validation_0-mae:2.56201


Training Model: 100%|█████████████████████████████████████████████████| 1/1 [00:37<00:00, 37.62s/it]


Submission file 'submission.csv' has been created.


In [None]:
def train_catboost(X, y, X_test):
    bins = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
    duration_bins = bins.fit_transform(X[['Duration']]).astype(int).flatten()

    cat_params = {
        'iterations': 2500,
        'learning_rate': 0.02,
        'depth': 10,
        'loss_function': 'RMSE',
        'l2_leaf_reg': 3,
        'random_seed': 42,
        'eval_metric': 'RMSE',
        'early_stopping_rounds': 200,
        'cat_features': ['Sex'],
        'verbose': 0,
        'task_type': 'GPU'
    }

    cat_preds = np.zeros(len(X_test))
    cat_oof = np.zeros(len(X))
    cat_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, duration_bins)):
        model = CatBoostRegressor(**cat_params)
        model.fit(X.iloc[train_idx], y.iloc[train_idx], eval_set=(X.iloc[val_idx], y.iloc[val_idx]), use_best_model=True)
        cat_oof[val_idx] = model.predict(X.iloc[val_idx])
        cat_preds += model.predict(X_test) / skf.n_splits
        fold_score = np.sqrt(mean_squared_log_error(np.expm1(y.iloc[val_idx]), np.expm1(cat_oof[val_idx])))
        logger.info(f"Fold {fold+1} - CatBoost RMSLE: {fold_score:.5f}")
        cat_scores.append(fold_score)
    logger.info(f"CatBoost Mean RMSLE: {np.mean(cat_scores):.5f}")
    return cat_preds, cat_oof

In [None]:
def train_xgboost(X, y, X_test):
    X_xgb = X.copy()
    X_test_xgb = X_test.copy()
    X_xgb['Sex'] = X_xgb['Sex'].astype(int)
    X_test_xgb['Sex'] = X_test_xgb['Sex'].astype(int)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    xgb_oof = np.zeros(len(X))
    xgb_preds = np.zeros(len(X_test))
    xgb_scores = []

    xgb_params = {
        'max_depth': 9,
        'colsample_bytree': 0.7,
        'subsample': 0.9,
        'n_estimators': 3000,
        'learning_rate': 0.01,
        'gamma': 0.01,
        'max_delta_step': 2,
        'eval_metric': 'rmse',
        'enable_categorical': False,
        'random_state': 42,
        'early_stopping_rounds': 100,
        'tree_method': 'gpu_hist'
    }

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_xgb)):
        model = XGBRegressor(**xgb_params)
        model.fit(X_xgb.iloc[train_idx], y.iloc[train_idx], eval_set=[(X_xgb.iloc[val_idx], y.iloc[val_idx])], verbose=False)
        xgb_oof[val_idx] = model.predict(X_xgb.iloc[val_idx])
        xgb_preds += model.predict(X_test_xgb) / kf.n_splits
        fold_score = np.sqrt(mean_squared_log_error(np.expm1(y.iloc[val_idx]), np.expm1(xgb_oof[val_idx])))
        logger.info(f"Fold {fold+1} - XGBoost RMSLE: {fold_score:.5f}")
        xgb_scores.append(fold_score)
    logger.info(f"XGBoost Mean RMSLE: {np.mean(xgb_scores):.5f}")
    return xgb_preds, xgb_oof

In [None]:
# # === Final Blend (Weighted Average) ===
# final_preds = 0.49 * np.expm1(xgb_preds) + 0.51 * np.expm1(cat_preds)
# submission['Calories'] = np.clip(final_preds, 1, 314)
# submission.to_csv('submission.csv', index=False)
# print("\n submission.csv saved ")

In [None]:
# # === Diagnostics ===
# plt.hist(np.expm1(cat_oof), bins=50, alpha=0.6, label='CatBoost OOF')
# plt.hist(np.expm1(xgb_oof), bins=50, alpha=0.6, label='XGBoost OOF')
# plt.title("OOF Prediction Distribution")
# plt.xlabel("Calories")
# plt.ylabel("Frequency")
# plt.legend()
# plt.show()

# print("\n Final Submission Preview:")
# print(submission.describe())
# print(submission.head())