<style>
    @keyframes textAnimation10 {
        0% { color: #FF0000; } /* Changed to #FF0000 */
        50% { color: #E74C3C; }
        100% { color: #9B59B6; }
    }

    @keyframes pulseGlow {
        0% {
            box-shadow: 0 0 15px rgba(255, 0, 0, 0.5); /* Changed to rgba(255, 0, 0, 0.5) */
        }
        50% {
            box-shadow: 0 0 30px rgba(255, 0, 0, 0.8); /* Changed to rgba(255, 0, 0, 0.8) */
        }
        100% {
            box-shadow: 0 0 15px rgba(255, 0, 0, 0.5); /* Changed to rgba(255, 0, 0, 0.5) */
        }
    }
</style>

<div style="
    border: 3px solid #FF0000; /* Changed to #FF0000 */
    border-radius: 25px; 
    padding: 25px; 
    box-shadow: 0 5px 10px rgba(0, 0, 0, 0.3); 
    background-color: #F9EBEA; 
    max-width: 550px; 
    margin: 15px auto; 
    animation: pulseGlow 2s infinite; /* Glow animation for the box */
">
    <p style="
        font-family: 'Garamond', serif; 
        font-size: 42px; /* Increased font size for emphasis */
        text-align: center; 
        color: #FF0000; /* Changed to #FF0000 */
        font-style: italic; 
        font-weight: bold; 
        animation: textAnimation10 5s infinite alternate;
        text-shadow: 1px 1px 3px rgba(0, 0, 0, 0.3); /* Soft shadow for text */
    "> 
        S5E5 🧭
    </p>
</div>


# Introduction

**Note:** This notebook focuses solely on **feature engineering** and **model evaluation**.  
For **Exploratory Data Analysis (EDA)**, please refer to [this notebook](https://www.kaggle.com/code/swandipsingha/s5e5-eda-fe-xgboost).



<span style="font-family:cursive; font-size:36px; padding:8px 16px; border-radius:10px; border: 2px solid #FF0000; font-weight:bold; text-shadow: 2px 2px 5px rgba(4, 2, 0, 0.5); color:#FF0000;"> 
    1 ✨ | 
    <span style="color:#FF0000; font-style: italic;">Load Data</span>
</span>


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Read the competition files in one pass: training set, test set and the
# sample-submission template (in that order).
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e5/train.csv",
        "/kaggle/input/playground-series-s5e5/test.csv",
        "/kaggle/input/playground-series-s5e5/sample_submission.csv",
    )
)

In [None]:
# Columns that the feature-engineering cell combines pairwise (products, sums,
# differences, ratios), aggregates row-wise, and passes to PolynomialFeatures.
numerical_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

<span style="font-family:cursive; font-size:36px; padding:8px 16px; border-radius:10px; border: 2px solid #FF0000; font-weight:bold; text-shadow: 2px 2px 5px rgba(4, 2, 0, 0.5); color:#FF0000;"> 
    2 ✨ | 
    <span style="color:#FF0000; font-style: italic;">Feature engineering</span>
</span>


In [None]:
import pandas as pd
import numpy as np
import itertools
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler

def add_feature_cross_terms(df, features):
    """Return a copy of ``df`` extended with one product column per feature pair.

    Columns whose label repeats an earlier label are dropped first (the first
    occurrence is kept). Then, for every unordered pair ``(f1, f2)`` taken from
    ``features`` in order, a column named ``"<f1>_x_<f2>"`` holding
    ``df[f1] * df[f2]`` is added. The caller's frame is left untouched.
    """
    frame = df.copy()
    first_occurrence = ~frame.columns.duplicated()
    frame = frame.loc[:, first_occurrence]
    for left, right in itertools.combinations(features, 2):
        frame[f"{left}_x_{right}"] = frame[left] * frame[right]
    return frame

def add_interaction_features(df, features):
    """Return a copy of ``df`` with arithmetic interactions for each feature pair.

    For every unordered pair ``(f1, f2)`` from ``features`` five columns are
    appended, in this order: ``f1_plus_f2``, ``f1_minus_f2``, ``f2_minus_f1``,
    ``f1_div_f2`` and ``f2_div_f1``. Each ratio adds ``1e-5`` to its
    denominator. The input frame is not modified.
    """
    enriched = df.copy()
    pairs = list(itertools.combinations(features, 2))
    for first, second in pairs:
        lhs = enriched[first]
        rhs = enriched[second]
        enriched[f"{first}_plus_{second}"] = lhs + rhs
        enriched[f"{first}_minus_{second}"] = lhs - rhs
        enriched[f"{second}_minus_{first}"] = rhs - lhs
        # The small offset on the denominator guards against division by zero.
        enriched[f"{first}_div_{second}"] = lhs / (rhs + 1e-5)
        enriched[f"{second}_div_{first}"] = rhs / (lhs + 1e-5)
    return enriched

def add_statistical_features(df, features):
    """Return a copy of ``df`` with row-wise summary statistics of ``features``.

    Adds ``row_mean``, ``row_std``, ``row_max``, ``row_min`` and ``row_median``
    (in that order), each computed across the ``features`` columns of the
    original frame with ``axis=1``. The input frame is not modified.
    """
    enriched = df.copy()
    selected = df[features]
    row_stats = {
        "row_mean": selected.mean(axis=1),
        "row_std": selected.std(axis=1),
        "row_max": selected.max(axis=1),
        "row_min": selected.min(axis=1),
        "row_median": selected.median(axis=1),
    }
    for column_name, values in row_stats.items():
        enriched[column_name] = values
    return enriched

# --- Hand-crafted features -------------------------------------------------
# Apply the same three helpers to train and test so both frames end up with
# the same engineered columns.
train = add_feature_cross_terms(train, numerical_features)
test = add_feature_cross_terms(test, numerical_features)

train = add_interaction_features(train, numerical_features)
test = add_interaction_features(test, numerical_features)

train = add_statistical_features(train, numerical_features)
test = add_statistical_features(test, numerical_features)

# --- Categorical encoding --------------------------------------------------
# Fit the encoder on train only and reuse the fitted mapping for test.
le = LabelEncoder()
train['Sex'] = le.fit_transform(train['Sex'])
test['Sex'] = le.transform(test['Sex'])

# Mark the encoded column as categorical for the downstream models.
train['Sex'] = train['Sex'].astype('category')
test['Sex'] = test['Sex'].astype('category')

# --- Polynomial interaction terms ------------------------------------------
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_train = poly.fit_transform(train[numerical_features])
poly_test = poly.transform(test[numerical_features])
poly_feature_names = poly.get_feature_names_out(numerical_features)

poly_train_df = pd.DataFrame(poly_train, columns=poly_feature_names)
poly_test_df = pd.DataFrame(poly_test, columns=poly_feature_names)

# PolynomialFeatures also returns the degree-1 terms under the original column
# names. Concatenating them as-is would leave train/test (and therefore X,
# X_test and FEATURES) with duplicate column labels, so only the columns whose
# label is not already present are appended.
# NOTE(review): the appended pairwise products carry the same values as the
# "<f1>_x_<f2>" columns created by add_feature_cross_terms.
new_poly_columns = [name for name in poly_feature_names if name not in train.columns]

train = pd.concat([train.reset_index(drop=True), poly_train_df[new_poly_columns]], axis=1)
test = pd.concat([test.reset_index(drop=True), poly_test_df[new_poly_columns]], axis=1)

# --- Model matrices --------------------------------------------------------
X = train.drop(columns=['id', 'Calories'])
# The target is modelled on the log1p scale.
y = np.log1p(train['Calories'])
X_test = test.drop(columns=['id'])


FEATURES = X.columns.tolist()


<span style="font-family:cursive; font-size:36px; padding:8px 16px; border-radius:10px; border: 2px solid #FF0000; font-weight:bold; text-shadow: 2px 2px 5px rgba(4, 2, 0, 0.5); color:#FF0000;"> 
    3 ✨ | 
    <span style="color:#FF0000; font-style: italic;">Model evaluation</span>
</span>


In [None]:
"""
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import time

FOLDS = 7
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
models = {
    'CatBoost': CatBoostRegressor(verbose=100, random_seed=42, cat_features=['Sex'], early_stopping_rounds=100),
    'XGBoost': XGBRegressor(max_depth=10, colsample_bytree=0.7, subsample=0.9, n_estimators=2000, learning_rate=0.02,
                            gamma=0.01, max_delta_step=2, early_stopping_rounds=100, eval_metric='rmse',
                            enable_categorical=True, random_state=42),
    'LightGBM': LGBMRegressor(n_estimators=2000, learning_rate=0.02, max_depth=10, colsample_bytree=0.7,
                              subsample=0.9, random_state=42, verbose=-1)
}

results = {name: {'oof': np.zeros(len(train)), 'pred': np.zeros(len(test)), 'rmsle': []} for name in models}

for name, model in models.items():
    print(f"\n=== Training {name} ===")
    for i, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
        print(f"\nFold {i+1}")
        x_train, y_train = X.iloc[train_idx], y[train_idx]
        x_valid, y_valid = X.iloc[valid_idx], y[valid_idx]
        
        x_train = x_train.loc[:, ~x_train.columns.duplicated()]
        x_valid = x_valid.loc[:, ~x_valid.columns.duplicated()]
        x_test = X_test.loc[:, ~X_test.columns.duplicated()].copy()

        start = time.time()
        
        if name == 'XGBoost':
            model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=100)
        elif name == 'CatBoost':
            model.fit(x_train, y_train, eval_set=(x_valid, y_valid))
        else:
            model.fit(x_train, y_train)

        oof_pred = model.predict(x_valid)
        test_pred = model.predict(x_test)
        
        results[name]['oof'][valid_idx] = oof_pred
        results[name]['pred'] += test_pred / FOLDS
        
        rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_valid), np.expm1(oof_pred)))
        results[name]['rmsle'].append(rmsle)
        
        print(f"Fold {i+1} RMSLE: {rmsle:.4f}")
        print(f"Training time: {time.time() - start:.1f} sec")


print("\n=== Model Comparison ===")
for name in models:
    mean_rmsle = np.mean(results[name]['rmsle'])
    std_rmsle = np.std(results[name]['rmsle'])
    print(f"{name} - Mean RMSLE: {mean_rmsle:.4f} Â± {std_rmsle:.4f}")

"""

<span style="font-family:cursive; font-size:36px; padding:8px 16px; border-radius:10px; border: 2px solid #FF0000; font-weight:bold; text-shadow: 2px 2px 5px rgba(4, 2, 0, 0.5); color:#FF0000;"> 
    4 ✨ | 
    <span style="color:#FF0000; font-style: italic;">Submission</span>
</span>


In [None]:
"""
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_log_error

oof_preds = {name: np.expm1(results[name]['oof']) for name in results}
test_preds = {name: np.expm1(results[name]['pred']) for name in results}
y_true = np.expm1(y)

def rmsle_loss(weights):
    blended = (
        weights[0] * oof_preds['CatBoost'] +
        weights[1] * oof_preds['XGBoost'] +
        weights[2] * oof_preds['LightGBM']
    )
    return np.sqrt(mean_squared_log_error(y_true, blended))

initial_weights = [1/3, 1/3, 1/3]
constraints = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
bounds = [(0, 1)] * 3

res = minimize(rmsle_loss, initial_weights, method='SLSQP', bounds=bounds, constraints=constraints)
best_weights = res.x

print(f"\nâœ… Optimized Weights:")
print(f"CatBoost = {best_weights[0]:.4f}")
print(f"XGBoost  = {best_weights[1]:.4f}")
print(f"LightGBM = {best_weights[2]:.4f}")

blended_preds = (
    best_weights[0] * test_preds['CatBoost'] +
    best_weights[1] * test_preds['XGBoost'] +
    best_weights[2] * test_preds['LightGBM']
)

blended_preds = np.clip(blended_preds, 1, 314)

submission['Calories'] = blended_preds
submission.to_csv('submission.csv', index=False)

print("\nSubmission Head:")
print(submission.head())

print(f"\nPredict Mean: {blended_preds.mean():.2f}")
print(f"Predict Median: {np.median(blended_preds):.2f}")
"""

<span style="font-family:cursive; font-size:36px; padding:8px 16px; border-radius:10px; border: 2px solid #FF0000; font-weight:bold; text-shadow: 2px 2px 5px rgba(4, 2, 0, 0.5); color:#FF0000;"> 
    5 ✨ | 
    <span style="color:#FF0000; font-style: italic;">Ensemble</span>
</span>


In [None]:
import pandas as pd
import numpy as np

# Load three previously generated submission files that will be blended.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/caloriecast-adaptive-ensemble-engine-for-s5e5/submission.csv",
        "/kaggle/input/ensemble-of-solutions/submission.csv",
        "/kaggle/input/ps-s5e5-log-blended-cat-xgboost-with-50-fold-cv/ensemble_submission.csv",
    )
)

# Despite its name, `ground_truth` is the sample-submission file; it is used
# only as the frame that receives the blended 'Calories' column.
ground_truth = pd.read_csv("/kaggle/input/playground-series-s5e5/sample_submission.csv")

# Fixed-weight blend: 0.4 / 0.3 / 0.3.
# NOTE(review): the Series are combined by DataFrame index, not by an id
# column — this assumes all four files list rows in the same order; confirm.
ground_truth['Calories'] = (
    df1['Calories'].mul(0.4)
    .add(df2['Calories'].mul(0.3))
    .add(df3['Calories'].mul(.3))
)
ground_truth.to_csv('submission.csv', index=False)

# If you found this notebook helpful or learned something interesting, please consider **upvoting** it 😊
