In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Normalization
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd

# Set seeds for reproducibility
tf.random.set_seed(10)
np.random.seed(10)

In [3]:
# Load data
train_df = pd.read_csv('playground-series-s5e5/train.csv')

# Encode 'Sex' column: male=1, female=0
train_df['Sex'] = train_df['Sex'].map({'male': 1, 'female': 0})

# Features and target
X = train_df.drop(columns=['id', 'Calories']).values
y = train_df['Calories'].values

# Split into train and validation
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Create normalization layer and adapt on training features
normalizer = Normalization()
normalizer.adapt(X_train)

In [4]:
input_layer = Input(shape=(X_train.shape[1],))
x = normalizer(input_layer)

# Branch 1
x1 = Dense(128, activation='relu')(x)
x1 = Dense(64, activation='sigmoid')(x1)
x1 = Dense(32, activation='tanh')(x1)

# Branch 2
x2 = Dense(16, activation='sigmoid')(x)
x2 = Dense(8, activation='tanh')(x2)

combined = concatenate([x1, x2])
output = Dense(1, activation='relu')(combined)

model = Model(inputs=input_layer, outputs=output)

In [5]:
model.summary()

In [6]:
def rmsle(y_true, y_pred):
    y_true = tf.clip_by_value(y_true, 0, np.inf)
    y_pred = tf.clip_by_value(y_pred, 0, np.inf)
    first_log = tf.math.log1p(y_pred)
    second_log = tf.math.log1p(y_true)
    return tf.sqrt(tf.reduce_mean(tf.square(first_log - second_log)))

model.compile(optimizer=Adam(learning_rate=0.0001), loss=rmsle, metrics=[rmsle])


In [7]:
class RMSLELogger(Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        X_val, y_val = self.validation_data
        val_preds = self.model.predict(X_val, verbose=0)
        val_preds = np.clip(val_preds, 0, None)
        y_val_clip = np.clip(y_val, 0, None)
        val_rmsle = np.sqrt(np.mean(np.square(np.log1p(val_preds) - np.log1p(y_val_clip))))
        print(f"Epoch {epoch + 1}: val_RMSLE = {val_rmsle:.5f}")

In [None]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1, min_lr=1e-6),
    RMSLELogger(validation_data=(X_val, y_val))
]

history = model.fit(
    X_train[:50000], y_train[:50000],
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=64,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/200
[1m737/782[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 546us/step - loss: 3.1299 - rmsle: 3.1306

In [None]:
plt.figure(figsize=(8,5))
plt.plot(history.history['rmsle'], label='Train RMSLE')
plt.plot(history.history['val_rmsle'], label='Validation RMSLE')
plt.xlabel('Epoch')
plt.ylabel('RMSLE')
plt.title('Train vs Validation RMSLE')
plt.legend()
plt.grid(True)
plt.show()

In [6]:
# Map 'Sex' to numerical values
sex_map = {'male': 0, 'female': 1}
train_df['Sex'] = train_df['Sex'].map(sex_map)
test_df['Sex'] = test_df['Sex'].map(sex_map)

# Calculate height in meters
train_df['Height_m'] = train_df['Height'] / 100
test_df['Height_m'] = test_df['Height'] / 100

# Calculate BMI
train_df['BMI'] = train_df['Weight'] / (train_df['Height_m'] ** 2)
test_df['BMI'] = test_df['Weight'] / (test_df['Height_m'] ** 2)

# Add Calories per Minute (for analysis only, not used in features)
train_df['Calories_per_min'] = train_df['Calories'] / train_df['Duration']

# Define features
features = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI']
X = train_df[features].values
y = np.log1p(train_df['Calories'].values)  # log1p for RMSLE handling
X_test = test_df[features].values

In [7]:
def rmsle(y_true, y_pred):
    y_true = tf.clip_by_value(y_true, 0, np.inf)
    y_pred = tf.clip_by_value(y_pred, 0, np.inf)
    first_log = tf.math.log1p(y_pred)
    second_log = tf.math.log1p(y_true)
    return tf.sqrt(tf.reduce_mean(tf.square(first_log - second_log)))

In [8]:
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

NameError: name 'X_train' is not defined

In [17]:
tf.random.set_seed(10)
np.random.seed(10)

# --- Input Layer ---
input_layer = Input(shape=(X_train.shape[1],))
x = normalizer(input_layer)

# --- Branch: model ---
x1 = Dense(128, activation='relu')(x)
x1 = Dense(64, activation='sigmoid')(x1)
x1 = Dense(32, activation='tanh')(x1)

# --- Branch: model1 ---
x2 = Dense(16, activation='sigmoid')(x)
x2 = Dense(8, activation='tanh')(x2)

# --- Concatenate ---
combined = concatenate([x1, x2])

output = Dense(1, activation='relu')(combined)

# --- Build Model ---
ensemble_model = Model(inputs=input_layer, outputs=output)
ensemble_model.compile(optimizer=Adam(learning_rate=0.0001), loss=rmsle, metrics=[rmsle])

# --- Callbacks ---
class RMSLELogger(Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data  # (X_val, y_val)

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        X_val, y_val = self.validation_data
        val_preds = self.model.predict(X_val, verbose=0)
        val_preds = np.clip(val_preds, 0, None)
        y_val = np.clip(y_val, 0, None)
        rmsle_val = np.sqrt(np.mean(np.square(np.log1p(val_preds) - np.log1p(y_val))))
        print(f"Epoch {epoch+1}: val_RMSLE = {rmsle_val:.5f}")

In [18]:
ensemble_model.summary()

In [19]:
# Split into training and validation sets
X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
rmsle_logger = RMSLELogger(validation_data=(X_val, y_val))

In [20]:
# --- Train Model ---
history = ensemble_model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=200,
    batch_size=256,
    callbacks=callbacks,
    verbose=1
)

# --- Plot Loss ---
plt.figure(figsize=(8, 5))
plt.plot(history.history['rmsle'], label='Train RMSLE')
plt.plot(history.history['val_rmsle'], label='Validation RMSLE')
plt.xlabel('Epoch')
plt.ylabel('RMSLE')
plt.title('Train vs Validation RMSLE')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

Epoch 1/200
[1m2327/2374[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 822us/step - loss: 1.6305 - rmsle: 1.6305

AttributeError: 'RMSLELogger' object has no attribute 'validation_data'

In [63]:
# --- RMSLE Evaluation ---
split_idx = int(0.9 * len(X_train))
train_preds_log = ensemble_model.predict(X_train[:split_idx], verbose=0)
train_true = y_train[:split_idx]

train_preds = np.expm1(train_preds_log)
train_true_exp = np.expm1(train_true)

rmsle_train = np.sqrt(np.mean((np.log1p(train_preds.flatten()) - np.log1p(train_true_exp))**2))
print(f"\nTrain RMSLE: {rmsle_train:.5f}")

val_preds_log = ensemble_model.predict(X_train[split_idx:], verbose=0)
val_true = y_train[split_idx:]

val_preds = np.expm1(val_preds_log)
val_true_exp = np.expm1(val_true)

rmsle_val = np.sqrt(np.mean((np.log1p(val_preds.flatten()) - np.log1p(val_true_exp))**2))
print(f"Validation RMSLE: {rmsle_val:.5f}")


Train RMSLE: 5.03627
Validation RMSLE: 5.03400


In [163]:
import xgboost as xgb
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standard scaling (XGBoost can work without scaling, but consistent normalization is good for comparison)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

xgb.XGBRegressor(
    learning_rate=0.02,
    n_estimators=800,
    max_depth=5,
    min_child_weight=5,
    gamma=1.0,
    subsample=0.7,
    colsample_bytree=0.6,
    reg_alpha=1.0,
    reg_lambda=2.0,
    random_state=42
)

# Train
xgb_model.fit(
    X_train_scaled, y_train,
    eval_set=[(X_val_scaled, y_val)],
    verbose=True
)

# Predict (log scale)
train_preds_log = xgb_model.predict(X_train_scaled)
val_preds_log = xgb_model.predict(X_val_scaled)

# Convert back from log1p
train_preds = np.expm1(train_preds_log)
train_true = np.expm1(y_train)

val_preds = np.expm1(val_preds_log)
val_true = np.expm1(y_val)

# Calculate RMSLE
rmsle_train = np.sqrt(mean_squared_log_error(train_true, train_preds))
rmsle_val = np.sqrt(mean_squared_log_error(val_true, val_preds))

print(f"\nXGBoost Train RMSLE: {rmsle_train:.5f}")
print(f"XGBoost Validation RMSLE: {rmsle_val:.5f}")


[0]	validation_0-rmse:0.91860
[1]	validation_0-rmse:0.87392
[2]	validation_0-rmse:0.83128
[3]	validation_0-rmse:0.79094
[4]	validation_0-rmse:0.75309
[5]	validation_0-rmse:0.71654
[6]	validation_0-rmse:0.68207
[7]	validation_0-rmse:0.64909
[8]	validation_0-rmse:0.61878
[9]	validation_0-rmse:0.58900
[10]	validation_0-rmse:0.56095
[11]	validation_0-rmse:0.53410
[12]	validation_0-rmse:0.50884
[13]	validation_0-rmse:0.48477
[14]	validation_0-rmse:0.46198
[15]	validation_0-rmse:0.44038
[16]	validation_0-rmse:0.42204
[17]	validation_0-rmse:0.40230
[18]	validation_0-rmse:0.38358
[19]	validation_0-rmse:0.36572
[20]	validation_0-rmse:0.34878
[21]	validation_0-rmse:0.33270
[22]	validation_0-rmse:0.31789
[23]	validation_0-rmse:0.30362
[24]	validation_0-rmse:0.29012
[25]	validation_0-rmse:0.27758
[26]	validation_0-rmse:0.26532
[27]	validation_0-rmse:0.25363
[28]	validation_0-rmse:0.24283
[29]	validation_0-rmse:0.23252
[30]	validation_0-rmse:0.22260
[31]	validation_0-rmse:0.21322
[32]	validation_0-

In [165]:
# --- Predict with ensemble model ---
split_idx = int(0.9 * len(X_train))

# Ensemble predictions (log scale)
ensemble_train_preds_log = ensemble_model.predict(X_train[:split_idx], verbose=0).flatten()
ensemble_val_preds_log = ensemble_model.predict(X_train[split_idx:], verbose=0).flatten()

# XGBoost predictions (log scale)
xgb_train_preds_log = xgb_model.predict(X_train_scaled[:split_idx])
xgb_val_preds_log = xgb_model.predict(X_train_scaled[split_idx:])

# --- Average predictions in log scale ---
avg_train_preds_log = (ensemble_train_preds_log + xgb_train_preds_log) / 2
avg_val_preds_log = (ensemble_val_preds_log + xgb_val_preds_log) / 2

# --- Convert back from log1p ---
avg_train_preds = np.expm1(avg_train_preds_log)
avg_val_preds = np.expm1(avg_val_preds_log)

# Ground truths (expm1)
train_true_exp = np.expm1(y_train[:split_idx])
val_true_exp = np.expm1(y_train[split_idx:])

# --- Calculate RMSLE for averaged predictions ---
rmsle_train_avg = np.sqrt(np.mean((np.log1p(avg_train_preds) - np.log1p(train_true_exp)) ** 2))
rmsle_val_avg = np.sqrt(np.mean((np.log1p(avg_val_preds) - np.log1p(val_true_exp)) ** 2))

print(f"\nAveraged Prediction Train RMSLE: {rmsle_train_avg:.5f}")
print(f"Averaged Prediction Validation RMSLE: {rmsle_val_avg:.5f}")



Averaged Prediction Train RMSLE: 0.05844
Averaged Prediction Validation RMSLE: 0.05752
