In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_log_error
import xgboost as xgb

# Read the training data, the test data and the sample submission from the
# current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Stack train and test into one frame so categorical columns are encoded with
# a single shared mapping. 'is_train' remembers which rows came from where;
# the test rows get a NaN placeholder for the target column.
combined = pd.concat(
    [
        train.assign(is_train=1),
        test.assign(is_train=0, Calories=np.nan),
    ],
    axis=0,
    ignore_index=True,
)

# Replace every object-typed column with integer labels. Values are cast to
# str first, so missing values become the literal category 'nan'.
object_columns = combined.select_dtypes(include=['object']).columns
for column_name in object_columns:
    as_text = combined[column_name].astype(str)
    combined[column_name] = LabelEncoder().fit_transform(as_text)

# Undo the stacking and discard the helper column (plus the placeholder
# target on the test side).
from_train = combined['is_train'].eq(1)
from_test = combined['is_train'].eq(0)
train = combined[from_train].drop(columns=['is_train'])
test = combined[from_test].drop(columns=['is_train', 'Calories'])

# Features / target for training, and ids / features for the test set.
y = train['Calories']
X = train.drop(columns=['id', 'Calories'])
test_ids = test['id']
X_test = test.drop(columns='id')

# Impute missing feature values with the per-column means of the TRAINING
# features. The same training means are applied to the test features so that
# both sets go through an identical transformation, consistent with the
# scaler below, which is also fitted on the training features only.
train_means = X.mean()
X.fillna(train_means, inplace=True)
X_test.fillna(train_means, inplace=True)

# Standardise the features: fit on the training features, then reuse the
# fitted scaler for the test features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Hold out 20% of the training rows for validation (fixed seed for
# reproducibility).
# NOTE(review): the imputation means and the scaler are computed before this
# split, so validation rows contribute to them; split first if a strictly
# leakage-free validation score is required.
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Hyper-parameters for the gradient-boosted regressor; the squared-log-error
# objective matches the RMSLE metric reported below.
xgb_params = {
    'objective': 'reg:squaredlogerror',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 6,
    'random_state': 42,
}
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Score the held-out rows. Predictions are clipped at zero because
# mean_squared_log_error does not accept negative values.
y_pred = np.clip(model.predict(X_val), 0, None)
msle = mean_squared_log_error(y_val, y_pred)
rmsle = np.sqrt(msle)
print(f'Validation RMSLE: {rmsle}')

# Predict the target for the test rows; negative predictions are clipped to
# zero, mirroring what is done for the validation predictions.
test_predictions = np.clip(model.predict(X_test_scaled), 0, None)

# Name the prediction column after the sample submission instead of
# hard-coding it: the previous literal 'Calories_Burned' did not match the
# target column name 'Calories' used throughout the rest of the script.
# NOTE(review): assumes the last column of sample_submission.csv is the
# prediction column — confirm against the file.
target_column = sample_submission.columns[-1]
submission = pd.DataFrame({'id': test_ids, target_column: test_predictions})
submission.to_csv('submission.csv', index=False)

# NOTE(review): IN_COLAB and files are not defined in this cell; presumably
# an earlier cell sets them up — verify before running this cell on its own.
if IN_COLAB:
    files.download('submission.csv')
else:
    print("✅ 'submission.csv' file created in your local working directory.")


Validation RMSLE: 0.0898304054563124


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>