In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import optuna
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder


In [None]:
# Directory holding the competition CSV files on Kaggle.
DATA_DIR = '/kaggle/input/playground-series-s5e5'

# Both files are read from the same directory; the test path previously
# started with a doubled slash ('//kaggle/...'), which is now normalised.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Turn the 'Sex' column into integer codes. The encoder learns its classes
# from the training column only, and that same mapping is then applied to
# both the training and the test frame.
le = LabelEncoder().fit(train_df['Sex'])
for frame in (train_df, test_df):
    frame['Sex'] = le.transform(frame['Sex'])

In [None]:
# Preview only the first rows instead of rendering the whole frame as output.
train_df.head()

In [None]:
# Summary statistics of the training frame's columns (the 'Sex' column was
# already replaced by LabelEncoder codes in an earlier cell).
train_df.describe()

In [None]:
def downsample_values(df, column, values, n, seed=42):
    """Cap the number of rows carrying selected values of one column.

    Rows whose ``column`` value is in ``values`` are reduced to ``n`` randomly
    chosen rows per distinct value; every other row is kept. The recombined
    frame is shuffled before it is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to rebalance. It is not modified.
    column : str
        Column whose values are inspected.
    values : list
        Values of ``column`` to cap.
    n : int
        Number of rows to keep for each capped value.
    seed : int, default 42
        Seed used both for the per-value sampling and for the final shuffle.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with at most ``n`` rows per capped value.
    """
    is_target = df[column].isin(values)
    major, rest = df[is_target], df[~is_target]

    # groupby(...).sample(n=...) raises when a group holds fewer than n rows,
    # so only sufficiently large groups are sampled; smaller ones stay whole.
    counts = major[column].value_counts()
    big_enough = major[column].isin(counts[counts >= n].index)

    sampled = major[big_enough].groupby(column).sample(n=n, random_state=seed)
    kept_whole = major[~big_enough]

    # Skip the empty "kept whole" part in the usual case so the concatenation
    # is exactly [sampled, rest], in that order.
    parts = [sampled, rest] if kept_whole.empty else [sampled, kept_whole, rest]
    return pd.concat(parts).sample(frac=1, random_state=seed)


# (column, values to cap, rows kept per value).
# NOTE(review): these values are presumably the over-represented ones in the
# training data -- confirm against the value counts.
DOWNSAMPLE_RULES = [
    ('Age', [20, 21], 18500),
    ('Heart_Rate', [91.0], 28000),
    ('Body_Temp', [40.7], 60000),
]

# The rules are applied one after another, each on the output of the previous.
for _column, _values, _n_keep in DOWNSAMPLE_RULES:
    train_df = downsample_values(train_df, _column, _values, _n_keep)

In [None]:
# The test set is deliberately NOT downsampled.
# Rebalancing by row sampling is a training-time step only: removing rows from
# `test_df` would remove their ids from the submission that is later built
# from `test_df['id']`. In addition, `groupby(...).sample(n=...)` raises when a
# group has fewer than `n` rows.
# The frame is left untouched; its shape is shown as a sanity check.
test_df.shape

In [None]:
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import PowerTransformer

# Step 1: discard training rows whose body temperature is below 37.7.
too_cold = train_df['Body_Temp'] < 37.7
train_df = train_df.loc[~too_cold].copy()

# Step 2: winsorize the column with a 0.005 limit on each tail.
body_temp_capped = winsorize(train_df['Body_Temp'], limits=[0.005, 0.005])
train_df['Body_Temp'] = body_temp_capped

# Step 3: fit a Yeo-Johnson power transform on the capped column and replace
# the column with its transformed values.
pt = PowerTransformer(method='yeo-johnson')
train_df['Body_Temp'] = pt.fit_transform(train_df[['Body_Temp']])

In [None]:
# Test rows must never be dropped: each one contributes an id to the
# submission built from `test_df['id']`. Instead of removing rows with
# Body_Temp below 37.7 (the threshold used to filter the training rows),
# raise those values to the threshold so every row is kept.
test_df['Body_Temp'] = test_df['Body_Temp'].clip(lower=37.7)

In [None]:
# Apply the preprocessing learned on the training data instead of re-fitting
# it on the test set, so both frames share one feature scale.
# `pt` is the PowerTransformer fitted on the training column in the cell
# above; it is reused here rather than replaced by a test-fitted one.
test_df['Body_Temp'] = pt.transform(test_df[['Body_Temp']]).ravel()

# The training column was winsorized before `pt` was fitted, so its transformed
# minimum and maximum are the winsorization bounds on the transformed scale.
# Clipping the test values to that range applies the same caps.
test_df['Body_Temp'] = test_df['Body_Temp'].clip(
    lower=train_df['Body_Temp'].min(),
    upper=train_df['Body_Temp'].max(),
)

In [None]:
# Feature matrix: every training column except the target and the row id.
X = train_df.drop(columns=['Calories', 'id'])

# Target: log1p of 'Calories' (the model is trained on this log scale).
y = train_df['Calories'].pipe(np.log1p)

In [None]:
# Model input for the test set: all columns except the row id.
test_data = test_df.drop(columns='id')

# The ids are kept separately so they can be paired with the predictions.
test_ids = test_df['id']

In [None]:
# Hold out 20% of the rows as a validation set; the fixed random_state makes
# the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from xgboost import XGBRegressor

# Hyperparameters found with Optuna.
# `early_stopping_rounds` is set on the estimator: passing it to `fit()` was
# deprecated in XGBoost 1.6 and is rejected by XGBoost 2.x.
model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=2634,
    learning_rate=0.01740856597433328,
    max_depth=7,
    min_child_weight=2,
    gamma=0.006932421256967691,
    subsample=0.9177117774590345,
    colsample_bytree=0.7202054848943814,
    reg_alpha=4.1405151665282025,
    reg_lambda=3.8544938112332745,
    tree_method='hist',
    early_stopping_rounds=50,
)

# Training stops once the validation RMSE has not improved for 50 rounds.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

# Predictions are on the log1p scale because the target was log1p-transformed.
preds = model.predict(X_valid)

# Map targets and predictions back to the original scale.
y_valid_true = np.expm1(y_valid)
preds_true = np.expm1(preds)

# Take the square root of MSLE explicitly: the `squared=False` argument of
# `mean_squared_log_error` is deprecated and removed in recent scikit-learn.
rmsle = np.sqrt(mean_squared_log_error(y_valid_true, preds_true))

print(f"📉 RMSLE: {rmsle}")

In [None]:
# Predict on the test features; the model outputs values on the log1p scale.
log_preds = model.predict(test_data)

# Undo the log1p transform that was applied to the target.
test_preds = np.expm1(log_preds)

# Pair each id with its prediction and write the file without the index.
submission = test_ids.to_frame(name='id').assign(Calories=test_preds)
submission.to_csv('submission.csv', index=False)