In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error
from xgboost import XGBRegressor

# Now we load data
Data is in the form of  

| id  | sex    | age | height | weight | duration | heart rate | body temperature | calories |  
| --- | ---    | --- | ---    | ---    | ---      | ---        | ---              | ---      |
| int | string | int | float  | float  | float    | float      | float            | float    |   

* Calories only exists in training data

In [2]:
# training data
train_data = pd.read_csv('train.csv')

train_data

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
...,...,...,...,...,...,...,...,...,...
749995,749995,male,28,193.0,97.0,30.0,114.0,40.9,230.0
749996,749996,female,64,165.0,63.0,18.0,92.0,40.5,96.0
749997,749997,male,60,162.0,67.0,29.0,113.0,40.9,221.0
749998,749998,male,45,182.0,91.0,17.0,102.0,40.3,109.0


In [3]:
y = train_data.Calories

y.head()

0    150.0
1     34.0
2     29.0
3    140.0
4    146.0
Name: Calories, dtype: float64

In [4]:
X = train_data.copy().drop(['Calories', 'id'], axis=1)

X

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,male,36,189.0,82.0,26.0,101.0,41.0
1,female,64,163.0,60.0,8.0,85.0,39.7
2,female,51,161.0,64.0,7.0,84.0,39.8
3,male,20,192.0,90.0,25.0,105.0,40.7
4,female,38,166.0,61.0,25.0,102.0,40.6
...,...,...,...,...,...,...,...
749995,male,28,193.0,97.0,30.0,114.0,40.9
749996,female,64,165.0,63.0,18.0,92.0,40.5
749997,male,60,162.0,67.0,29.0,113.0,40.9
749998,male,45,182.0,91.0,17.0,102.0,40.3


In [5]:
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
#OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# One-hot encode 'Sex'
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_encoder.fit(X[['Sex']])

# OH_train = pd.DataFrame(
#     OH_encoder.fit_transform(X[['Sex']]),
#     columns=OH_encoder.get_feature_names_out(['Sex']),
#     index=X.index
# )

# Drop 'Sex' and combine numerical + categorical
# X_train_final = pd.concat([X.drop('Sex', axis=1), OH_train], axis=1)

# X_train_final

In [7]:
def preprocess(df):
    df = df.copy()
    df['Height_m'] = df['Height'] / 100
    df['BMI'] = df['Weight'] / (df['Height_m'] ** 2)
    df['Weight_Height_Ratio'] = df['Weight'] / df['Height']
    df['HeartRate_per_min'] = df['Heart_Rate'] / df['Duration']
    df['Temp_Deviation'] = abs(df['Body_Temp'] - 37)
    df['Intensity_Score'] = df['Heart_Rate'] * df['Duration'] * df['BMI']
    df = df.drop(columns=['Height_m'], errors='ignore')

    # One-hot encode 'Sex'
    OH_cols = pd.DataFrame(
        OH_encoder.transform(df[['Sex']]),
        columns=OH_encoder.get_feature_names_out(['Sex']),
        index=df.index
    )
    df = df.drop('Sex', axis=1)
    df_final = pd.concat([df, OH_cols], axis=1)
    return df_final.astype(float)

In [35]:
X_fe = preprocess(X)

X_tr, X_val, y_tr, y_val = train_test_split(X_fe, y, test_size=0.2,
                                            random_state=42)

In [37]:
#xgb_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=5, random_state=42)
xgb_model = XGBRegressor(
    n_estimators=5000,
    learning_rate=0.05,
    n_jobs=5,
    random_state=42,
    verbosity=1,
    subsample=0.9,
    colsample_bytree=0.8,
    max_depth=6
)
xgb_model.set_params(eval_metric='rmse')
xgb_model.set_params(early_stopping_rounds=200)
xgb_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)])
#, eval_set=[(X_valid_final, y_valid)], verbose=False)

[0]	validation_0-rmse:59.21369
[1]	validation_0-rmse:56.32787
[2]	validation_0-rmse:53.58839
[3]	validation_0-rmse:50.98571
[4]	validation_0-rmse:48.51433
[5]	validation_0-rmse:46.16788
[6]	validation_0-rmse:43.96333
[7]	validation_0-rmse:41.85083
[8]	validation_0-rmse:39.83931
[9]	validation_0-rmse:37.93189
[10]	validation_0-rmse:36.14940
[11]	validation_0-rmse:34.46124
[12]	validation_0-rmse:32.85331
[13]	validation_0-rmse:31.30158
[14]	validation_0-rmse:29.82665
[15]	validation_0-rmse:28.42774
[16]	validation_0-rmse:27.10374
[17]	validation_0-rmse:25.84730
[18]	validation_0-rmse:24.64842
[19]	validation_0-rmse:23.53760
[20]	validation_0-rmse:22.45770
[21]	validation_0-rmse:21.45987
[22]	validation_0-rmse:20.51543
[23]	validation_0-rmse:19.60432
[24]	validation_0-rmse:18.73482
[25]	validation_0-rmse:17.90290
[26]	validation_0-rmse:17.11685
[27]	validation_0-rmse:16.37988
[28]	validation_0-rmse:15.66850
[29]	validation_0-rmse:14.99885
[30]	validation_0-rmse:14.38430
[31]	validation_0-

In [38]:
# xgb_preds = xgb_model.predict(X_valid_final)
# xgb_mae = root_mean_squared_log_error(y_valid, xgb_preds)
# xgb_mae

In [39]:
test_data = pd.read_csv('test.csv')

test_data

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5
...,...,...,...,...,...,...,...,...
249995,999995,female,56,159.0,62.0,6.0,85.0,39.4
249996,999996,male,32,202.0,101.0,3.0,84.0,38.4
249997,999997,female,31,164.0,64.0,14.0,98.0,40.1
249998,999998,female,62,158.0,61.0,25.0,106.0,40.7


In [40]:
# OH_test = pd.DataFrame(
#     OH_encoder.transform(test_data[['Sex']]),
#     columns=OH_encoder.get_feature_names_out(['Sex']),
#     index=test_data.index
# )

# X_test_final = pd.concat([test_data.drop(['Sex', 'id'], axis=1), OH_test], axis=1)
# X_test_final
X_test_final = preprocess(test_data)
X_test_final = X_test_final.drop('id', axis=1)

In [41]:
predictions = xgb_model.predict(X_test_final)

In [42]:
predictions = np.abs(predictions)

In [43]:
final_data = pd.DataFrame({ 'id': test_data.id, 'Calories': predictions })

final_data.to_csv('submissions9.csv', index=False)