In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_log_error
from xgboost import XGBRegressor

# Load the data
The data has the following columns:

| id  | Sex    | Age | Height | Weight | Duration | Heart_Rate | Body_Temp | Calories |
| --- | ---    | --- | ---    | ---    | ---      | ---        | ---       | ---      |
| int | string | int | float  | float  | float    | float      | float     | float    |

* `Calories` exists only in the training data.

In [2]:
# Read the training set (the file that includes the Calories column) and
# preview it.
train_data = pd.read_csv(filepath_or_buffer='train.csv')

train_data

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0
...,...,...,...,...,...,...,...,...,...
749995,749995,male,28,193.0,97.0,30.0,114.0,40.9,230.0
749996,749996,female,64,165.0,63.0,18.0,92.0,40.5,96.0
749997,749997,male,60,162.0,67.0,29.0,113.0,40.9,221.0
749998,749998,male,45,182.0,91.0,17.0,102.0,40.3,109.0


In [3]:
# Regression target: the Calories column of the training frame.
y = train_data['Calories']

y.head()

0    150.0
1     34.0
2     29.0
3    140.0
4    146.0
Name: Calories, dtype: float64

In [4]:
# Feature matrix: every training column except the target ('Calories') and
# the 'id' column. DataFrame.drop already returns a new frame and leaves
# train_data untouched, so no explicit .copy() is needed beforehand.
X = train_data.drop(columns=['Calories', 'id'])

X

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,male,36,189.0,82.0,26.0,101.0,41.0
1,female,64,163.0,60.0,8.0,85.0,39.7
2,female,51,161.0,64.0,7.0,84.0,39.8
3,male,20,192.0,90.0,25.0,105.0,40.7
4,female,38,166.0,61.0,25.0,102.0,40.6
...,...,...,...,...,...,...,...
749995,male,28,193.0,97.0,30.0,114.0,40.9
749996,female,64,165.0,63.0,18.0,92.0,40.5
749997,male,60,162.0,67.0,29.0,113.0,40.9
749998,male,45,182.0,91.0,17.0,102.0,40.3


In [5]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# One-hot encode 'Sex'. The encoder is fitted on the training split only and
# then reused, unchanged, to transform the validation split.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
train_encoded = OH_encoder.fit_transform(X_train[['Sex']])
valid_encoded = OH_encoder.transform(X_valid[['Sex']])

# Column labels of the encoded output (looked up once, used for both frames).
OH_columns = OH_encoder.get_feature_names_out(['Sex'])

# Wrap the encoded arrays in DataFrames that keep each split's original row
# index, so they line up with the remaining feature columns when concatenated.
OH_train = pd.DataFrame(train_encoded, columns=OH_columns, index=X_train.index)
OH_valid = pd.DataFrame(valid_encoded, columns=OH_columns, index=X_valid.index)

# Replace the raw 'Sex' column with its one-hot columns.
X_train_final = pd.concat(
    [X_train.drop(columns='Sex'), OH_train],
    axis='columns',
)
X_valid_final = pd.concat(
    [X_valid.drop(columns='Sex'), OH_valid],
    axis='columns',
)

X_train_final

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Sex_female,Sex_male
453635,43,190.0,89.0,6.0,87.0,39.1,0.0,1.0
11651,48,155.0,54.0,12.0,97.0,40.2,1.0,0.0
431999,51,187.0,92.0,15.0,96.0,40.5,0.0,1.0
529211,45,182.0,88.0,2.0,83.0,38.3,0.0,1.0
110925,22,202.0,99.0,25.0,98.0,40.7,0.0,1.0
...,...,...,...,...,...,...,...,...
259178,58,179.0,78.0,9.0,93.0,39.7,0.0,1.0
365838,46,171.0,70.0,16.0,85.0,39.8,1.0,0.0
131932,37,191.0,91.0,28.0,108.0,40.7,0.0,1.0
671155,42,193.0,92.0,17.0,93.0,40.0,0.0,1.0


In [7]:
# Baseline model: a 100-tree random forest with a fixed seed.
# n_jobs=-1 lets scikit-learn build the trees in parallel on all available
# cores instead of one at a time; the seed is unchanged, so this only affects
# how long fitting takes.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train_final, y_train)

In [10]:
# Score the random forest on the validation split with RMSLE.
rf_preds = rf_model.predict(X_valid_final)
rf_rmsl = root_mean_squared_log_error(y_true=y_valid, y_pred=rf_preds)
rf_rmsl

0.06334011749879508

In [22]:
# Gradient-boosted model: 1000 boosting rounds with a small learning rate.
# fit() is called without an eval_set, so no validation data is seen during
# training.
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.025,
    n_jobs=10,
    random_state=42,
)
xgb_model.fit(X_train_final, y_train)

In [23]:
# Score the XGBoost model on the validation split with RMSLE.
# The metric is root_mean_squared_log_error, so the result is named xgb_rmsl
# (matching rf_rmsl) rather than "mae".
xgb_preds = xgb_model.predict(X_valid_final)
xgb_rmsl = root_mean_squared_log_error(y_valid, xgb_preds)
xgb_mae = xgb_rmsl  # legacy alias: this value is an RMSLE, not a mean absolute error
xgb_rmsl

0.06237722705505063

In [24]:
# Read the test set (no Calories column) and preview it.
test_data = pd.read_csv(filepath_or_buffer='test.csv')

test_data

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5
...,...,...,...,...,...,...,...,...
249995,999995,female,56,159.0,62.0,6.0,85.0,39.4
249996,999996,male,32,202.0,101.0,3.0,84.0,38.4
249997,999997,female,31,164.0,64.0,14.0,98.0,40.1
249998,999998,female,62,158.0,61.0,25.0,106.0,40.7


In [25]:
# Encode the test 'Sex' column with the encoder that was fitted on the
# training split (transform only, no refit).
test_encoded = OH_encoder.transform(test_data[['Sex']])
OH_test = pd.DataFrame(
    data=test_encoded,
    index=test_data.index,
    columns=OH_encoder.get_feature_names_out(['Sex']),
)

# Drop the raw 'Sex' column and the 'id' column, then append the one-hot
# columns so the layout matches the frames the models were trained on.
test_numeric = test_data.drop(columns=['Sex', 'id'])
X_test_final = pd.concat([test_numeric, OH_test], axis='columns')
X_test_final

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Sex_female,Sex_male
0,45,177.0,81.0,7.0,87.0,39.8,0.0,1.0
1,26,200.0,97.0,20.0,101.0,40.5,0.0,1.0
2,29,188.0,85.0,16.0,102.0,40.4,1.0,0.0
3,39,172.0,73.0,20.0,107.0,40.6,1.0,0.0
4,30,173.0,67.0,16.0,94.0,40.5,1.0,0.0
...,...,...,...,...,...,...,...,...
249995,56,159.0,62.0,6.0,85.0,39.4,1.0,0.0
249996,32,202.0,101.0,3.0,84.0,38.4,0.0,1.0
249997,31,164.0,64.0,14.0,98.0,40.1,1.0,0.0
249998,62,158.0,61.0,25.0,106.0,40.7,1.0,0.0


In [26]:
# Predict Calories for every test row with the fitted XGBoost model.
predictions = xgb_model.predict(
    X_test_final
)

In [27]:
# Floor the predictions at the smallest Calories value seen in training
# (and never below zero) so the submission contains no negative values.
# np.abs would instead mirror a negative prediction into an arbitrary
# positive one. float() keeps the bound a plain Python scalar so the
# predictions' dtype is not widened by the clip.
min_calories = max(float(y.min()), 0.0)
predictions = np.clip(predictions, min_calories, None)

In [28]:
# Build the submission frame (test id + predicted Calories) and write it to
# CSV without the DataFrame index.
final_data = pd.DataFrame({
    'id': test_data['id'],
    'Calories': predictions,
})

final_data.to_csv('submissions6.csv', index=False)