In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_log_error
import xgboost as xgb

In [3]:
# Folder that holds the competition CSV files. Set the PLAYGROUND_S5E5_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original local folder is used so existing behavior is unchanged.
DATA_DIR = os.environ.get(
    'PLAYGROUND_S5E5_DIR',
    'C:/Users/User/OneDrive/Рабочий стол/playground-series-s5e5',
)

# Labelled rows (features plus the 'Calories' target) and unlabelled test rows.
train_data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_data = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [4]:
# Separate the regression target ('Calories') from the remaining feature columns.
y = train_data['Calories']
x = train_data.drop(columns='Calories')

In [5]:
# Hold out 30% of the labelled rows for validation; the fixed random_state
# makes the split reproducible across runs.
train_x, val_x, train_y, val_y = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=44,
)

In [6]:
# Row counts of the raw labelled and unlabelled datasets.
print('Length of training data: ', train_data.shape[0])
print('Length of testing data: ', test_data.shape[0])

Length of training data:  750000
Length of testing data:  250000


In [7]:
# Row counts of the training and validation partitions produced by the split.
print('Length of train data: ', train_x.shape[0])
print('Length of val data: ', val_x.shape[0])

Length of train data:  525000
Length of val data:  225000


In [8]:
# Keep a reference to the feature column index as it is at this point,
# i.e. before 'Sex' is one-hot encoded and 'id' is dropped further down.
columns = train_x.columns

In [9]:
# Display the captured column index as the cell output.
columns

Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp'],
      dtype='object')

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric training features.
train_x.describe()

Unnamed: 0,id,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
count,525000.0,525000.0,525000.0,525000.0,525000.0,525000.0,525000.0
mean,374903.063783,41.406933,174.704676,75.15277,15.418053,95.484836,40.03586
std,216344.894733,15.181876,12.835215,13.98985,8.357289,9.449886,0.780166
min,0.0,20.0,126.0,36.0,1.0,67.0,37.1
25%,187718.75,28.0,164.0,63.0,8.0,88.0,39.6
50%,374881.5,40.0,174.0,74.0,15.0,95.0,40.3
75%,562166.75,52.0,185.0,87.0,23.0,103.0,40.7
max,749999.0,79.0,222.0,132.0,30.0,128.0,41.5


In [11]:
# One-hot encode the categorical 'Sex' column.
# Each frame is encoded on its own, so the resulting dummy columns depend on
# the categories present in that frame. The training split defines the feature
# layout; the validation and test frames are aligned to it so all three end up
# with the same columns in the same order. A dummy column missing from a frame
# is filled with 0, and a column the training frame does not have is dropped.
train_x = pd.get_dummies(train_x, columns=['Sex'])
val_x = pd.get_dummies(val_x, columns=['Sex']).reindex(
    columns=train_x.columns, fill_value=0
)
test_data = pd.get_dummies(test_data, columns=['Sex']).reindex(
    columns=train_x.columns, fill_value=0
)

In [12]:
# 'id' is only a running sequence number and adds nothing to the prediction,
# so it is removed from every feature frame.
train_x = train_x.drop(columns=['id'])
val_x = val_x.drop(columns=['id'])
test_data = test_data.drop(columns=['id'])

In [13]:
# XGBoost regressor limited to 50 boosting rounds. It is trained with the
# squared-log-error objective and reports RMSLE on the evaluation set given
# to fit().
model = xgb.XGBRegressor(
    n_estimators=50,
    objective="reg:squaredlogerror",
    eval_metric="rmsle",
)

In [14]:
# Fit on the training partition; the validation partition is passed as the
# evaluation set, so its RMSLE is logged after every boosting round.
model.fit(
    X=train_x,
    y=train_y,
    eval_set=[(val_x, val_y)],
)

[0]	validation_0-rmsle:3.47574
[1]	validation_0-rmsle:3.27505
[2]	validation_0-rmsle:3.07750
[3]	validation_0-rmsle:2.88336
[4]	validation_0-rmsle:2.69288
[5]	validation_0-rmsle:2.50635
[6]	validation_0-rmsle:2.32408
[7]	validation_0-rmsle:2.14641
[8]	validation_0-rmsle:1.97372
[9]	validation_0-rmsle:1.80634
[10]	validation_0-rmsle:1.64469
[11]	validation_0-rmsle:1.48923
[12]	validation_0-rmsle:1.34039
[13]	validation_0-rmsle:1.19862
[14]	validation_0-rmsle:1.06437
[15]	validation_0-rmsle:0.93820
[16]	validation_0-rmsle:0.82063
[17]	validation_0-rmsle:0.71205
[18]	validation_0-rmsle:0.61320
[19]	validation_0-rmsle:0.52415
[20]	validation_0-rmsle:0.44489
[21]	validation_0-rmsle:0.37571
[22]	validation_0-rmsle:0.31648
[23]	validation_0-rmsle:0.26698
[24]	validation_0-rmsle:0.22619
[25]	validation_0-rmsle:0.19287
[26]	validation_0-rmsle:0.16676
[27]	validation_0-rmsle:0.14607
[28]	validation_0-rmsle:0.13032
[29]	validation_0-rmsle:0.11841
[30]	validation_0-rmsle:0.10954
[31]	validation_0-

In [15]:
# Print the number of boosting rounds the estimator was configured with.
print(model.n_estimators)

50


In [17]:
# Predict the target for the held-out validation rows.
# NOTE(review): val_y_pred is not referenced again in the cells visible here.
val_y_pred = model.predict(val_x)

In [18]:
# Predict the target for the preprocessed test rows; these values go into the submission file.
y_preds = model.predict(test_data)

In [19]:
# Sanity check for y_preds: container type, number of predictions, and the
# value and element type of the first prediction, followed by the second
# value. One item is printed per line.
print(
    type(y_preds),
    len(y_preds),
    y_preds[0],
    type(y_preds[0]),
    y_preds[1],
    sep='\n',
)

<class 'numpy.ndarray'>
250000
25.869947
<class 'numpy.float32'>
105.93119


In [20]:
# Save submission
# The sample submission file provides the expected rows and columns; its
# 'Calories' column is overwritten with the model's test predictions and the
# result is written to "submission.csv" in the current working directory.
# The input folder honors the PLAYGROUND_S5E5_DIR environment variable and
# falls back to the original local folder when the variable is not set.
submission_dir = os.environ.get(
    'PLAYGROUND_S5E5_DIR',
    'C:/Users/User/OneDrive/Рабочий стол/playground-series-s5e5',
)
submission = pd.read_csv(os.path.join(submission_dir, 'sample_submission.csv'))
submission["Calories"] = y_preds
submission.to_csv("submission.csv", index=False)
print('Sumission done!')
submission.head()

Sumission done!


Unnamed: 0,id,Calories
0,750000,25.869947
1,750001,105.93119
2,750002,86.916298
3,750003,128.987198
4,750004,73.132515


In [21]:
# Print the current working directory, which is where files written with a
# relative path end up. `os` comes from the notebook's import cell at the top,
# so it is not imported again here.
print(os.getcwd())

C:\Users\User


In [None]:
# Export every boosted tree of the fitted model through Graphviz.
# - `xgb.to_graphviz` uses the xgboost alias from the import cell, so no
#   extra import is needed this far down the notebook.
# - The loop bound is read from the fitted booster rather than from the
#   configured `n_estimators`, so it matches the rounds that were really
#   trained (and still works if `n_estimators` is left unset).
# For each tree, `render` saves the Graphviz source under the given file name
# and writes the rendered output next to it.
n_rounds = model.get_booster().num_boosted_rounds()

for i in range(n_rounds):
    dot = xgb.to_graphviz(model, num_trees=i)
    dot.render(f'xgb_tree_{i}.dot')