In [25]:
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_log_error
import xgboost as xgb
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV


In [2]:
train_data = pd.read_csv('C:/Users/User/OneDrive/Рабочий стол/playground-series-s5e5/train.csv')
test_data = pd.read_csv('C:/Users/User/OneDrive/Рабочий стол/playground-series-s5e5/test.csv')

In [3]:
x = train_data.drop('Calories', axis=1)
y = train_data['Calories']

In [4]:
# Splitting the train data into train and validation
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.3, random_state=44)

In [5]:
print('Length of training data: ',len(train_data))
print('Length of testing data: ',len(test_data))

Length of training data:  750000
Length of testing data:  250000


In [6]:
print('Length of train data: ',len(train_x))
print('Length of val data: ', len(val_x))

Length of train data:  525000
Length of val data:  225000


In [7]:
columns = train_x.columns

In [8]:
columns

Index(['id', 'Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate',
       'Body_Temp'],
      dtype='object')

In [9]:
train_x.describe()

Unnamed: 0,id,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
count,525000.0,525000.0,525000.0,525000.0,525000.0,525000.0,525000.0
mean,374903.063783,41.406933,174.704676,75.15277,15.418053,95.484836,40.03586
std,216344.894733,15.181876,12.835215,13.98985,8.357289,9.449886,0.780166
min,0.0,20.0,126.0,36.0,1.0,67.0,37.1
25%,187718.75,28.0,164.0,63.0,8.0,88.0,39.6
50%,374881.5,40.0,174.0,74.0,15.0,95.0,40.3
75%,562166.75,52.0,185.0,87.0,23.0,103.0,40.7
max,749999.0,79.0,222.0,132.0,30.0,128.0,41.5


In [10]:
# converting the sex column to one hot-encoding
train_x = pd.get_dummies(train_x, columns=['Sex'])
val_x = pd.get_dummies(val_x, columns=['Sex'])
test_data = pd.get_dummies(test_data, columns=['Sex'])

In [11]:
# dropping id as it is just sequence and not contributing anything to result
train_x = train_x.drop('id', axis=1)
val_x = val_x.drop('id', axis=1)
test_data = test_data.drop('id', axis=1)

In [19]:
# 6. Явное добавление новых признаков
# BMI
train_x['BMI'] = train_x['Weight'] / (train_x['Height'] / 100) ** 2
val_x['BMI'] = val_x['Weight'] / (val_x['Height'] / 100) ** 2
test_data['BMI'] = test_data['Weight'] / (test_data['Height'] / 100) ** 2

# PulseLoad = Heart_Rate * Duration
train_x['PulseLoad'] = train_x['Heart_Rate'] * train_x['Duration']
val_x['PulseLoad'] = val_x['Heart_Rate'] * val_x['Duration']
test_data['PulseLoad'] = test_data['Heart_Rate'] * test_data['Duration']

# Temp_per_min = Body_Temp / Duration
train_x['Temp_per_min'] = train_x['Body_Temp'] / train_x['Duration']
val_x['Temp_per_min'] = val_x['Body_Temp'] / val_x['Duration']
test_data['Temp_per_min'] = test_data['Body_Temp'] / test_data['Duration']

In [28]:
xgb_model = xgb.XGBRegressor(
    objective='reg:squaredlogerror',
    eval_metric='rmsle',
    n_jobs=-1,
    verbosity=0
)


In [23]:
param_dist = {
    'n_estimators': randint(50, 300),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(3, 8),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 1),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1)
}

In [29]:
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_mean_squared_log_error',
    cv=3,
    verbose=2,
    random_state=42
)

In [30]:
random_search.fit(train_x, train_y)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END colsample_bytree=0.749816047538945, gamma=0.9507143064099162, learning_rate=0.15639878836228102, max_depth=7, n_estimators=70, reg_alpha=0.15601864044243652, reg_lambda=0.15599452033620265, subsample=0.6232334448672797; total time=   1.9s
[CV] END colsample_bytree=0.749816047538945, gamma=0.9507143064099162, learning_rate=0.15639878836228102, max_depth=7, n_estimators=70, reg_alpha=0.15601864044243652, reg_lambda=0.15599452033620265, subsample=0.6232334448672797; total time=   2.3s
[CV] END colsample_bytree=0.749816047538945, gamma=0.9507143064099162, learning_rate=0.15639878836228102, max_depth=7, n_estimators=70, reg_alpha=0.15601864044243652, reg_lambda=0.15599452033620265, subsample=0.6232334448672797; total time=   2.3s
[CV] END colsample_bytree=0.9464704583099741, gamma=0.6011150117432088, learning_rate=0.1516145155592091, max_depth=7, n_estimators=51, reg_alpha=0.7219987722668247, reg_lambda=0.9385527090157502

In [32]:
print(random_search.best_params_)

print(random_search.best_score_)

best_rmsle = np.sqrt(-random_search.best_score_)
print(best_rmsle)

{'colsample_bytree': 0.6027808522124762, 'gamma': 0.5107473025775657, 'learning_rate': 0.0934822006297558, 'max_depth': 7, 'n_estimators': 274, 'reg_alpha': 0.1198653673336828, 'reg_lambda': 0.33761517140362796, 'subsample': 0.9771638815650077}
-0.00509608651976958
0.07138687918497054


In [34]:
best_model = xgb.XGBRegressor(
    objective='reg:squaredlogerror',
    eval_metric='rmsle',
    colsample_bytree=0.6027,
    gamma=0.5107,
    learning_rate=0.0934,
    max_depth=7,
    n_estimators=274,
    reg_alpha=0.1989,
    reg_lambda=0.3376,
    subsample=0.9772,
    n_jobs=-1,
    verbosity=1
)

best_model.fit(
    train_x, train_y,
    eval_set=[(val_x, val_y)],
    verbose=True
)

[0]	validation_0-rmsle:3.61118
[1]	validation_0-rmsle:3.54333
[2]	validation_0-rmsle:3.47580
[3]	validation_0-rmsle:3.40864
[4]	validation_0-rmsle:3.34182
[5]	validation_0-rmsle:3.27536
[6]	validation_0-rmsle:3.20928
[7]	validation_0-rmsle:3.14358
[8]	validation_0-rmsle:3.07829
[9]	validation_0-rmsle:3.01339
[10]	validation_0-rmsle:2.94889
[11]	validation_0-rmsle:2.88525
[12]	validation_0-rmsle:2.82162
[13]	validation_0-rmsle:2.75843
[14]	validation_0-rmsle:2.69570
[15]	validation_0-rmsle:2.63344
[16]	validation_0-rmsle:2.57179
[17]	validation_0-rmsle:2.51063
[18]	validation_0-rmsle:2.44982
[19]	validation_0-rmsle:2.38951
[20]	validation_0-rmsle:2.32973
[21]	validation_0-rmsle:2.27152
[22]	validation_0-rmsle:2.21282
[23]	validation_0-rmsle:2.15472
[24]	validation_0-rmsle:2.09714
[25]	validation_0-rmsle:2.04013
[26]	validation_0-rmsle:1.98370
[27]	validation_0-rmsle:1.92790
[28]	validation_0-rmsle:1.87268
[29]	validation_0-rmsle:1.81812
[30]	validation_0-rmsle:1.76419
[31]	validation_0-

In [14]:
print(model.n_estimators)

50


In [35]:
val_y_pred = best_model.predict(val_x)

In [36]:
y_preds = best_model.predict(test_data)

In [37]:
# sanity check for y_preds
print(type(y_preds))
print(len(y_preds))
print(y_preds[0])
print(type(y_preds[0]))
print(y_preds[1])

<class 'numpy.ndarray'>
250000
27.000107
<class 'numpy.float32'>
106.39948


In [39]:
# Save submission
submission = pd.read_csv('C:/Users/User/OneDrive/Рабочий стол/playground-series-s5e5/sample_submission.csv')
submission["Calories"] = y_preds
submission.to_csv("submission.csv", index=False)
print('Sumission done!')
submission.head()

Sumission done!


Unnamed: 0,id,Calories
0,750000,27.000107
1,750001,106.399483
2,750002,88.167763
3,750003,125.061867
4,750004,73.526222


In [40]:
import os
print(os.getcwd())

C:\Users\User\Downloads


In [None]:
from xgboost import to_graphviz

for i in range(model.n_estimators):
    dot = to_graphviz(model, num_trees=i)
    dot.render(f'xgb_tree_{i}.dot')