# Preparing training data

In [1]:
# -- import packages -- 
from pathlib import Path
import numpy as np
import xgboost as xgb
import json
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Locate the JSON training set: it lives in the `training_data` directory that
# sits beside the current working directory (i.e. under the CWD's parent).
training_data_file_name = 'data_1.json'
training_data_file_path = Path().absolute().parent / 'training_data' / training_data_file_name

# Parse the whole file; the code below treats the result as a dict.
with training_data_file_path.open('r', encoding='utf-8') as f:
    training_data: dict = json.load(f)

# Keys are converted to integers and used as the only feature; the mapped
# values are used as the regression targets.
keys = list(map(int, training_data.keys()))
values = list(training_data.values())

# X is a single-column 2-D array (one row per key); y holds the targets.
X = np.array(keys).reshape(-1, 1)
y = np.array(values)

In [5]:
# Disabled alternative data source (kept as a note only): 'data.json' in the
# same `training_data` directory, with X built from its 'request_number' list
# and y from either 'real_processing_time' or the element-wise sum of
# 'user_cpu_time' and 'system_cpu_time'.

# Both splits share one seed so the partition is reproducible.
split_seed = 43

# Step 1: hold out 20% of the samples; the remaining 80% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=split_seed)

# Step 2: cut the held-out part in half -> validation and test (roughly 10% each).
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=split_seed)

# XGBoost

In [6]:
# Wrap each split in a DMatrix.
# NOTE(review): none of these three objects is referenced again in the visible
# cells -- the estimator below is fitted on the raw arrays. Confirm whether
# they are still needed.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=targets)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Configure the XGBoost regressor (scikit-learn style estimator).
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=50,        # number of base learners (trees)
    max_depth=3,            # maximum depth of each tree
    colsample_bytree=0.8,   # fraction of features sampled per tree
    random_state=12,        # fixed seed so results are reproducible
)

# Train on the training split, then score on the held-out test split.
# NOTE(review): the validation split (X_val / y_val) is not used for fitting
# or evaluation in this cell.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mea = mean_absolute_error(y_test, y_pred)   # mean absolute error
test_mea = y_test.mean()                    # mean of the true test targets, for scale

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mea}")
print(f"Mean Test Data: {test_mea}")


# Side-by-side listing of every true test target and its prediction.
print("    real    |    pred   \n")
for actual, predicted in zip(y_test, y_pred):
    print(f"    {actual}     |     {predicted}      ")

Mean Squared Error: 0.0027731380010083805
Mean Absolute Error: 0.03743176555089017
Mean Test Data: 4.703168378208698
    real    |    pred   

    4.442436002000022     |     4.419015884399414      
    4.603330532999962     |     4.643780708312988      
    4.938789699999688     |     4.833023548126221      
    6.23605574999965     |     6.227128982543945      
    1.2103643779996673     |     1.17966890335083      
    4.804459391000364     |     4.77417516708374      
    0.6354300030000104     |     0.6405737996101379      
    2.1126575640000738     |     2.1278228759765625      
    2.385598602999835     |     2.292057752609253      
    2.2370804770002906     |     2.2390105724334717      
    8.31833331100006     |     8.32686996459961      
    2.967242875000011     |     3.0041756629943848      
    0.9777039099999456     |     1.0331015586853027      
    0.6302781800004595     |     0.6405737996101379      
    0.7560727120003321     |     0.7831546664237976      
    5.83

# Save model

In [7]:
# Persist the trained model to ./modelsfile/xgboost_newest_model_4.json
# (path is relative to the current working directory).
model_output_path = Path() / 'modelsfile' / 'xgboost_newest_model_4.json'

# save_model does not create missing directories; make sure the target folder
# exists so a fresh checkout / fresh kernel run does not fail here.
model_output_path.parent.mkdir(parents=True, exist_ok=True)

model.save_model(model_output_path)