# Preparing training data

In [1]:
# -- import packages -- 
from pathlib import Path
import numpy as np
import xgboost as xgb
import json
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Load the training data; the file is looked up in ../training_data relative
# to the current working directory.
training_data_file_name = 'training_data_16.json'
training_data_file_path = Path().absolute().parent / 'training_data' / training_data_file_name

with open(training_data_file_path, 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Extract the target and the three feature columns.
# The keys are indexed directly (rather than via .get) so that a missing key
# raises a KeyError naming it, instead of yielding None and failing later
# with an unrelated TypeError on len().
waiting_time_array = np.array(training_data['real_waiting_time'])
request_number = training_data['request_number']
predicted_processing_time = training_data['predicted_processing_time']
waiting_jobs = training_data['waiting_jobs']

# Make sure every column holds the same number of samples
assert len(request_number) == len(predicted_processing_time) == len(waiting_jobs) == len(waiting_time_array), "All lists must have the same length"

# Feature matrix: one row per sample, columns ordered as
# (request_number, predicted_processing_time, waiting_jobs)
X = np.column_stack((request_number, predicted_processing_time, waiting_jobs))

# Target variable (already a NumPy array, so no second conversion is needed)
y = waiting_time_array

# Print the data shapes
print(X.shape, y.shape)

# Split the dataset: 70% for training, then the remaining 30% is halved
# into a validation set and a test set.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=41)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=43)


(500, 3) (500,)


# XGBoost

In [3]:
# Create XGBoost regression model.
# The scikit-learn style estimator is fit directly on the NumPy arrays below,
# so no xgb.DMatrix objects are constructed in this cell.
# NOTE(review): X_val / y_val are not used by this cell.
model = XGBRegressor(
    booster='gbtree',
    n_estimators=21,        # number of base learners (trees)
    max_depth=3,            # maximum depth of each tree
    colsample_bytree=0.7,   # fraction of features randomly sampled per tree
    random_state=22,        # fixed seed so results are reproducible
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error metrics on the held-out test set
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
y_test_mean = np.mean(y_test)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean of y_test: {y_test_mean}")
# "Accuracy" here is 1 - MAE / mean(y_test), i.e. a relative-error score,
# not a classification accuracy.
print(f"Accuracy: {(y_test_mean - mae) / y_test_mean}")

# Side-by-side listing of true and predicted values for every test sample
print("    real    |    pred   \n")
for real, pred in zip(y_test, y_pred):
    print(f"    {real}     |     {pred}      ")

Mean Squared Error: 863.7795830452054
Mean Absolute Error: 22.216949783960978
Mean of y_test: 1304.0554750665028
Accuracy: 0.9829631866061312
    real    |    pred   

    105.54915356636047     |     110.3502426147461      
    989.0137057304382     |     966.6753540039062      
    2298.191405057907     |     2276.81982421875      
    2151.064654111862     |     2191.77685546875      
    2441.9931440353394     |     2424.07080078125      
    1306.0559763908386     |     1313.2608642578125      
    255.08117651939392     |     290.3401184082031      
    2288.3678312301636     |     2248.03662109375      
    1676.9705114364624     |     1677.49853515625      
    1604.2398567199707     |     1590.085205078125      
    1630.6549534797668     |     1620.018310546875      
    1491.9514214992523     |     1489.5628662109375      
    750.0098257064819     |     747.092529296875      
    2285.8876373767853     |     2262.430419921875      
    975.5771358013153     |     984.106140

# Save model

In [4]:
# Save the trained model to ./modelsfile (relative to the current working
# directory). The directory is created first so the save does not fail when
# it is missing, e.g. on a fresh checkout.
model_dir = Path() / 'modelsfile'
model_dir.mkdir(parents=True, exist_ok=True)
model.save_model(model_dir / 'xgboost_waiting_time_model_1.json')