## Data preprocess

In [47]:
import numpy as np
import pandas as pd


def one_hot_encoding(arr):
    s = set()
    _sd = dict()
    for e in arr:
        s.add(e)

    sl = list(s)
    for i in range(len(sl)):
        encode = np.zeros(len(sl))
        encode[i] = 1
        _sd[sl[i]] = encode

    encoded_data = []
    for k in range(len(arr)):
        encoded_data.append(_sd[arr[k]])

    return np.array(encoded_data)


def data_preprocess(filepath):
    df = pd.read_excel(filepath)
    data = df.to_numpy()

    ip = data[:, 2]

    encoded_ip = one_hot_encoding(ip)

    # print(encoded_ip)

    data[:, 2] = encoded_ip.tolist()

    # print(data)
    expanded_data = []
    for row in data:
        expanded_data.append(np.concatenate([row[:2], row[2], row[3:]]).tolist())

    expanded_data = np.array(expanded_data)

    for i in range(len(expanded_data)):
        for j in range(len(expanded_data[i])):
            expanded_data[i][j] = float(expanded_data[i][j])

    return expanded_data[50:200]


filename = "outputv2_5_wait_5s_3(300)"

# Data preprocessing
file_path = f"D:\\model_fit\\training\\training_data\\{filename}.xlsx"
dataset = data_preprocess(file_path)

print(dataset)

[[2.44104e+01 6.29400e+03 1.00000e+00 ... 3.78289e+01 3.86555e+01
  8.82350e+00]
 [3.31840e+00 2.62964e+05 0.00000e+00 ... 1.77686e+01 1.85185e+01
  1.06838e+01]
 [1.35354e+01 6.93728e+05 0.00000e+00 ... 3.82060e+01 3.86555e+01
  2.97468e+01]
 ...
 [2.16819e+01 8.48021e+05 1.00000e+00 ... 1.77966e+01 1.73770e+01
  8.54700e+00]
 [4.88360e+00 3.41019e+05 1.00000e+00 ... 3.95639e+01 3.93162e+01
  8.91090e+00]
 [2.99787e+01 9.81041e+05 0.00000e+00 ... 1.79372e+01 1.75497e+01
  8.73360e+00]]


## XGBOOST

In [48]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X, y = dataset[:, 1:], dataset[:, 0]


train_size = int(len(X) * 0.7)

# 拆分数据集为训练集和测试集

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# time series
# X_train = X[:train_size]
# X_test = X[train_size:]
# y_train = y[:train_size]
# y_test = y[train_size:]

In [49]:
from keras.callbacks import TensorBoard # type: ignore
import os


# log
log_name = "xgboost_v2"
log_dir = os.path.join("logs", "fit", log_name)
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

from xgboost import XGBRegressor

# 创建 XGBoost 回归模型
model = XGBRegressor(
    objective="reg:squarederror",  # 损失函数为平方损失函数
    n_estimators=100,  # 迭代次数，即基础学习器的数量
    learning_rate=0.1,  # 学习率，控制每次更新的步长
    max_depth=5,  # 树的最大深度
    min_child_weight=1,  # 叶子节点最小权重
    subsample=0.8,  # 每棵树随机采样的比例
    colsample_bytree=0.8,  # 每棵树随机选择的特征比例
    alpha=0.01,  # L1 正则化项参数
    reg_lambda=0.01,  # L2 正则化项参数
)

# 训练模型
model.fit(X_train, y_train)

In [50]:
from sklearn.metrics import mean_absolute_error

# 使用模型进行预测
y_pred = model.predict(X_test)

# 评估模型性能
mse = mean_squared_error(y_test, y_pred)
mea = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Squared Error: {mea}")


print(f"    real    |    pred   \n")
for i in range(len(y_test)):
    print(f"    {y_test[i]}     |     {y_pred[i]}      ")


Mean Squared Error: 66.42647817165897
Mean Squared Error: 6.039760605061848
    real    |    pred   

    3.5922     |     5.4307942390441895      
    19.7885     |     12.091316223144531      
    2.0893     |     10.195109367370605      
    20.3672     |     19.84484100341797      
    11.4507     |     14.701212882995605      
    13.1559     |     19.59515380859375      
    9.073     |     18.11901092529297      
    7.141     |     9.271097183227539      
    3.213     |     12.86149787902832      
    2.9346     |     4.951238632202148      
    26.4955     |     6.137820243835449      
    21.9665     |     12.499114990234375      
    8.0869     |     16.56372833251953      
    25.2409     |     20.36342430114746      
    21.8458     |     21.367839813232422      
    0.0348     |     22.50376319885254      
    6.3234     |     12.678730010986328      
    24.3725     |     9.200037956237793      
    16.6615     |     18.311161041259766      
    1.606     |     4.375883

In [51]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

# 创建 XGBoost 回归模型
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    tree_method="hist",
    device="cuda",
)

# 使用 GridSearchCV 进行参数调优
param_grid = {
    "max_depth": [3],  # 扩展最大深度的范围
    "learning_rate": [0.1],  # 扩展学习率的范围
    "n_estimators": [100],  # 扩展树的数量范围
    "gamma": [0],  # 扩展 gamma 参数范围
    "subsample": [1],  # 扩展子样本比例范围
    "colsample_bytree": [0.8, 0.7],  # 扩展列采样比例范围
    "reg_alpha": [0, 0.5],  # 扩展 L1 正则化参数范围
    "reg_lambda": [0, 0.5],  # 扩展 L2 正则化参数范围
}


grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

# 使用最佳参数重新训练模型
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train, verbose=True)


# 预测
preds = best_model.predict(X_test)

# 评估模型
rmse = mean_squared_error(y_test, preds, squared=False)
print(f"RMSE: {rmse}")

# 计算 MAE
mae = mean_absolute_error(y_test, preds)
print(f"MAE: {mae}")


import numpy as np

avg = np.mean(y_test)

print(f"AVG: {avg}")

print(f"    real    |    pred   \n")
a = 0
for i in range(len(y_test)):
    if (abs(y_test[i] - y_pred[i])) / y_test[i] <= 0.1:
        a += 1
    print(
        f"    {y_test[i]}     |     {y_pred[i]}      ",
        end=f"|   diff: {abs(y_test[i] - y_pred[i])}      |       {(abs(y_test[i] - y_pred[i])) / y_test[i]}\n",
    )

print(f"Accuracy in 0.9: {round(100 * a / len(y_test), 3)}%")

Best parameters found:  {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 1}
RMSE: 7.784007551823744
MAE: 5.8053056422678635
AVG: 11.946023333333335
    real    |    pred   

    3.5922     |     5.4307942390441895      |   diff: 1.8385942390441894      |       0.511829586059849
    19.7885     |     12.091316223144531      |   diff: 7.697183776855468      |       0.38897257381082284
    2.0893     |     10.195109367370605      |   diff: 8.105809367370606      |       3.879677101120282
    20.3672     |     19.84484100341797      |   diff: 0.5223589965820317      |       0.025647069630682258
    11.4507     |     14.701212882995605      |   diff: 3.250512882995606      |       0.28387023352245766
    13.1559     |     19.59515380859375      |   diff: 6.439253808593749      |       0.4894574912087922
    9.073     |     18.11901092529297      |   diff: 9.046010925292968      |       0.9970253417

