## Data preprocess

In [1]:
from pathlib import Path
from openpyxl import Workbook # type: ignore
import os


def to_excel(data, filename, sheet_style, result_dir_path):
    workbook = Workbook()
    sheet = workbook.active

    # | user-response-time | request-number | response-ip   | process-time  |
    # | 0.3                | 10000          | 192.168.0.150 | 14.523432     |
    # | 0.5                | 20000          | 192.168.0.151 | 9.5232642     |

    sheet.append(sheet_style)

    for row in data:
        sheet.append(row)

    

    if not os.path.exists(result_dir_path):
        os.makedirs(result_dir_path, exist_ok=True)

    workbook.save(filename=f"{result_dir_path}\\{filename}.xlsx")


# read input dataset
filename_prefix = (
    "RANDclientv_single_worker_node-L1-RB1000-DTWedSep41611532024"
)


dataset_read_filename = filename_prefix
training_data_dir = Path.cwd().parent / "training_data"
# Data preprocessing
file_path = f"{training_data_dir}\\{dataset_read_filename}.xlsx"


# set result output filename and path
result_suffix = "result"

result_dir_path = Path.cwd().parent / "results" / "result_requestNumber_processTime"
version_index = len([_ for _ in Path(result_dir_path).iterdir() if _.is_file()])
version = f"_v{version_index}"

result_name = "requestNumber#responseTime" + version

result_output_filename = f"{filename_prefix}{result_name}{result_suffix}"

In [2]:

import pandas as pd # type: ignore
import numpy as np # type: ignore


def read_data(filename):
    df = pd.read_excel(filename)

    columns = df.columns.to_list()

    data_dict = {col: df[col].to_list() for col in columns}

    return data_dict


def data_preprocess(filepath):
    data = read_data(filepath)
    # TODO more...

    # to numpy
    for key in data.keys():
        data[key] = np.array(data[key])
    
    return data

In [3]:
dataset = data_preprocess(file_path)

## DATA Style View

In [4]:
# # dataset is a dictionary

# for col_header, col_data_list in dataset.items():
#     print(col_header, col_data_list)

## XGBOOST - 1

In [5]:
from xgboost import XGBRegressor # type: ignore
from sklearn.ensemble import RandomForestRegressor  # type: ignore

# Create XGBoost regression model
model = XGBRegressor(
    booster='gbtree',
    n_estimators=100,   # 基础学习器（树）的数量
    max_depth=5,        # 树的最大深度
    colsample_bytree=0.8,   # 随机选择特征的比例
    min_samples_split=2,  # 每个节点分裂所需的最小样本数（可以用于调节过拟合）
    min_samples_leaf=1,  # 每个叶子节点需要的最小样本数
    random_state=42,     # 保持结果可复现

)

In [6]:
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore

# # select value from dataset
X = np.array([dataset.get("request_num")]).T
y = dataset.get("processed_time")

# train_size = int(len(X) * 0.7)

# 拆分数据集为训练集和测试集
# split dataset to train dataset and test dataset

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=35
)

# model fit
model.fit(X_train, y_train)

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.



In [7]:
from sklearn.metrics import mean_absolute_error # type: ignore

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mea = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mea}")


print(f"    real    |    pred   \n")
for i in range(len(y_test)):
    print(f"    {y_test[i]}     |     {y_pred[i]}      ")

Mean Squared Error: 0.0016457363980903778
Mean Absolute Error: 0.02719065760960802
    real    |    pred   

    3.784573554992676     |     3.706636428833008      
    5.100839138031006     |     5.0438151359558105      
    0.8065564632415771     |     0.7706516981124878      
    0.08397722244262695     |     0.0681023821234703      
    2.898449897766113     |     2.9247689247131348      
    0.09162354469299316     |     0.09273232519626617      
    4.110558271408081     |     4.088840007781982      
    4.19347095489502     |     4.164370536804199      
    6.263468980789185     |     6.166304111480713      
    0.911158561706543     |     0.9088114500045776      
    8.254207611083984     |     8.249679565429688      
    7.902703285217285     |     7.826289176940918      
    5.927728652954102     |     5.910335063934326      
    1.307764291763306     |     1.3098516464233398      
    8.284693479537964     |     8.249679565429688      
    2.202848434448242     |     2.19721

## XGBRegressor

In [8]:
from xgboost import XGBRegressor # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore
from sklearn.metrics import root_mean_squared_error # type: ignore
from sklearn.model_selection import GridSearchCV # type: ignore


# 创建 XGBoost 回归模型
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    tree_method="hist",
    device="cuda",
)

# Define the parameter grid for GridSearchCV
param_grid = {
    "max_depth": [3],  # Range of maximum depth of trees to explore  扩展最大深度的范围
    "learning_rate": [0.1],  # Range of learning rates to explore  扩展学习率的范围
    "n_estimators": [100],  # Range of number of trees (boosting rounds) to explore  扩展树的数量范围
    "gamma": [0],   # Range of gamma parameter to explore (controls tree split) 扩展 gamma 参数范围
    "subsample": [1],  # Range of subsample ratios to explore  扩展子样本比例范围
    "colsample_bytree": [0.8, 0.7],  # Range of column subsample ratios for each tree to explore  扩展列采样比例范围
    "reg_alpha": [0, 0.5],  # Range of L1 regularization parameters to explore  扩展 L1 正则化参数范围
    "reg_lambda": [0, 0.5],  # Range of L2 regularization parameters to explore  扩展 L2 正则化参数范围
}


grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)
grid_search.fit(X_train, y_train)

print("Best parameters found:\n")
for k, v in grid_search.best_params_.items():
    print('\t', k, ":", v)


# 使用最佳参数重新训练模型
# Re-train the model using the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train, verbose=True)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Best parameters found:

	 colsample_bytree : 0.8
	 gamma : 0
	 learning_rate : 0.1
	 max_depth : 3
	 n_estimators : 100
	 reg_alpha : 0
	 reg_lambda : 0.5
	 subsample : 1


In [9]:
preds = best_model.predict(X_test)


rmse = root_mean_squared_error(y_test, preds, squared=False)
print(f"RMSE: {rmse}")

mae = mean_absolute_error(y_test, preds)
print(f"MAE: {mae}")


avg = np.mean(y_test)

print(f"AVG: {avg}")

# write into excal

sheet_style = [ "num", "test", "prediction", "difference", "accuracy" ]

data_list = list()


for i in range(len(y_test)):
    acc = 0
    diff = abs(y_test[i] - preds[i])
    if preds[i] < 0:
        acc = 0
    else:
        rate = diff / y_test[i]
        if rate < 1 and rate >= 0:
            acc = 1 - rate
        if rate > 1:
            acc = 0

    data_list.append(
        [
            X_test[i][0],
            y_test[i],
            preds[i],
            diff,
            round(float(acc), 5),
        ],
    )


to_excel(data_list, result_output_filename, sheet_style, result_dir_path)

print("Results saved finished.")

RMSE: 0.04086084753533755
MAE: 0.027681089639663706
AVG: 3.547980902194977
Results saved finished.




In [10]:
from pathlib import Path
import os

model_dir_path = str(Path.cwd() / "modelsfile")

if not os.path.exists(model_dir_path):
    os.makedirs(model_dir_path)

model_name = "xgb_number_time_gbtree.json"
best_model.save_model(f"{model_dir_path}/{model_name}")

In [11]:
print(X_test.shape)
print(preds)

(200, 1)
[3.7136917  5.0630326  0.7569058  0.06908131 2.9326952  0.09344149
 4.100716   4.173187   6.1522493  0.91331625 8.246697   7.824644
 5.91       1.2967272  8.246697   2.2035398  0.20638752 6.7672887
 2.3682895  3.0356736  5.789126   4.683747   0.5207846  8.028667
 3.7136917  0.39644408 2.6108723  1.3991936  0.67881656 2.840823
 4.3139005  1.932315   1.7056834  4.2342734  6.6682105  1.5236467
 1.6058408  3.5143502  3.8432152  4.9874954  0.14677763 0.06265378
 2.9356163  7.639103   1.7056834  1.4081917  0.59087014 6.5636053
 1.5236467  2.2035398  4.9874954  0.00992465 4.2019553  6.6208143
 0.47682762 5.4746084  1.7056834  6.931133   0.18449807 8.246697
 1.4081917  0.20638752 0.91867304 1.6058408  5.6723433  6.406247
 3.5143502  5.0630326  2.960386   3.7121232  7.2230515  0.09344149
 2.2035398  1.2967272  1.3866926  0.06265378 1.8469865  6.7672887
 3.4436142  8.081322   4.683747   2.879584   3.2908394  0.9971411
 8.246697   2.46221    0.23584032 0.3915043  3.4436142  4.9874954
 1.