## Data preprocessing

In [1]:
from pathlib import Path
from openpyxl import Workbook # type: ignore
from pathlib import Path
import os


def to_excel(data, filename, sheet_style, result_dir_path):
    """Write tabular rows to ``<result_dir_path>/<filename>.xlsx``.

    The first worksheet row is the header given by ``sheet_style``; every
    item of ``data`` is appended below it as one row, for example::

        | user-response-time | request-number | response-ip   | process-time |
        | 0.3                | 10000          | 192.168.0.150 | 14.523432    |
        | 0.5                | 20000          | 192.168.0.151 | 9.5232642    |

    Args:
        data: Iterable of rows; each row is a sequence of cell values.
        filename: Output file name without the ``.xlsx`` extension.
        sheet_style: Sequence of column headers written as the first row.
        result_dir_path: Output directory (``str`` or ``Path``). It is
            created, including missing parents, when it does not exist yet.
    """
    workbook = Workbook()
    sheet = workbook.active

    sheet.append(sheet_style)
    for row in data:
        sheet.append(row)

    output_dir = Path(result_dir_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Join with pathlib instead of a hard-coded "\\": on POSIX systems a
    # backslash is an ordinary file-name character, so the previous f-string
    # saved the workbook beside the results directory instead of inside it.
    workbook.save(filename=str(output_dir / f"{filename}.xlsx"))


# read input dataset
filename_prefix = (
    "RANDclientv_single_worker_node-L3-RB200-DTFriAug231800442024"
)


dataset_read_filename = filename_prefix
training_data_dir = Path.cwd().parent / "training_data"
# Data preprocessing
# Build the path with pathlib rather than a hard-coded "\\" so it also
# resolves on non-Windows systems; kept as a plain string as before.
file_path = str(training_data_dir / f"{dataset_read_filename}.xlsx")


# set result output filename and path
result_suffix = "result"

result_dir_path = Path.cwd().parent / "results" / "result_requestNumber_processTime"
# The version index is the number of files already present in the results
# directory. On a first run that directory does not exist yet, so start at 0
# instead of letting iterdir() raise FileNotFoundError.
if result_dir_path.is_dir():
    version_index = sum(1 for entry in result_dir_path.iterdir() if entry.is_file())
else:
    version_index = 0
version = f"_v{version_index}"

result_name = "requestNumber#responseTime" + version

result_output_filename = f"{filename_prefix}{result_name}{result_suffix}"

In [2]:

import pandas as pd # type: ignore
import numpy as np # type: ignore


def read_data(filename):
    """Load an Excel sheet and return its columns as plain Python lists.

    Args:
        filename: Path of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        dict mapping each column header to the list of that column's values,
        in the column order of the sheet.
    """
    frame = pd.read_excel(filename)

    columns_as_lists = {}
    for header in frame.columns.to_list():
        columns_as_lists[header] = frame[header].to_list()

    return columns_as_lists


def data_preprocess(filepath):
    """Read the dataset at ``filepath`` and convert every column to NumPy.

    Args:
        filepath: Location of the Excel file, forwarded to ``read_data``.

    Returns:
        The dict produced by ``read_data`` with each column's value list
        replaced by a ``numpy.ndarray``.
    """
    columns = read_data(filepath)
    # TODO more...

    # Swap each column's list for an array; keys are left untouched.
    for header, values in list(columns.items()):
        columns[header] = np.array(values)

    return columns

In [3]:
# Load the workbook at `file_path` into a dict of NumPy arrays keyed by column header.
dataset = data_preprocess(file_path)

## Dataset structure preview

In [4]:
# # dataset is a dictionary

# for col_header, col_data_list in dataset.items():
#     print(col_header, col_data_list)

## XGBOOST - 1

In [5]:
from xgboost import XGBRegressor # type: ignore

# Create XGBoost regression model
model = XGBRegressor(
    objective="reg:squarederror",  # Loss function to minimize: squared error for regression
    n_estimators=100,  # Number of boosting rounds (number of trees)
    learning_rate=0.1,  # Boosting learning rate (controls the step size)
    max_depth=5,  # Maximum depth of a tree
    min_child_weight=1,  # Minimum sum of instance weight (hessian) needed in a child
    subsample=0.8,  # Subsample ratio of the training instances (randomly sampled)
    colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
    # Use the scikit-learn wrapper's own parameter name `reg_alpha` (the native
    # alias `alpha` was used before) so it matches `reg_lambda` below.
    reg_alpha=0.01,  # L1 regularization term on weights
    reg_lambda=0.01,  # L2 regularization term on weights
)

In [6]:
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore

# Select the feature and target columns from the dataset.
# Index with [] rather than .get(): a missing column then fails right here
# with a KeyError instead of silently passing None on to the model.
X = np.asarray(dataset["request_num"]).reshape(-1, 1)  # one feature -> (n_samples, 1)
y = dataset["processed_time"]

# train_size = int(len(X) * 0.7)

# Split the dataset into a training set and a test set (80% / 20%);
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=35
)

# model fit
model.fit(X_train, y_train)

In [7]:
from sklearn.metrics import mean_absolute_error # type: ignore

# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mea = mean_absolute_error(y_test, y_pred)  # mean absolute error

print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mea}",
    sep="\n",
)


# Side-by-side listing of each test target and its prediction.
print("    real    |    pred   \n")
for actual, predicted in zip(y_test, y_pred):
    print(f"    {actual}     |     {predicted}      ")

Mean Squared Error: 12.675306526396042
Mean Absolute Error: 2.058156448602676
    real    |    pred   

    0.6989784240722656     |     0.6673154830932617      
    2.005793809890747     |     2.479135751724243      
    2.39279317855835     |     1.9157354831695557      
    46.00368499755859     |     48.412723541259766      
    11.79561305046082     |     16.911563873291016      
    6.003100395202637     |     5.406058311462402      
    7.189697504043579     |     6.518836498260498      
    39.24481439590454     |     38.65833282470703      
    25.80125069618225     |     30.931686401367188      
    8.505000114440918     |     8.993255615234375      
    37.20314168930054     |     42.87421798706055      
    10.89616775512695     |     15.22908878326416      
    12.51950454711914     |     11.887918472290039      
    20.69741058349609     |     22.341907501220703      
    28.41277241706848     |     22.051456451416016      
    22.81205677986145     |     20.9167060852050

## XGBRegressor

In [8]:
from xgboost import XGBRegressor # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore
from sklearn.model_selection import GridSearchCV # type: ignore


# Create the XGBoost regression model used as the grid-search base estimator.
# NOTE(review): device="cuda" requires a CUDA-capable GPU; the recorded output
# of this cell shows a device-mismatch warning from XGBoost — confirm whether
# GPU training is actually wanted for this dataset.
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    tree_method="hist",
    device="cuda",
)

# Define the parameter grid for GridSearchCV
param_grid = {
    "max_depth": [3],  # Candidate maximum tree depths
    "learning_rate": [0.1],  # Candidate learning rates
    "n_estimators": [100],  # Candidate numbers of trees (boosting rounds)
    "gamma": [0],  # Candidate gamma values (controls tree splits)
    "subsample": [1],  # Candidate row subsample ratios
    "colsample_bytree": [0.8, 0.7],  # Candidate column subsample ratios per tree
    "reg_alpha": [0, 0.5],  # Candidate L1 regularization strengths
    "reg_lambda": [0, 0.5],  # Candidate L2 regularization strengths
}


grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)
grid_search.fit(X_train, y_train)

print("Best parameters found:\n")
for k, v in grid_search.best_params_.items():
    print('\t', k, ":", v)


# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# re-trained on the full training set with the best parameters; fitting it a
# second time here was redundant work and has been dropped.
best_model = grid_search.best_estimator_

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Best parameters found:

	 colsample_bytree : 0.8
	 gamma : 0
	 learning_rate : 0.1
	 max_depth : 3
	 n_estimators : 100
	 reg_alpha : 0.5
	 reg_lambda : 0.5
	 subsample : 1


In [9]:
preds = best_model.predict(X_test)


# The `squared=False` flag of mean_squared_error is deprecated in recent
# scikit-learn releases (1.4+, scheduled for removal); taking the square root
# of the MSE gives the same RMSE on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
print(f"RMSE: {rmse}")

mae = mean_absolute_error(y_test, preds)
print(f"MAE: {mae}")


avg = np.mean(y_test)

print(f"AVG: {avg}")

# write into excel

sheet_style = [ "num", "test", "prediction", "difference", "accuracy" ]

data_list = []


for features, actual, predicted in zip(X_test, y_test, preds):
    diff = abs(actual - predicted)
    if predicted < 0 or actual <= 0:
        # A negative prediction, or a non-positive true value (for which the
        # relative error is undefined or negative), scores zero. Checking
        # first also avoids dividing by zero.
        acc = 0
    else:
        # Accuracy is 1 - relative error, floored at 0 once the error
        # reaches 100% or more of the true value.
        acc = max(0.0, 1 - diff / actual)

    data_list.append(
        [
            features[0],
            actual,
            predicted,
            diff,
            round(float(acc), 5),
        ],
    )


to_excel(data_list, result_output_filename, sheet_style, result_dir_path)

print("Results saved finished.")

RMSE: 3.305333571209699
MAE: 1.8699710210164389
AVG: 18.91339144309362
Results saved finished.




In [10]:
# from pathlib import Path
# import os

# model_dir_path = str(Path.cwd() / "modelsfile")

# if not os.path.exists(model_dir_path):
#     os.makedirs(model_dir_path)

# model_name = "xgb_number_time.json"
# best_model.save_model(f"{model_dir_path}/{model_name}")

In [11]:
# Show the test-feature matrix shape, then the predictions, one per line.
print(X_test.shape, preds, sep="\n")

(120, 1)
[ 0.6839676   2.5954876   1.9800949  47.241005   16.681805    5.4483747
  6.4328403  43.057808   28.443588    9.030518   39.06994    15.666276
 11.529556   21.952684   24.016214   20.62253    14.7668085  30.71582
  1.2100124  23.030752    3.420988   24.016214    3.420988    3.420988
 17.8926     34.26923    11.529556    0.4252472  34.26923    46.71734
 20.62253    11.050804    7.382084    5.4483747  14.7668085  34.26923
 10.418309    8.128008   17.8926     40.834183    0.30345726  8.128008
 28.592094   36.81553    28.443588   34.50181    46.11927     5.8140106
 42.92182    16.681805   20.62253    19.123333   23.030752   18.911535
 14.7668085  45.430283    0.72522926 21.952684   40.834183   28.443588
  0.5190258  41.220818    2.5954876   4.614149    1.9800949   2.5954876
 19.911436   19.123333   41.94668     1.4740429   5.8140106  31.404522
 37.423878    9.4028425  20.62253     7.382084   26.743797   11.698466
  5.8140106  18.356155   27.489992   41.220818   15.666276   25.8873