## Data preprocessing

In [78]:
from pathlib import Path
from openpyxl import Workbook # type: ignore
from pathlib import Path
import os


def to_excel(data, filename, sheet_style, result_dir_path):
    """Write a header row plus data rows to ``<result_dir_path>/<filename>.xlsx``.

    Args:
        data: Iterable of rows; each row is an iterable of cell values.
        filename: Output file name without the ``.xlsx`` extension.
        sheet_style: Header row, appended before the data rows.
        result_dir_path: Destination directory (``str`` or ``Path``). It is
            created, including missing parents, when it does not exist.
    """
    workbook = Workbook()
    sheet = workbook.active

    # Example of the resulting sheet layout:
    # | user-response-time | request-number | response-ip   | process-time  |
    # | 0.3                | 10000          | 192.168.0.150 | 14.523432     |
    # | 0.5                | 20000          | 192.168.0.151 | 9.5232642     |

    sheet.append(sheet_style)

    for row in data:
        sheet.append(row)

    # Build the path with pathlib instead of a hard-coded "\\" separator, which
    # only forms a real directory/file path on Windows.
    output_dir = Path(result_dir_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    workbook.save(filename=str(output_dir / f"{filename}.xlsx"))


# --- Input dataset location ---
filename_prefix = "RandomRequestNumberclientv_single_worker_node#loops1#requests_batch50#Thu-Aug--8-13-18-05-2024"
dataset_read_filename = filename_prefix
training_data_dir = Path.cwd().parent / "training_data" / "data_set5"
# Join with pathlib so the path is valid on every OS; a hard-coded "\\"
# separator only works on Windows. Kept as a string, as before.
file_path = str(training_data_dir / f"{dataset_read_filename}.xlsx")


# --- Result output filename and location ---
result_suffix = "result"

result_dir_path = Path.cwd().parent / "results" / "result_processTime_waitTasks"
# The version index is the number of files already present in the results
# directory. On a first run the directory does not exist yet (it is only
# created when results are saved), so start at 0 instead of letting iterdir()
# raise FileNotFoundError.
if result_dir_path.is_dir():
    version_index = sum(1 for entry in result_dir_path.iterdir() if entry.is_file())
else:
    version_index = 0
version = f"_v{version_index}"

result_name = "processTime#waitTasks" + version

result_output_filename = f"{filename_prefix}{result_name}{result_suffix}"

In [79]:

import pandas as pd # type: ignore
import numpy as np # type: ignore


def read_data(filename):
    """Read an Excel sheet and return its columns as plain Python lists.

    Args:
        filename: Path of the spreadsheet, passed straight to ``pd.read_excel``.

    Returns:
        dict mapping each column header to the list of that column's values.
    """
    frame = pd.read_excel(filename)

    column_values = {}
    for header in frame.columns.to_list():
        column_values[header] = frame[header].to_list()

    return column_values


def data_preprocess(filepath):
    """Load the spreadsheet at ``filepath`` and convert every column to a NumPy array.

    Args:
        filepath: Path of the spreadsheet, forwarded to ``read_data``.

    Returns:
        dict mapping each column header to an ``np.ndarray`` of its values.
    """
    raw_columns = read_data(filepath)
    # TODO more...

    # to numpy
    return {header: np.array(values) for header, values in raw_columns.items()}

In [80]:
# Load every spreadsheet column as a NumPy array, keyed by column header.
dataset = data_preprocess(file_path)

# Regression target: element-wise sum of the worker-node processing time and
# the manager-node processed-and-waited time, stored as a plain list.
dataset_com = np.add(
    dataset.get('processed_time_on_worker_node'),
    dataset.get('processed_and_waited_time_on_manager_node'),
).tolist()

## Dataset overview

In [81]:
# `dataset` is a dictionary: column header -> NumPy array of that column.
# Print every header followed by its values.
for header_and_values in dataset.items():
    print(*header_and_values)

request_num [334669 381017 271005 372250 412523 389111 388174 499351 256294 238304
 148812 143071 260802 365724  42800 435893 379073 106535  75975 407824
  80965 353201 476930   1804 283129  95815 146490 266507 343104 119423
 103954  88575 474892 392376 498225 494710 303411 355176 230177 484124
 497595 204408 269840  27114 170577 419312 471736 416527 209154 461702]
return_result [28770 32374 23727 31685 34811 32993 32921 41486 22547 21084 13739 13268
 22897 31201  4476 36639 32222 10153  7479 34438  7924 30230 39777   279
 24693  9236 13549 23353 29427 11249  9925  8580 39607 33264 41398 41126
 26273 30395 20452 40327 41352 18335 23631  2974 15548 35336 39368 35122
 18727 38588]
user_cpu_time [3.71 4.5  2.77 4.3  4.99 4.57 4.57 6.56 2.5  2.26 1.17 1.11 2.58 4.19
 0.22 5.4  4.42 0.73 0.47 4.88 0.51 3.99 6.17 0.   2.9  0.64 1.14 2.65
 3.83 0.86 0.72 0.58 6.1  4.64 6.56 6.48 3.2  4.01 2.16 6.3  6.54 1.81
 2.7  0.13 1.4  5.09 6.03 5.06 1.87 5.86]
system_cpu_time [0.   0.   0.   0.   0.   0

## XGBOOST - 1

In [82]:
from xgboost import XGBRegressor # type: ignore

# Create the baseline XGBoost regression model.
# L1 regularisation is passed as `reg_alpha`, the scikit-learn wrapper's own
# parameter name, rather than through the native alias `alpha`. This keeps it
# consistent with `reg_lambda` and with the grid-search parameter names.
model = XGBRegressor(
    objective="reg:squarederror",  # Loss to minimise: squared error for regression
    n_estimators=100,  # Number of boosting rounds (number of trees)
    learning_rate=0.1,  # Boosting learning rate (step size of each update)
    max_depth=5,  # Maximum depth of a tree
    min_child_weight=1,  # Minimum sum of instance weight (hessian) needed in a child
    subsample=0.8,  # Fraction of training rows sampled for each tree
    colsample_bytree=0.8,  # Fraction of columns sampled when building each tree
    reg_alpha=0.01,  # L1 regularisation term on weights
    reg_lambda=0.01,  # L2 regularisation term on weights
)

In [83]:
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore

# Feature matrix: one row per sample with two columns,
# [request_num, processing_tasks_on_manager_node].
X = np.column_stack(
    (
        dataset.get("request_num"),
        dataset.get("processing_tasks_on_manager_node"),
    )
)

# Regression target: the combined time computed earlier as `dataset_com`.
y = np.array(dataset_com)


# Split the dataset into a training set and a test set
# (80% / 20%, fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

# Fit the baseline model on the training split.
model.fit(X_train, y_train)

In [84]:
from sklearn.metrics import mean_absolute_error # type: ignore

# Predict on the held-out test split with the baseline model.
y_pred = model.predict(X_test)

# Error metrics. `mea` holds the mean absolute error; the variable name is
# kept unchanged because it is a notebook-level name.
mse = mean_squared_error(y_test, y_pred)
mea = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mea}")


# Side-by-side listing of actual vs. predicted values.
print("    real    |    pred   \n")
for actual, predicted in zip(y_test, y_pred):
    print(f"    {actual}     |     {predicted}      ")

Mean Squared Error: 100.14576205723833
Mean Absolute Error: 8.005700063705444
    real    |    pred   

    17.405009031295776     |     27.859453201293945      
    43.77121448516846     |     39.13478469848633      
    38.959608793258674     |     30.859373092651367      
    11.433789730072025     |     33.173561096191406      
    28.909806728363037     |     25.37732696533203      
    59.490601539611816     |     43.63338088989258      
    51.022051572799676     |     47.10326385498047      
    14.481367349624634     |     8.605646133422852      
    24.215541601181034     |     27.883007049560547      
    49.01176476478577     |     46.73731994628906      


## XGBRegressor

In [85]:
from xgboost import XGBRegressor # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore
from sklearn.model_selection import GridSearchCV # type: ignore


# Create the base XGBoost regression model for the grid search. Any
# hyper-parameter that also appears in `param_grid` is overridden per candidate.
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    tree_method="hist",
    device="cuda",  # train on the GPU
)

# Parameter grid for GridSearchCV.
# NOTE: this is an exhaustive search over 6*5*4*5*4*5*5*5 = 300,000 combinations,
# i.e. 1,500,000 fits with cv=5 -- shrink the grid if the run time is a problem.
param_grid = {
    "max_depth": [3, 5, 7, 9, 12, 15],  # Maximum tree depth
    "learning_rate": [0.1, 0.05, 0.01, 0.005, 0.001],  # Boosting learning rate
    "n_estimators": [100, 200, 300, 500],  # Number of trees (boosting rounds)
    "gamma": [0, 0.1, 0.2, 0.5, 1],  # Minimum loss reduction required to split a node
    "subsample": [0.5, 0.7, 0.8, 1],  # Row subsample ratio per tree
    "colsample_bytree": [0.5, 0.7, 0.8, 0.9, 1],  # Column subsample ratio per tree
    "reg_alpha": [0, 0.1, 0.5, 1, 2],  # L1 regularisation strength
    "reg_lambda": [0, 0.1, 0.5, 1, 2],  # L2 regularisation strength
}


grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    refit=True,  # the default, made explicit: refit the best candidate on all of X_train
)
grid_search.fit(X_train, y_train)

print("Best parameters found:\n")
for k, v in grid_search.best_params_.items():
    print('\t', k, ":", v)


# With refit=True, `best_estimator_` has already been retrained on the whole
# training set using the best parameters, so no further fit() call is needed.
best_model = grid_search.best_estimator_



Best parameters found:

	 colsample_bytree : 0.8
	 gamma : 0
	 learning_rate : 0.1
	 max_depth : 3
	 n_estimators : 100
	 reg_alpha : 0
	 reg_lambda : 0.5
	 subsample : 1




In [86]:
def _prediction_accuracy(actual, predicted):
    """Return ``1 - |actual - predicted| / actual`` when that lies in (0, 1], else 0.

    A negative prediction, a zero target, or a relative error of 1 or more
    (or a negative one) all count as 0 accuracy.
    """
    # The explicit zero check gives the same result (0) the division would end
    # up with, but without a divide-by-zero warning.
    if predicted < 0 or actual == 0:
        return 0
    rate = abs(actual - predicted) / actual
    return 1 - rate if 0 <= rate < 1 else 0


preds = best_model.predict(X_test)

# `mean_squared_error(..., squared=False)` is deprecated in scikit-learn 1.4
# and removed in 1.6; taking the square root of the MSE works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
print(f"RMSE: {rmse}")

mae = mean_absolute_error(y_test, preds)
print(f"MAE: {mae}")


avg = np.mean(y_test)

print(f"AVG: {avg}")

# Write the per-sample results into an Excel file.

sheet_style = [ "num", "test", "prediction", "difference", "accuracy"]

data_list = []

for features, actual, predicted in zip(X_test, y_test, preds):
    acc = _prediction_accuracy(actual, predicted)
    data_list.append(
        [
            features[0],  # first feature column (request_num)
            actual,
            predicted,
            abs(actual - predicted),
            f"{float(acc * 100): .5}%",
        ],
    )


to_excel(
    data_list,
    result_output_filename,
    sheet_style,
    result_dir_path)

print("Results saved finished.")

RMSE: 5.888523916786467
MAE: 4.613119888305666
AVG: 33.87007555961609
Results saved finished.


