## Data preprocessing

In [10]:
from pathlib import Path
from openpyxl import Workbook # type: ignore
from pathlib import Path
import os


def to_excel(data, filename, sheet_style, result_dir_path):
    """Write tabular results to ``<result_dir_path>/<filename>.xlsx``.

    Args:
        data: Iterable of rows; each row is a sequence of cell values that is
            appended below the header row.
        filename: Output file name without the ``.xlsx`` extension.
        sheet_style: Header row (column titles) written as the first row.
        result_dir_path: Output directory (``str`` or ``Path``). It is created,
            including missing parents, when it does not exist yet.
    """
    workbook = Workbook()
    sheet = workbook.active

    # Example sheet layout (header row followed by data rows):
    # | user-response-time | request-number | response-ip   | process-time  |
    # | 0.3                | 10000          | 192.168.0.150 | 14.523432     |
    # | 0.5                | 20000          | 192.168.0.151 | 9.5232642     |

    sheet.append(sheet_style)

    for row in data:
        sheet.append(row)

    # exist_ok=True already makes this a no-op for an existing directory, so no
    # separate existence check is needed.
    os.makedirs(result_dir_path, exist_ok=True)

    # Join with pathlib instead of a hard-coded "\\": a literal backslash is
    # only a path separator on Windows and would otherwise become part of the
    # file name on Linux/macOS.
    output_path = Path(result_dir_path) / f"{filename}.xlsx"
    workbook.save(filename=str(output_path))


# Input dataset: workbook name (without extension) and the directory it lives in.
filename_prefix = (
    "RandomRequestNumberclientv5#loops4#requests_batch200#Fri-Jul-26-18-36-52-2024"
)
dataset_read_filename = filename_prefix
training_data_dir = Path.cwd().parent / "training_data" / "data_set4"
# Build the path with pathlib so the separator is right on every OS; a literal
# "\\" is only a separator on Windows. Kept as a str, as before.
file_path = str(training_data_dir / f"{dataset_read_filename}.xlsx")


# Result output filename and directory.
result_suffix = "result"

result_dir_path = Path.cwd().parent / "results" / "result_processTime_waitTasks"
# The version index is the number of files already in the results directory.
# On a first run that directory does not exist yet (to_excel creates it when
# saving), so count zero instead of letting iterdir() raise FileNotFoundError.
version_index = (
    sum(1 for entry in result_dir_path.iterdir() if entry.is_file())
    if result_dir_path.is_dir()
    else 0
)
version = f"_v{version_index}"

result_name = "processTime#waitTasks" + version

result_output_filename = f"{filename_prefix}{result_name}{result_suffix}"

In [11]:

import pandas as pd # type: ignore
import numpy as np # type: ignore


def read_data(filename):
    """Load an Excel sheet and return its columns as plain Python lists.

    Args:
        filename: Location of the workbook, forwarded to ``pandas.read_excel``.

    Returns:
        dict mapping every column label of the sheet to the list of that
        column's values.
    """
    frame = pd.read_excel(filename)
    return {label: frame[label].to_list() for label in frame.columns.to_list()}


def data_preprocess(filepath):
    """Read the workbook at ``filepath`` and convert each column to a numpy array.

    Args:
        filepath: Location of the Excel file, passed on to ``read_data``.

    Returns:
        dict mapping each column label to a ``numpy.ndarray`` of its values.
    """
    columns = read_data(filepath)
    # TODO more...

    # Swap every list of column values for its numpy-array equivalent.
    for label, values in columns.items():
        columns[label] = np.array(values)

    return columns

In [12]:
# Load the workbook at `file_path`; `dataset` maps each column name to a numpy array.
dataset = data_preprocess(file_path)

## Dataset overview

In [13]:
# `dataset` is a dictionary: show each column header followed by its values.

for col_header in dataset.keys():
    col_data_list = dataset[col_header]
    print(col_header, col_data_list)

num [124276  89639 431408 194355 354111  29191 442198 152565  45301 258078
  17339 337600 361717 420785 228395 282530 116425 130159 434405 399401
  41687 164243 106273 325760 367494 141396   5117   4575  19687 447119
 235305 344347 407699 128078  78776 230087 119027  19360 387233 287346
 414399 348058 323393 108870 380565 124351 130324 300676 258266  13477
 474919 495101 307778 365958 462623 183860 321368 489365 271081 451854
 473089  32945 295888 303283  73479 428023 294240 402689  33970 160196
 131351 192856 255217   6215  55214 493956 468097  44487 451022 181819
 195744 470733 286494 108421 138499 409081 140430 374154 240116 445220
 145495  48171  54822 298458 347013 233524 139964 204850 250187 419201
 193755 138061 202493 439641 335154  68285 389012   4018  86641 179075
 252455 216116 210649 138518  27845 264142 350863  40906 385307 149808
 103039 166906 156898 422125 430238 397221 233156 484052 108364 199723
 397090 101561  75697  56936 119589 346581  20670  10101 100553 193990
 1

## XGBOOST - 1

In [14]:
from xgboost import XGBRegressor # type: ignore

# Baseline XGBoost regression model with hand-picked hyperparameters.
model = XGBRegressor(
    objective="reg:squarederror",  # Loss function to minimize: squared error for regression
    n_estimators=100,  # Number of boosting rounds (number of trees)
    learning_rate=0.1,  # Boosting learning rate (controls the step size of each update)
    max_depth=5,  # Maximum depth of a tree
    min_child_weight=1,  # Minimum sum of instance weight (hessian) needed in a child
    subsample=0.8,  # Subsample ratio of the training instances (randomly sampled per tree)
    colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
    # L1 regularization term on weights. Spelled `reg_alpha`, the scikit-learn
    # wrapper's documented keyword, instead of the native alias `alpha`, so it
    # is consistent with `reg_lambda` below.
    reg_alpha=0.01,
    reg_lambda=0.01,  # L2 regularization term on weights
)

In [15]:
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore

# Feature matrix X has one column per feature. The only feature currently in
# use is the "processing_tasks_in_manager_node" column. Candidate columns that
# are left disabled: "num", "waiting_tasks_in_worker_node" (add them to the
# list below to include them).
#
# Columns are read with dataset[...] rather than dataset.get(...): a missing
# column then raises a KeyError naming it, instead of passing None on to
# train_test_split and failing there with an unrelated-looking error.
X = np.array(
    [
        dataset["processing_tasks_in_manager_node"],
    ]
).T  # transpose (n_features, n_samples) -> (n_samples, n_features)
y = dataset["process_in_manager_node"]

# Split the dataset into a training set and a test set (80% / 20%); the fixed
# random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=35
)

# Fit the model on the training split.
model.fit(X_train, y_train)

In [16]:
from sklearn.metrics import mean_absolute_error # type: ignore

# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mea = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mea}")


# Side-by-side listing of each true value and its prediction.
print("    real    |    pred   \n")
for actual, predicted in zip(y_test, y_pred):
    print(f"    {actual}     |     {predicted}      ")

Mean Squared Error: 221.11254694377726
Mean Absolute Error: 11.408976718783379
    real    |    pred   

    191.5291249752045     |     187.09127807617188      
    189.6427075862885     |     207.2022705078125      
    160.1184141635895     |     162.54180908203125      
    84.71098399162292     |     85.63792419433594      
    116.4048590660095     |     135.51766967773438      
    136.4312314987183     |     146.0036163330078      
    72.39037990570068     |     63.8337516784668      
    94.18330574035645     |     104.63803100585938      
    165.7178544998169     |     166.9185791015625      
    154.9750211238861     |     148.12916564941406      
    0.006290197372436523     |     0.35071900486946106      
    15.76546549797058     |     2.6338703632354736      
    77.42958688735962     |     64.4528579711914      
    74.64387106895447     |     59.675193786621094      
    162.6397891044617     |     182.5587615966797      
    0.343294620513916     |     2.63387036323

## XGBRegressor

In [17]:
from xgboost import XGBRegressor # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore
from sklearn.model_selection import GridSearchCV # type: ignore


# Create the XGBoost regression model used as the grid-search base estimator.
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    tree_method="hist",
    # NOTE(review): presumably needs a CUDA-capable GPU and a CUDA build of
    # XGBoost - confirm before running on a CPU-only machine.
    device="cuda",
)

# Parameter grid for GridSearchCV: every combination of the listed values is
# evaluated. Entries with a single value are effectively fixed.
param_grid = {
    "max_depth": [3],  # Candidate maximum tree depths
    "learning_rate": [0.1],  # Candidate learning rates
    "n_estimators": [100],  # Candidate numbers of trees (boosting rounds)
    "gamma": [0],  # Candidate gamma values (controls tree splitting)
    "subsample": [1],  # Candidate row-subsample ratios
    "colsample_bytree": [0.8, 0.7],  # Candidate column-subsample ratios per tree
    "reg_alpha": [0, 0.5],  # Candidate L1 regularization strengths
    "reg_lambda": [0, 0.5],  # Candidate L2 regularization strengths
}


grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)
grid_search.fit(X_train, y_train)

print("Best parameters found:\n")
for k, v in grid_search.best_params_.items():
    print('\t', k, ":", v)


# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# retrained on the whole training split with the best parameters. Fitting it
# again here would only repeat that work, so it is used as-is.
best_model = grid_search.best_estimator_



Best parameters found:

	 colsample_bytree : 0.8
	 gamma : 0
	 learning_rate : 0.1
	 max_depth : 3
	 n_estimators : 100
	 reg_alpha : 0.5
	 reg_lambda : 0.5
	 subsample : 1




In [18]:
preds = best_model.predict(X_test)

# RMSE is taken as the square root of the MSE. The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 (scheduled for removal
# in 1.6), whereas this form works on every version.
rmse = mean_squared_error(y_test, preds) ** 0.5
print(f"RMSE: {rmse}")

mae = mean_absolute_error(y_test, preds)
print(f"MAE: {mae}")


avg = np.mean(y_test)

print(f"AVG: {avg}")

# Write the per-sample results into Excel.

# Header row of the result sheet; each entry of data_list follows this order.
sheet_style = ["num", "test", "prediction", "difference", "accuracy"]

data_list = list()


for features, actual, predicted in zip(X_test, y_test, preds):
    diff = abs(actual - predicted)

    # Accuracy is 1 - relative error, clamped to 0. It stays 0 when the
    # prediction is negative, when the relative error is 1 or more, and when
    # the true value is not positive. The explicit `actual > 0` guard keeps a
    # zero true value from triggering a division by zero (previously an
    # inf/nan result plus a RuntimeWarning that also ended up as 0).
    acc = 0
    if predicted >= 0 and actual > 0:
        rate = diff / actual
        if rate < 1:
            acc = 1 - rate

    data_list.append(
        [
            features[0],
            actual,
            predicted,
            diff,
            f"{float(acc * 100): .5}%",
        ],
    )


to_excel(
    data_list,
    result_output_filename,
    sheet_style,
    result_dir_path)

print("Results saved finished.")



RMSE: 13.679346021535853
MAE: 10.88684214502573
AVG: 105.67990706861019
Results saved finished.
