## Data preprocessing

In [19]:
from pathlib import Path
from openpyxl import Workbook # type: ignore
from pathlib import Path
import os


def to_excel(data, filename, sheet_style, result_dir_path):
    """Write a header row plus data rows to ``<result_dir_path>/<filename>.xlsx``.

    Args:
        data: Iterable of rows; each row is appended to the sheet in
            iteration order, below the header.
        filename: Output file name without the ``.xlsx`` extension.
        sheet_style: Header row (column titles), written as the first row.
        result_dir_path: Target directory (``str`` or ``Path``). It is
            created, including missing parents, when it does not exist.
    """
    workbook = Workbook()
    sheet = workbook.active

    # Example layout of the resulting sheet:
    # | user-response-time | request-number | response-ip   | process-time  |
    # | 0.3                | 10000          | 192.168.0.150 | 14.523432     |
    # | 0.5                | 20000          | 192.168.0.151 | 9.5232642     |

    sheet.append(sheet_style)

    for row in data:
        sheet.append(row)

    # exist_ok=True already makes this a no-op for an existing directory,
    # so no separate existence check is needed.
    os.makedirs(result_dir_path, exist_ok=True)

    # Join with os.path.join instead of a hard-coded "\\": on non-Windows
    # systems a backslash is an ordinary filename character, which would put
    # the file outside result_dir_path.
    workbook.save(filename=os.path.join(result_dir_path, f"{filename}.xlsx"))


# Input dataset: <parent of cwd>/training_data/data_set4/<filename_prefix>.xlsx
filename_prefix = (
    "RandomRequestNumberclientv5#loops4#requests_batch200#Fri-Jul-26-16-31-44-2024"
)
dataset_read_filename = filename_prefix
training_data_dir = Path.cwd().parent / "training_data" / "data_set4"
# Build the path with pathlib instead of a hard-coded "\\" so it also resolves
# on non-Windows systems; kept as a str, as before.
file_path = str(training_data_dir / f"{dataset_read_filename}.xlsx")


# Result output filename and directory.
result_suffix = "result"

result_dir_path = Path.cwd().parent / "results" / "result_processTime_waitTasks"
# The version index is the number of files already in the results directory.
# The directory may not exist yet on a first run (to_excel creates it when
# saving), so treat a missing directory as empty instead of letting iterdir()
# raise FileNotFoundError.
version_index = (
    sum(1 for entry in result_dir_path.iterdir() if entry.is_file())
    if result_dir_path.is_dir()
    else 0
)
version = f"_v{version_index}"

result_name = "processTime#waitTasks" + version

result_output_filename = f"{filename_prefix}{result_name}{result_suffix}"

In [20]:

import pandas as pd # type: ignore
import numpy as np # type: ignore


def read_data(filename):
    """Load an Excel sheet and return its columns as plain Python lists.

    Args:
        filename: Path of the workbook, passed to ``pandas.read_excel``.

    Returns:
        dict mapping each column header to the list of that column's values,
        in the sheet's column order.
    """
    frame = pd.read_excel(filename)

    columns_as_lists = {}
    for header in frame.columns.to_list():
        columns_as_lists[header] = frame[header].to_list()

    return columns_as_lists


def data_preprocess(filepath):
    """Read the dataset at ``filepath`` and turn every column into a NumPy array.

    Args:
        filepath: Path of the Excel workbook, forwarded to ``read_data``.

    Returns:
        dict mapping each column header to a ``numpy.ndarray`` of that
        column's values.
    """
    # TODO more...

    # Convert each column list returned by read_data to a NumPy array.
    return {
        header: np.array(values) for header, values in read_data(filepath).items()
    }

In [21]:
# Load the spreadsheet at `file_path` into a dict that maps each column header
# to a NumPy array of that column's values.
dataset = data_preprocess(file_path)

## Dataset structure overview

In [22]:
# `dataset` is a dictionary: column header -> NumPy array of column values.

for header, values in dataset.items():
    print(header, values)

num [237545 251499 223899 234227 242695 161803 268784  91227 334219 322501
 257531 366010 354559 197790 470600 364427 409267 187681 283669 101227
 475783  34551 365581 249108  55796 356894 323116 301681 366585  68623
 404366 218474 437378 106294 131494 290263 481154 245421  28474 336376
 441958 197353 471344 433075  38039 387045 323778  89840 173565 124443
 383292 118439 321549 109903 434737 202179 344577 415321 172290  20989
 110067 402775 104504 215387  70577 295275 216523  11385 406678 381921
 301691 285627 230227 350973  20272 358464  69400  97928  72091 218500
 462168 432887  30901 379317 311374  70220 357059  11689 270517 392435
  99217 152459  43084 131618  65841 134572 207018 403971 233653 102999
 134213  32988 373325 177337  20004 142287  94106 158099 222993 329694
 267122 314961 332825 357504 457000 302874 393149 352975 469143 121153
  28263 231022 211260 102100 457647 233827 340767 254856  16343 438029
 308601 310098 178253 339144 298202  51126  85080 436018      4 193481
  

## XGBOOST - 1

In [23]:
from xgboost import XGBRegressor # type: ignore

# Create the XGBoost regression model.
model = XGBRegressor(
    objective="reg:squarederror",  # Loss function to minimize: squared error for regression
    n_estimators=100,  # Number of boosting rounds (number of trees)
    learning_rate=0.1,  # Boosting learning rate (controls the step size of each update)
    max_depth=5,  # Maximum depth of a tree
    min_child_weight=1,  # Minimum sum of instance weight (hessian) needed in a child
    subsample=0.8,  # Subsample ratio of the training instances for each tree
    colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
    # L1 regularization term on weights. Spelled `reg_alpha` (the named
    # constructor argument, matching `reg_lambda` below and the grid-search
    # cell) instead of `alpha`, which was only accepted through **kwargs.
    reg_alpha=0.01,
    reg_lambda=0.01,  # L2 regularization term on weights
)

In [24]:
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore

# Select the feature and target columns from the dataset.
feature_columns = [
    "num",
    "waiting_tasks_in_worker_node",
    "processing_tasks_in_worker_node",
]
target_column = "process_in_manager_node"

# Index with [] rather than .get(): a missing column raises a KeyError that
# names the column, instead of silently placing None into the feature matrix.
# Transposing gives one row per sample and one column per feature.
X = np.array([dataset[name] for name in feature_columns]).T
y = dataset[target_column]

# Split the dataset into a training set and a test set (80% / 20%); the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=35
)

# Fit the model on the training split.
model.fit(X_train, y_train)

In [25]:
from sklearn.metrics import mean_absolute_error # type: ignore

# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mea = mean_absolute_error(y_test, y_pred)  # mean absolute error

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mea}")


# Side-by-side listing of every true target value and its prediction.
print("    real    |    pred   \n")
for actual, predicted in zip(y_test, y_pred):
    print(f"    {actual}     |     {predicted}      ")

Mean Squared Error: 128.98426964038975
Mean Absolute Error: 8.115272170212119
    real    |    pred   

    198.8174231052399     |     204.12246704101562      
    189.1108047962189     |     203.0213623046875      
    188.1546251773834     |     183.85537719726562      
    83.0617094039917     |     81.95089721679688      
    126.2113168239594     |     132.39805603027344      
    137.7391784191132     |     149.64320373535156      
    66.43074750900269     |     72.8193588256836      
    119.0795302391052     |     110.8382568359375      
    169.3786180019379     |     159.99691772460938      
    151.71843957901     |     148.58401489257812      
    0.004213333129882812     |     -2.8702797889709473      
    13.41124749183655     |     43.98076629638672      
    66.23379302024841     |     68.84342193603516      
    56.91588139533997     |     58.85173034667969      
    214.0089013576508     |     202.062255859375      
    17.65011787414551     |     -6.068770408630371

## XGBRegressor with grid search

In [26]:
from xgboost import XGBRegressor # type: ignore
from sklearn.metrics import mean_squared_error # type: ignore
from sklearn.model_selection import GridSearchCV # type: ignore


# Create the base XGBoost regression model. Parameters that also appear in
# param_grid below are overridden by the grid search.
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    tree_method="hist",
    device="cuda",  # NOTE(review): presumably needs a CUDA-enabled XGBoost build and a GPU — confirm
)

# Define the parameter grid for GridSearchCV. A single-element list pins that
# parameter; lists with several values are searched exhaustively.
param_grid = {
    "max_depth": [3],  # Range of maximum depth of trees to explore
    "learning_rate": [0.1],  # Range of learning rates to explore
    "n_estimators": [100],  # Range of number of trees (boosting rounds) to explore
    "gamma": [0],  # Range of gamma parameter to explore (controls tree split)
    "subsample": [1],  # Range of subsample ratios to explore
    "colsample_bytree": [0.8, 0.7],  # Range of column subsample ratios for each tree to explore
    "reg_alpha": [0, 0.5],  # Range of L1 regularization parameters to explore
    "reg_lambda": [0, 0.5],  # Range of L2 regularization parameters to explore
}


grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)
grid_search.fit(X_train, y_train)

print("Best parameters found:\n")
for k, v in grid_search.best_params_.items():
    print('\t', k, ":", v)


# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# re-fitted on the whole of X_train / y_train with the best parameters; a
# second explicit fit() would only repeat that work.
best_model = grid_search.best_estimator_

Best parameters found:

	 colsample_bytree : 0.8
	 gamma : 0
	 learning_rate : 0.1
	 max_depth : 3
	 n_estimators : 100
	 reg_alpha : 0
	 reg_lambda : 0.5
	 subsample : 1


In [27]:
def _relative_accuracy(actual, predicted):
    """Return ``1 - |actual - predicted| / actual``, or 0 where that is not meaningful.

    The score is 0 when the prediction is negative, when ``actual`` is 0
    (the relative error is undefined), or when the relative error falls
    outside ``[0, 1)``.
    """
    if predicted < 0 or actual == 0:
        return 0
    rate = abs(actual - predicted) / actual
    return 1 - rate if 0 <= rate < 1 else 0


# Evaluate the tuned model on the held-out test split.
preds = best_model.predict(X_test)

# Take the square root of the MSE rather than passing squared=False: that
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = float(mean_squared_error(y_test, preds) ** 0.5)
print(f"RMSE: {rmse}")

mae = mean_absolute_error(y_test, preds)
print(f"MAE: {mae}")


avg = np.mean(y_test)

print(f"AVG: {avg}")

# Write the per-sample results into Excel.

sheet_style = [ "num", "test", "prediction", "difference", "accuracy"]

data_list = list()


for features, actual, predicted in zip(X_test, y_test, preds):
    diff = abs(actual - predicted)
    acc = _relative_accuracy(actual, predicted)

    data_list.append(
        [
            features[0],  # first feature column ("num")
            actual,
            predicted,
            diff,
            f"{float(acc * 100): .5}%",
        ],
    )


to_excel(
    data_list,
    result_output_filename,
    sheet_style,
    result_dir_path)

print("Results saved finished.")

RMSE: 11.327906918495652
MAE: 8.293933786451815
AVG: 112.04583690017462
Results saved finished.


