## Data preprocess

In [203]:
from pathlib import Path
import pandas as pd



def read_data(filename):
    df = pd.read_excel(filename)
    return df.to_numpy()

In [204]:
import numpy as np

## UNUSE
def one_hot_encoding(arr):
    s = set()
    _sd = dict()
    for e in arr:
        s.add(e)

    sl = list(s)
    for i in range(len(sl)):
        encode = np.zeros(len(sl))
        encode[i] = 1
        _sd[sl[i]] = encode
    
    return _sd


def data_preprocess(filepath):
    data = read_data(filepath)

    # switch how many workers
    dataset = dict()

    for d in data:
        ip_address = d[-4]
        worker_data_set = np.concatenate((d[:-4], d[-4 + 1:])).tolist()
        if ip_address not in dataset:
            dataset[ip_address] = list()
            dataset[ip_address].append(worker_data_set)
        else:
            dataset[ip_address].append(worker_data_set)

    ip_data = one_hot_encoding(list(dataset.keys()))

    for key in dataset.keys():
        for i in range(len(dataset[key])):
            dataset[key][i] += ip_data[key].tolist()

    return dataset

In [205]:
filename = "nodeWaitTime_jobsNumber_v8"

resutls_dir = Path.cwd().parent / "training_data"

# Data preprocessing
file_path = f"{resutls_dir}\\{filename}.xlsx"

dataset = data_preprocess(file_path)


## DATA Style View

In [206]:
# dataset is a dictionary

for key, data in dataset.items():
    print(f"{key}: {data}")

    # data array style view
    print(np.array(data).shape)

192.168.0.150: [[22.74798655509949, 967470, 22.43506407737732, 0.01135945320129395, 45, 0.0, 0.0, 1.0], [3.085849761962891, 236099, 2.77397346496582, 0.007861137390136719, 47, 0.0, 0.0, 1.0], [88.324946641922, 214939, 2.454077959060669, 0.0008404254913330078, 70, 0.0, 0.0, 1.0], [79.723304271698, 324230, 4.403936386108398, 0.0008649826049804688, 73, 0.0, 0.0, 1.0], [13.58494520187378, 294972, 3.833493232727051, 9.446401357650757, 45, 0.0, 0.0, 1.0], [44.89628887176514, 938995, 20.95247602462769, 23.63608360290527, 42, 0.0, 0.0, 1.0], [95.2431697845459, 47042, 0.3396763801574707, 0.001082658767700195, 69, 0.0, 0.0, 1.0], [84.8319251537323, 255741, 3.143162727355957, 0.0008251667022705078, 72, 0.0, 0.0, 1.0], [12.64279770851135, 302400, 3.988137006759644, 8.347902297973633, 45, 0.0, 0.0, 1.0], [56.5403950214386, 484318, 7.89743185043335, 48.34145212173462, 42, 0.0, 0.0, 1.0], [97.43026423454285, 61575, 0.4613742828369141, 0.001159429550170898, 68, 0.0, 0.0, 1.0], [91.9651927947998, 82151

## XGBOOST - 1

In [207]:
from xgboost import XGBRegressor

# Create XGBoost regression model
model = XGBRegressor(
    objective="reg:squarederror",   # Loss function to minimize: squared error for regression  损失函数为平方损失函数, 
    n_estimators=100,  # Number of boosting rounds (number of trees)  迭代次数，即基础学习器的数量
    learning_rate=0.1,  # Boosting learning rate (controls the step size)  学习率，控制每次更新的步长
    max_depth=5,  # Maximum depth of a tree  树的最大深度
    min_child_weight=1,  # Minimum sum of instance weight (hessian) needed in a child  叶子节点最小权重
    subsample=0.8,  # Subsample ratio of the training instance (randomly sampled)  每棵树随机采样的比例
    colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree  每棵树随机选择的特征比例
    alpha=0.01,  # L1 regularization term on weights  L1 正则化项参数
    reg_lambda=0.01,  # L2 regularization term on weights  L2 正则化项参数
)

In [208]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# X jobs number
# y wait time

# select key
ip_key = list(dataset.keys())[0]

# select value from dataset
X, y = np.array(dataset[ip_key])[:, -4:], np.array(dataset[ip_key])[:, -5:-4]

print(X.shape, X[0])
print(y.shape, y[0])


train_size = int(len(X) * 0.7)

# 拆分数据集为训练集和测试集
# split dataset to train dataset and test dataset

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# time series
# X_train = X[:train_size]
# X_test = X[train_size:]
# y_train = y[:train_size]
# y_test = y[train_size:]

# model fit
model.fit(X_train, y_train)

(40, 4) [45.  0.  0.  1.]
(40, 1) [0.01135945]


In [209]:
from sklearn.metrics import mean_absolute_error

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mea = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mea}")


print(f"    real    |    pred   \n")
for i in range(len(y_test)):
    print(f"    {y_test[i]}     |     {y_pred[i]}      ")

Mean Squared Error: 81.62921289832418
Mean Absolute Error: 3.8708968072896823
    real    |    pred   

    [0.00102711]     |     -0.0012593944557011127      
    [0.00085568]     |     -0.0012593944557011127      
    [0.00081563]     |     -0.0012593944557011127      
    [0.00083923]     |     -0.0012593944557011127      
    [9.44640136]     |     3.295614719390869      
    [0.00074887]     |     24.804004669189453      
    [0.00091863]     |     -0.0012593944557011127      
    [0.00111938]     |     -0.0012593944557011127      


## XGBRegressor

In [210]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# X jobs number
# y wait time

# select key
ip_key = list(dataset.keys())[1]

# select value from dataset
X, y = np.array(dataset[ip_key])[:, -4:], np.array(dataset[ip_key])[:, -5:-4]

print(X.shape, X[0])
print(y.shape, y[0])


train_size = int(len(X) * 0.7)

# 拆分数据集为训练集和测试集
# split dataset to train dataset and test dataset

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

(40, 4) [46.  1.  0.  0.]
(40, 1) [2.78595948]


In [211]:
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error


# 创建 XGBoost 回归模型
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    tree_method="hist",
    device="cuda",
)

# Define the parameter grid for GridSearchCV
param_grid = {
    "max_depth": [3],  # Range of maximum depth of trees to explore  扩展最大深度的范围
    "learning_rate": [0.1],  # Range of learning rates to explore  扩展学习率的范围
    "n_estimators": [100],  # Range of number of trees (boosting rounds) to explore  扩展树的数量范围
    "gamma": [0],   # Range of gamma parameter to explore (controls tree split) 扩展 gamma 参数范围
    "subsample": [1],  # Range of subsample ratios to explore  扩展子样本比例范围
    "colsample_bytree": [0.8, 0.7],  # Range of column subsample ratios for each tree to explore  扩展列采样比例范围
    "reg_alpha": [0, 0.5],  # Range of L1 regularization parameters to explore  扩展 L1 正则化参数范围
    "reg_lambda": [0, 0.5],  # Range of L2 regularization parameters to explore  扩展 L2 正则化参数范围
}


grid_search = GridSearchCV(
    estimator=model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

# 使用最佳参数重新训练模型
# Re-train the model using the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train, verbose=True)


Best parameters found:  {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'subsample': 1}


In [212]:
from openpyxl import Workbook
from pathlib import Path

def to_excel(data, filename, sheet_style):
    workbook = Workbook()
    sheet = workbook.active

    # | user-response-time | request-number | response-ip   | process-time  |
    # | 0.3                | 10000          | 192.168.0.150 | 14.523432     |
    # | 0.5                | 20000          | 192.168.0.151 | 9.5232642     |

    sheet.append(sheet_style)

    for row in data:
        sheet.append(row)

    results_dir = str(Path.cwd().parent / "results")

    workbook.save(filename=f"{results_dir}\\{filename}.xlsx")

In [213]:
preds = best_model.predict(X_test)

rmse = mean_squared_error(y_test, preds, squared=False)
print(f"RMSE: {rmse}")

mae = mean_absolute_error(y_test, preds)
print(f"MAE: {mae}")


avg = np.mean(y_test)

print(f"AVG: {avg}")

# write into excal

sheet_style = ["test", "prediction", "difference", "accuracy"]

data_list = list()


for i in range(len(y_test)):
    acc = 0
    diff = round(abs(y_test[i][0] - y_pred[i]), 6)
    if y_pred[i] < 0:
        acc = 0
    else:
        rate = round(diff / y_test[i][0], 4)
        if rate <= 1 or rate >= 0:
            acc = 1 - rate
        if rate > 1:
            acc = 0

    data_list.append(
        [
            y_test[i][0],
            y_pred[i],
            diff,
            f"{round(acc * 100, 2)}%",
        ],
    )


filename = "nodeWaitTime_requestJobs_Prediction_vx1_v8"
to_excel(data_list, filename, sheet_style)


print("Results saved finished.")

RMSE: 5.763057069084159
MAE: 2.0595464706420894
AVG: 2.858669072389602
Results saved finished.


