## Data preprocessing

In [22]:
from pathlib import Path
from openpyxl import Workbook
from pathlib import Path


def to_excel(data, filename, sheet_style):
    """Write rows to ``<cwd>/../results/result_v2/<filename>.xlsx``.

    Args:
        data: Iterable of rows; each row is appended to the sheet as-is.
        filename: Name of the output workbook, without the ``.xlsx`` extension.
        sheet_style: Header row, written before any data row.

    Example sheet layout:

        | user-response-time | request-number | response-ip   | process-time  |
        | 0.3                | 10000          | 192.168.0.150 | 14.523432     |
        | 0.5                | 20000          | 192.168.0.151 | 9.5232642     |
    """
    workbook = Workbook()
    sheet = workbook.active

    sheet.append(sheet_style)
    for row in data:
        sheet.append(row)

    # Join with pathlib instead of a literal "\\" so the path is valid on every
    # OS, and create the directory so the save does not fail on a fresh checkout.
    results_dir = Path.cwd().parent / "results" / "result_v2"
    results_dir.mkdir(parents=True, exist_ok=True)

    workbook.save(filename=str(results_dir / f"{filename}.xlsx"))


# Input dataset: workbook name (without extension) and the directory it is read from.
dataset_read_filename = "test(150#Tue-Jul-23-19-05-12-2024)"
training_data_dir = Path.cwd().parent / "training_data" / "data_set2"

# Join with pathlib rather than a literal "\\" so the path is valid on every OS;
# kept as a plain string, as before.
file_path = str(training_data_dir / f"{dataset_read_filename}.xlsx")


# Name (without extension) of the workbook the results are written to.
result_output_filename = "test(150#Tue-Jul-23-19-05-12-2024)"

In [23]:

import pandas as pd
import numpy as np


def read_data(filename):
    """Read an Excel file and return its columns as plain Python lists.

    Args:
        filename: Path of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        dict mapping each column header to the list of that column's values.
    """
    frame = pd.read_excel(filename)
    return {header: column.to_list() for header, column in frame.items()}


def data_preprocess(filepath):
    """Load the workbook at ``filepath`` and convert every column to a NumPy array.

    Args:
        filepath: Path of the Excel file, forwarded to ``read_data``.

    Returns:
        dict mapping each column header to an ``np.ndarray`` of its values.
    """
    columns = read_data(filepath)
    # TODO more...

    # Each column list becomes an ndarray.
    return {header: np.array(values) for header, values in columns.items()}

In [24]:
# Load the configured workbook as a dict of column header -> NumPy array.
dataset = data_preprocess(file_path)

## Dataset overview

In [25]:
# `dataset` is a dictionary: column header -> array of that column's values.

for col_header, col_data_list in dataset.items():
    print(col_header, col_data_list)

num [500000 500000 500000 ... 500000 500000 500000]
sum [41538 41538 41538 ... 41538 41538 41538]
user [6.58 6.62 6.62 ... 6.61 6.62 6.66]
system [0.   0.   0.   ... 0.01 0.   0.  ]
child_pid [1 1 1 ... 1 1 1]
waiting_cnt [0 0 0 ... 0 0 0]
success [1 1 1 ... 1 1 1]
ip ['192.168.0.152' '192.168.0.150' '192.168.0.151' ... '192.168.0.152'
 '192.168.0.150' '192.168.0.151']
wait_time_in_worker_node [0.00057578 0.00076413 0.00052857 ... 0.00051713 0.00056624 0.00059199]
process_in_manager_node [ 74.46349001  75.07923841  66.78739882 ... 140.8893919  123.85193801
 124.05044055]
process_in_worker_node [8.21560264 8.28435349 8.28173065 ... 8.27793026 8.25403929 8.34374809]
total_response_time [ 82.73077226  83.41229963  75.12595129 ... 149.29319143 132.23056173
 132.51731038]
trans_delay [ 74.51459384  75.12718201  66.84369206 ... 141.01474404 123.9759562
 124.17297029]


## XGBOOST - 1

In [26]:
from xgboost import XGBRegressor

# Baseline XGBoost regression model with hand-picked hyperparameters.
model = XGBRegressor(
    objective="reg:squarederror",  # Loss function to minimize: squared error for regression
    n_estimators=100,  # Number of boosting rounds (number of trees)
    learning_rate=0.1,  # Boosting learning rate (controls the step size)
    max_depth=5,  # Maximum depth of a tree
    min_child_weight=1,  # Minimum sum of instance weight (hessian) needed in a child
    subsample=0.8,  # Subsample ratio of the training instances (randomly sampled)
    colsample_bytree=0.8,  # Subsample ratio of columns when constructing each tree
    # Use the scikit-learn wrapper's `reg_alpha` name (instead of the native
    # `alpha` alias) so it matches `reg_lambda` here and the grid-search keys.
    reg_alpha=0.01,  # L1 regularization term on weights
    reg_lambda=0.01,  # L2 regularization term on weights
)

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Features are the `num` and `waiting_cnt` columns; the target is
# `wait_time_in_worker_node`. Index the dict directly so a missing column
# raises KeyError here instead of silently producing None.
X = np.array([dataset["num"], dataset["waiting_cnt"]]).T
y = dataset["wait_time_in_worker_node"]

# Split the dataset into a training set (80%) and a test set (20%);
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=35
)

# Fit the model on the training split.
model.fit(X_train, y_train)

In [28]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out split and report the aggregate errors.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mea = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mea}")


# List every true value next to its prediction.
print("    real    |    pred   \n")
for actual, predicted in zip(y_test, y_pred):
    print(f"    {actual}     |     {predicted}      ")

Mean Squared Error: 83.54606998189072
Mean Absolute Error: 2.811681026617686
    real    |    pred   

    0.0005209445953369141     |     0.3858637809753418      
    0.0005214214324951172     |     0.3858637809753418      
    0.0005252361297607422     |     0.3858637809753418      
    0.0005257129669189453     |     0.3858637809753418      
    0.0008451938629150391     |     0.3858637809753418      
    0.0005538463592529297     |     0.3858637809753418      
    0.0005261898040771484     |     0.3858637809753418      
    0.02760577201843262     |     10.187143325805664      
    49.54652285575867     |     17.173995971679688      
    0.0005216598510742188     |     0.3858637809753418      
    0.0005440711975097656     |     0.3858637809753418      
    0.0005428791046142578     |     0.3858637809753418      
    0.0005505084991455078     |     0.3858637809753418      
    8.275943756103516     |     0.3858637809753418      
    0.0006248950958251953     |     0.385863780975341

## XGBRegressor with grid search

In [29]:
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error


# Base XGBoost regression model; any hyperparameter that also appears in
# `param_grid` below is overridden by the grid values during the search.
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    tree_method="hist",
    # NOTE(review): presumably needs a CUDA-enabled XGBoost build and a GPU —
    # confirm for the environment this notebook is run in.
    device="cuda",
)

# Parameter grid for GridSearchCV: every combination of the listed values is evaluated.
param_grid = {
    "max_depth": [3],  # Maximum depth of trees to explore
    "learning_rate": [0.1],  # Learning rates to explore
    "n_estimators": [100],  # Number of trees (boosting rounds) to explore
    "gamma": [0],  # Gamma values to explore (controls tree splits)
    "subsample": [1],  # Row subsample ratios to explore
    "colsample_bytree": [0.8, 0.7],  # Column subsample ratios per tree to explore
    "reg_alpha": [0, 0.5],  # L1 regularization strengths to explore
    "reg_lambda": [0, 0.5],  # L2 regularization strengths to explore
}


grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    refit=True,  # re-train the best configuration on the whole training split
)
grid_search.fit(X_train, y_train)

print("Best parameters found:\n")
for k, v in grid_search.best_params_.items():
    print('\t', k, ":", v)


# With refit=True the search has already re-trained the best configuration on
# all of X_train / y_train, so fitting it a second time would only repeat work.
best_model = grid_search.best_estimator_
best_model

Best parameters found:

	 colsample_bytree : 0.8
	 gamma : 0
	 learning_rate : 0.1
	 max_depth : 3
	 n_estimators : 100
	 reg_alpha : 0.5
	 reg_lambda : 0.5
	 subsample : 1


In [30]:
def _score_prediction(actual, predicted):
    """Score one prediction against its true value.

    Args:
        actual: True target value.
        predicted: Model prediction for the same sample.

    Returns:
        Tuple ``(diff, acc, is_waited)``:
        - ``diff``: absolute error, rounded to 6 decimals.
        - ``acc``: 0 for a negative prediction; 1 when the absolute error is
          below 1; otherwise ``1 - relative_error``, floored at 0 when the
          relative error exceeds 1 or ``actual`` is not positive.
        - ``is_waited``: False only when the absolute error is below 1 (and the
          prediction is non-negative); True otherwise.
    """
    diff = round(abs(actual - predicted), 6)
    if predicted < 0:
        return diff, 0, True
    if diff < 1:
        return diff, 1, False
    if actual <= 0:
        # Relative error is undefined for a zero/negative reference value.
        return diff, 0, True

    rate = round(diff / actual, 4)
    # The original `rate <= 1 or rate >= 0` was always true; the intended
    # check is that the relative error lies in [0, 1].
    acc = 1 - rate if 0 <= rate <= 1 else 0
    return diff, acc, True


preds = best_model.predict(X_test)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE: {rmse}")

mae = mean_absolute_error(y_test, preds)
print(f"MAE: {mae}")


avg = np.mean(y_test)

print(f"AVG: {avg}")

# Write the per-sample results to Excel.

sheet_style = ["test", "prediction", "difference", "accuracy", "is_waited"]

data_list = []

for actual, predicted in zip(y_test, preds):
    diff, acc, is_waited = _score_prediction(actual, predicted)
    data_list.append(
        [
            actual,
            predicted,
            diff,
            f"{round(acc * 100, 2)}%",
            is_waited,
        ],
    )


to_excel(data_list, result_output_filename, sheet_style)

print("Results saved finished.")

RMSE: 9.18454181514116
MAE: 2.833063853263855
AVG: 2.709268331368764
Results saved finished.


