## Data preprocess

In [1]:
from pathlib import Path
from openpyxl import Workbook # type: ignore
from pathlib import Path
import os


def to_excel(data, filename, sheet_style, result_dir_path):
    workbook = Workbook()
    sheet = workbook.active

    # | user-response-time | request-number | response-ip   | process-time  |
    # | 0.3                | 10000          | 192.168.0.150 | 14.523432     |
    # | 0.5                | 20000          | 192.168.0.151 | 9.5232642     |

    sheet.append(sheet_style)

    for row in data:
        sheet.append(row)

    if not os.path.exists(result_dir_path):
        os.makedirs(result_dir_path, exist_ok=True)

    workbook.save(filename=f"{result_dir_path}\\{filename}.xlsx")


# read input dataset
filename_prefix = (
    "RandomRequestNumberclientv6#loops1#requests_batch500#Mon-Aug-12-05-53-29-2024"
)
dataset_read_filename = filename_prefix
training_data_dir = Path.cwd().parent / "training_data" / "data_set6"
# Data preprocessing
file_path = f"{training_data_dir}\\{dataset_read_filename}.xlsx"


# set result output filename and path
result_suffix = "result"

result_dir_path = Path.cwd().parent / "results" / "result_processTime_waitTasks"
version_index = len([_ for _ in Path(result_dir_path).iterdir() if _.is_file()])
version = f"_v{version_index}"

result_name = "processTime#waitTasks" + version

result_output_filename = f"{filename_prefix}{result_name}{result_suffix}"

In [2]:

import pandas as pd # type: ignore
import numpy as np # type: ignore


def read_data(filename):
    df = pd.read_excel(filename)

    columns = df.columns.to_list()

    data_dict = {col: df[col].to_list() for col in columns}

    return data_dict


def data_preprocess(filepath):
    data = read_data(filepath)
    # TODO more...

    # to numpy
    for key in data.keys():
        data[key] = np.array(data[key])

    return data

In [3]:
dataset = data_preprocess(file_path)

## DATA Style View

In [4]:
# # dataset is a dictionary

for col_header, col_data_list in dataset.items():
    print(col_header, col_data_list)

request_num [ 80307 339241 193503  47844   9037  38640 426670 452367 289801  13217
 339060 281086 110280  92608 360362 117683 183646  61874 354737 439726
 364295  12980 331538 251639  23797 224947 196516  42616 126688 428280
 150890 360332 124799  45831 276949 117438 172621 261941  93926 176384
 157150  38741 449622 325859 118883 448678 103414 145786 324737 135074
 384173 312963 416175  31415 396732 289422  87862 414928 491580  45291
  34467  37464 309567 468942  94536 374612 353622 292614 431592 445346
 308539 333436 344576 443486 369175 196618 285407 468622 171846  40462
 164564 345289 452125 410187 449043   5704 222418 143385 252810 251169
 143682 369575 222831 314733 360790 190867  34695 207805 361329 462426
  86920 231684 170651 251797 200856 426745 182639 144327 219666 279980
 347878  88209 230105 116171   2815 343678 287816 423511 385184  93663
 230256 105110 166833  59912 115572 322195  36161 410508  18262  23428
 472445 483145 218960 434063  11088 187202 324340 299629  40778 3

## XGBOOST - 1

### Data split

In [5]:
from sklearn.model_selection import train_test_split # type: ignore

X = np.array(
    [dataset.get("request_num"), dataset.get("processing_tasks_on_manager_node")]
).T

y = np.array(dataset.get("processed_and_waited_time_on_manager_node"))

# split dataset to train dataset and test dataset

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

## XGBRegressor

In [6]:
import itertools

# Define the parameter ranges
params = {
    "max_depth": [3],
    "learning_rate": [0.01],
    "n_estimators": [500],
    "min_child_weight": [1],
    "subsample": [0.8],
    "colsample_bytree": [0.8],
    "reg_alpha": [0.1],
    "reg_lambda": [0.1],
}

# Generate all combinations of parameters
parameter_combinations = [
    dict(zip(params.keys(), combination))
    for combination in itertools.product(*params.values())
]

# Print the number of parameter combinations and a few examples
print(f"Number of parameter combinations: {len(parameter_combinations)}")

Number of parameter combinations: 1


In [7]:
from xgboost import XGBRegressor  # type: ignore
import xgboost  # type: ignore
from sklearn.metrics import mean_squared_error  # type: ignore
import cupy as cp  # type: ignore
from typing import Tuple


# Create the XGBoost regressor model
def create_model(params):
    return XGBRegressor(
        objective="reg:squarederror",
        n_estimators=params["n_estimators"],
        learning_rate=params["learning_rate"],
        max_depth=params["max_depth"],
        subsample=params["subsample"],
        colsample_bytree=params["colsample_bytree"],
        reg_alpha=params["reg_alpha"],
        reg_lambda=params["reg_lambda"],
        tree_method="hist",
        device="cuda:0",
    )


# Function to evaluate model performance
def evaluate_model(
    params, X_train, y_train, X_test: np.array, y_test: np.array
) -> Tuple[np.float64, xgboost.Booster]:

    model = create_model(params)

    # Convert CuPy arrays to NumPy arrays
    X_train_np = cp.asarray(X_train)
    y_train_np = cp.asarray(y_train)
    X_test_np = cp.asarray(X_test)

    model.fit(X_train_np, y_train_np, verbose=True)

    # Make predictions using the trained model
    predictions = model.predict(X_test_np)

    # Calculate mean squared error
    mse = mean_squared_error(y_test, cp.asnumpy(predictions))

    print(params, mse)
    return mse, model

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error  # type: ignore
# Initialize variables to track the best parameters
best_params = None
best_mse = float("inf")

# Evaluate each parameter combination
for params in parameter_combinations:
    mse, model = evaluate_model(params, X_train, y_train, X_test, y_test)

    if mse < best_mse:
        best_mse = mse
        best_params = params

best_model = model
print("Best Parameters:", best_params)
print("Best MSE:", best_mse)

preds = best_model.predict(cp.asarray(X_test))

rmse = mean_squared_error(y_test, preds)
print(f"RMSE: {rmse}")

mae = mean_absolute_error(y_test, preds)
print(f"MAE: {mae}")

avg = np.mean(y_test)
print(f"AVG: {avg}")
# write into excal
sheet_style = ["num", "test", "prediction", "difference", "accuracy"]
data_list = list()


for i in range(len(y_test)):
    acc = 0
    diff = abs(y_test[i] - preds[i])
    if preds[i] < 0:
        acc = 0
    else:
        rate = diff / y_test[i]
        if rate < 1 and rate >= 0:
            acc = 1 - rate
        if rate > 1:
            acc = 0
    data_list.append(
        [
            X_test[i][0],
            y_test[i],
            preds[i],
            diff,
            f"{float(acc * 100): .5}%",
        ],
    )
# to_excel(data_list, result_output_filename, sheet_style, result_dir_path)
print("Results saved finished.")

{'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 500, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 0.1} 6787.427054514496
Best Parameters: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 500, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 0.1}
Best MSE: 6787.427054514496
RMSE: 6787.427054514496
MAE: 58.10103504419327
AVG: 232.28458684682846
Results saved finished.
