## Data preprocessing

In [214]:
from pathlib import Path
from openpyxl import Workbook, load_workbook  # type: ignore
import os



def to_excel(
    data, filename, sheet_style, result_dir_path, new_sheet=False, sheet_name="Sheet1"
):
    """Write a header row followed by data rows to ``<result_dir_path>/<filename>.xlsx``.

    Args:
        data: Iterable of rows; each row is passed to ``sheet.append``.
        filename: Workbook name without the ``.xlsx`` extension.
        sheet_style: Header row written before the data rows.
        result_dir_path: Output directory; created if it does not exist.
        new_sheet: Only relevant when the workbook already exists. If true, a
            new sheet called ``sheet_name`` is created; otherwise the header
            and rows are appended to the workbook's active sheet.
        sheet_name: Title of the sheet created for a new workbook or when
            ``new_sheet`` is true.
    """
    os.makedirs(result_dir_path, exist_ok=True)

    file_path = os.path.join(result_dir_path, f"{filename}.xlsx")

    if os.path.exists(file_path):
        workbook = load_workbook(file_path)
        if new_sheet:
            sheet = workbook.create_sheet(title=sheet_name)
        else:
            sheet = workbook.active
    else:
        # A fresh Workbook comes with a default sheet; replace it so the only
        # sheet carries the requested title.
        workbook = Workbook()
        workbook.remove(workbook.active)
        sheet = workbook.create_sheet(title=sheet_name)

    sheet.append(sheet_style)

    # write data into file
    for row in data:
        sheet.append(row)

    # Save to the same path that was checked above. Building the name with a
    # literal "\\" would produce a different file on non-Windows systems.
    workbook.save(filename=file_path)



# Input dataset: base name (without extension) of the spreadsheet to read.
filename_prefix = (
    "RandomRequestNumberclientv6#loops1#requests_batch500#Mon-Aug-12-05-53-29-2024"
)

dataset_read_filename = filename_prefix
training_data_dir = Path.cwd().parent / "training_data" / "data_set6"

# Build the input path with pathlib so the separator matches the host OS.
# A hard-coded "\\" only resolves correctly on Windows. Kept as a str so
# downstream code sees the same type as before.
file_path = str(training_data_dir / f"{dataset_read_filename}.xlsx")

# Result output: directory and versioned file name.
result_suffix = "result"

result_dir_path = Path.cwd().parent / "results" / "result_processTime_waitTasks_v1"

# parents/exist_ok make this cell safe to re-run.
result_dir_path.mkdir(parents=True, exist_ok=True)

# The version index is the number of files already present in the results
# directory, so each run gets a new suffix as long as earlier results remain.
version_index = sum(1 for entry in result_dir_path.iterdir() if entry.is_file())
version = f"_v{version_index}"

result_name = "processTime#waitTasks" + version
result_output_filename = f"{filename_prefix}{result_name}{result_suffix}"

In [215]:

import pandas as pd # type: ignore
import numpy as np # type: ignore


def read_data(filename):
    """Load an Excel file and return its columns as plain Python lists.

    Args:
        filename: Path (or anything ``pd.read_excel`` accepts) of the workbook.

    Returns:
        dict mapping each column label to the list of that column's values,
        in the column order of the loaded frame.
    """
    frame = pd.read_excel(filename)
    columns_as_lists = {}
    for column_name in frame.columns.to_list():
        columns_as_lists[column_name] = frame[column_name].to_list()
    return columns_as_lists


def data_preprocess(filepath):
    """Read the dataset at ``filepath`` and convert every column to a NumPy array.

    Args:
        filepath: Location of the workbook, forwarded to ``read_data``.

    Returns:
        dict mapping each column label to an ``np.ndarray`` of its values.
    """
    raw_columns = read_data(filepath)
    # TODO more...

    # to numpy
    return {name: np.array(values) for name, values in raw_columns.items()}

In [216]:
# Load the spreadsheet at `file_path`; each column becomes a NumPy array keyed by its label.
dataset = data_preprocess(file_path)

## Dataset structure check

In [217]:
# `dataset` is a dict; print its type to confirm.

print(type(dataset))

<class 'dict'>


## XGBOOST - 1

### Data split

In [218]:
from sklearn.model_selection import train_test_split # type: ignore

# Feature matrix: the two feature columns are stacked and transposed so each
# row is one sample. The dict is indexed directly (not via .get) so a missing
# column raises KeyError here instead of silently yielding None and failing
# later inside train_test_split.
X = np.array(
    [dataset["request_num"], dataset["processing_tasks_on_manager_node"]]
).T

# Regression target.
y = np.array(dataset["processed_and_waited_time_on_manager_node"])

# Hold out 20% of the samples for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

## XGBRegressor

In [219]:
import itertools

# Candidate values for every hyper-parameter in the grid search.
params = dict(
    max_depth=[3],
    learning_rate=[0.01],
    n_estimators=[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 2000],
    min_child_weight=[1, 2, 5],
    subsample=[0.8, 0.7, 0.9],
    colsample_bytree=[0.8, 0.5, 0.6, 0.7],
    reg_alpha=[0.1, 0.2],
    reg_lambda=[0.1, 0.2],
)

# Cartesian product of all candidate values, one dict per combination.
# The last key varies fastest.
parameter_combinations = []
for combination in itertools.product(*params.values()):
    parameter_combinations.append(dict(zip(params, combination)))

# Report the size of the grid.
print(f"Number of parameter combinations: {len(parameter_combinations)}")

Number of parameter combinations: 432


In [220]:
from xgboost import XGBRegressor  # type: ignore
import xgboost  # type: ignore
from sklearn.metrics import mean_squared_error  # type: ignore
import cupy as cp  # type: ignore
from typing import Tuple


def create_model(params):
    """Build an unfitted ``XGBRegressor`` from one hyper-parameter dict.

    Args:
        params: Mapping with the keys ``n_estimators``, ``learning_rate``,
            ``max_depth``, ``subsample``, ``colsample_bytree``, ``reg_alpha``
            and ``reg_lambda``. ``min_child_weight`` is optional; when absent
            ``None`` is passed and the library default applies.

    Returns:
        An ``XGBRegressor`` using squared-error loss, the ``hist`` tree method
        and the ``cuda:0`` device.
    """
    return XGBRegressor(
        objective="reg:squarederror",
        n_estimators=params["n_estimators"],
        learning_rate=params["learning_rate"],
        max_depth=params["max_depth"],
        # Previously omitted, so searching over min_child_weight had no effect.
        min_child_weight=params.get("min_child_weight"),
        subsample=params["subsample"],
        colsample_bytree=params["colsample_bytree"],
        reg_alpha=params["reg_alpha"],
        reg_lambda=params["reg_lambda"],
        tree_method="hist",
        device="cuda:0",
    )


def evaluate_model(
    params, X_train, y_train, X_test: np.ndarray, y_test: np.ndarray
) -> Tuple[float, "XGBRegressor"]:
    """Fit one candidate model and score it on the held-out split.

    Args:
        params: Hyper-parameter dict forwarded to ``create_model``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets, compared against the predictions on the host.

    Returns:
        ``(mse, model)``: the mean squared error on the test split and the
        fitted estimator returned by ``create_model``.
    """
    model = create_model(params)

    # Convert the inputs to CuPy arrays before fitting and predicting
    # (presumably to keep the data on the GPU the model is configured for).
    X_train_gpu = cp.asarray(X_train)
    y_train_gpu = cp.asarray(y_train)
    X_test_gpu = cp.asarray(X_test)

    model.fit(X_train_gpu, y_train_gpu, verbose=True)

    # Make predictions using the trained model
    predictions = model.predict(X_test_gpu)

    # asnumpy ensures the predictions are a host NumPy array for scikit-learn.
    mse = mean_squared_error(y_test, cp.asnumpy(predictions))

    print(params, mse)
    return mse, model

In [221]:
from sklearn.metrics import mean_squared_error, mean_absolute_error  # type: ignore


# Grid search: fit every combination and keep the one with the lowest test MSE.
best_params = None
best_mse = float("inf")
best_model = None

for params in parameter_combinations:
    mse, model = evaluate_model(params, X_train, y_train, X_test, y_test)
    if mse < best_mse:
        best_mse = mse
        best_params = params
        # Keep the estimator that achieved the best score. Assigning `model`
        # after the loop would store whichever combination ran last.
        best_model = model

if best_model is None:
    raise RuntimeError(
        "Grid search selected no model: the parameter grid is empty or no "
        "combination produced a finite MSE."
    )

print("Best Parameters:", best_params)
print("Best MSE:", best_mse)

# Error metrics of the selected model on the test split.
preds = best_model.predict(cp.asarray(X_test))

# RMSE is the square root of the MSE.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE: {rmse}")

mae = mean_absolute_error(y_test, preds)
print(f"MAE: {mae}")

avg = np.mean(y_test)
print(f"AVG: {avg}")

# Write the per-sample results to Excel.
sheet_style = ["num", "test", "prediction", "difference", "accuracy"]
data_list = []

for i in range(len(y_test)):
    diff = abs(y_test[i] - preds[i])

    # Accuracy is 1 - relative error. It is reported as 0 when the prediction
    # is negative, when the target is 0 (relative error undefined), or when
    # the relative error falls outside [0, 1).
    if preds[i] < 0 or y_test[i] == 0:
        acc = 0
    else:
        rate = diff / y_test[i]
        acc = 1 - rate if 0 <= rate < 1 else 0

    data_list.append(
        [
            X_test[i][0],
            y_test[i],
            preds[i],
            diff,
            f"{float(acc * 100): .5}%",
        ],
    )

to_excel(
    data=data_list,
    filename=result_output_filename,
    sheet_style=sheet_style,
    result_dir_path=result_dir_path,
    new_sheet=False,
)

# Second sheet: the winning hyper-parameters and their MSE. Header and values
# both come from best_params so they always line up.
params_keys_list = list(best_params.keys())
params_keys_list.append("mse")

params_values_list = list(best_params.values())
params_values_list.append(best_mse)

to_excel(
    data=[params_values_list],
    filename=result_output_filename,
    sheet_style=params_keys_list,
    result_dir_path=result_dir_path,
    new_sheet=True,
    sheet_name="Sheet2",
)
print("Results saved finished.")

{'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 0.1} 11821.249466592373
{'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.5, 'reg_alpha': 0.1, 'reg_lambda': 0.1} 11821.249466592373
{'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.6, 'reg_alpha': 0.1, 'reg_lambda': 0.1} 11821.249466592373
{'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.7, 'reg_alpha': 0.1, 'reg_lambda': 0.1} 11821.249466592373
{'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 0.1} 11823.830324605324
{'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, '

## Model save

In [222]:
from pathlib import Path
# Persist the selected model as JSON under ./modelsfile.
model_file = "xboost_tasks_time" + ".json"
model_dir = Path.cwd() / "modelsfile"
# Create the target folder first so saving also works when it does not exist yet.
model_dir.mkdir(parents=True, exist_ok=True)
model_path = str(model_dir / model_file)
best_model.save_model(model_path)