## Data preprocess

In [None]:
from pathlib import Path
from openpyxl import Workbook, load_workbook  # type: ignore
import os



def to_excel(
    data, filename, sheet_style, result_dir_path, new_sheet=False, sheet_name="Sheet1"
):

    if not os.path.exists(result_dir_path):
        os.makedirs(result_dir_path, exist_ok=True)

    file_path = os.path.join(result_dir_path, f"{filename}.xlsx")

    if os.path.exists(file_path):
        workbook = load_workbook(file_path)
        if new_sheet:
            sheet = workbook.create_sheet(title=sheet_name)

        else:
            sheet = workbook.active
    else:
        workbook = Workbook()
        default_sheet = workbook.active
        workbook.remove(default_sheet)
        sheet = workbook.create_sheet(title=sheet_name)
        
    if sheet_style:
        sheet.append(sheet_style)


    if new_sheet:
        print(data)
    # write data into file
    for row in data:
        sheet.append(row)

    workbook.save(filename=f"{result_dir_path}\\{filename}.xlsx")



# read input dataset
filename_prefix = (
    "RandomRequestNumberclientv_single_worker_node#loops3#requests_batch150#Thu-Aug-22-20-43-55-2024"
)


dataset_read_filename = filename_prefix
training_data_dir = Path.cwd().parent / "training_data" / "data_set8"


# Data preprocessing
file_path = f"{training_data_dir}\\{dataset_read_filename}.xlsx"



# set result output filename and path
result_suffix = "result"

result_dir_path = Path.cwd().parent / "results" / "result_processTime_waitTasks_v3"

if not os.path.exists(result_dir_path):
    os.makedirs(result_dir_path)

version_index = len([_ for _ in Path(result_dir_path).iterdir() if _.is_file()])
version = f"_v{version_index}"

result_name = "processTime#waitTasks" + version
result_output_filename = f"{filename_prefix}{result_name}{result_suffix}"

In [None]:

import pandas as pd # type: ignore
import numpy as np # type: ignore


def read_data(filename):
    df = pd.read_excel(filename)
    columns = df.columns.to_list()
    data_dict = {col: df[col].to_list() for col in columns}
    return data_dict


def data_preprocess(filepath):
    data = read_data(filepath)
    # TODO more...

    # to numpy
    for key in data.keys():
        data[key] = np.array(data[key])

    return data

In [None]:
dataset = data_preprocess(file_path)

## DATA Style View

In [None]:
# # dataset is a dictionary

print(type(dataset))

## XGBOOST - 1

### Data split

In [5]:
from sklearn.model_selection import train_test_split # type: ignore
import cupy as cp

y = cp.array(dataset.get("processed_and_waited_time_on_manager_node"))



## Strandard Scaler

In [6]:
from sklearn.preprocessing import StandardScaler       # type: ignore
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer


_X = [dataset.get("request_num"), dataset.get("sub_processing_tasks_on_manager_node"), dataset.get("processed_time_on_worker_node")]

for i in range(len(_X)):
    _X[i] = np.array(_X[i]).reshape(-1, 1)
    

_scaler = "normalization"

if _scaler == "standard":
    X_scaler = StandardScaler()
    for i in range(len(_X)):
        _X[i] = X_scaler.fit_transform(_X[i])
elif _scaler == "normalization":
    X_scaler = MinMaxScaler()
    for i in range(len(_X)):
        _X[i] = X_scaler.fit_transform(_X[i])
elif _scaler == "log":
    for i in range(len(_X)):
        _X[i] = np.log1p(_X[i])
elif _scaler == "power":
    X_scaler = PowerTransformer(method='yeo-johnson')
    for i in range(len(_X)):
        _X[i] = X_scaler.fit_transform(_X[i])
else:
    pass

for i in range(len(_X)):
    _X[i] = _X[i].reshape(-1)

X = cp.asarray(np.array(
    _X
).T)

## XGBRegressor

In [7]:
import itertools

# Define the parameter ranges
params = {
    "max_depth": [5],
    "learning_rate": [0.01],
    "n_estimators": [i for i in range(100, 4000, 100)],
    "min_child_weight": [1],
    "subsample": [0.1],
    "colsample_bytree": [0.1],
    "reg_alpha": [0, 0.1],
    "reg_lambda": [i / 10 for i in range(10, 50, 1)],
}

# params = {
#     "max_depth": [3],
#     "learning_rate": [0.01],
#     "n_estimators": [3800],
#     "min_child_weight": [1],
#     "subsample": [0.1],
#     "colsample_bytree": [0.1],
#     "reg_alpha": [0],
#     "reg_lambda": [3.6],
# }


# Generate all combinations of parameters
parameter_combinations = [
    dict(zip(params.keys(), combination))
    for combination in itertools.product(*params.values())
]

# Print the number of parameter combinations and a few examples
print(f"Number of parameter combinations: {len(parameter_combinations)}")

Number of parameter combinations: 3120


In [8]:
from xgboost import XGBRegressor  # type: ignore
import xgboost  # type: ignore
from sklearn.metrics import mean_squared_error, mean_absolute_error  # type: ignore
from typing import Tuple


# Create the XGBoost regressor model
def create_model(params):
    # if use gpu to add these two params
    # - tree_method="hist",
    # - device="cuda",
    
    return XGBRegressor(
        objective="reg:squarederror",
        n_estimators=params["n_estimators"],
        learning_rate=params["learning_rate"],
        max_depth=params["max_depth"],
        subsample=params["subsample"],
        colsample_bytree=params["colsample_bytree"],
        reg_alpha=params["reg_alpha"],
        reg_lambda=params["reg_lambda"],
        device="cuda",
        tree_method="hist"
    )


# Function to evaluate model performance
def evaluate_model(
    params, X_train, y_train, X_test: np.array, y_test: np.array
) -> Tuple[np.float64, xgboost.Booster]:

    model = create_model(params)

    # Convert CuPy arrays to NumPy arrays

    model.fit(X_train, y_train, verbose=True)

    # Make predictions using the trained model
    predictions = model.predict(X_test)


    # typecasting  cp -> np
    y_test_np = cp.asnumpy(y_test)
    X_test_np = cp.asnumpy(X_test)
    predictions_np = cp.asnumpy(predictions)


    # Calculate mean squared error
    mse = float(mean_squared_error(y_test_np, predictions_np))
    mae = float(mean_absolute_error(y_test_np, predictions_np))
    avg = float(np.mean(y_test_np))
    avg_acc = 100 * (avg - mae) / avg

    # calculate diffirence and accuracy
    data_list = list()
    for i in range(len(y_test_np)):
        acc = 0
        diff = abs(y_test_np[i] - predictions_np[i])
        if predictions[i] < 0:
            acc = 0
        else:
            rate = diff / y_test_np[i]
            if rate < 1 and rate >= 0:
                acc = 1 - rate
            if rate > 1:
                acc = 0
        data_list.append(
            [
                float(X_test_np[i][0]),
                float(X_test_np[i][1]),
                float(X_test_np[i][2]),
                float(y_test_np[i]),
                float(predictions_np[i]),
                float(diff),
                float(round(acc, 5)),
            ],
        )

    results_tuple = {"mse": mse, "mae": mae, "avg": avg, "avg_acc": avg_acc, "model": model, "data_list": data_list}
    filtered_results = {k: v for k, v in results_tuple.items() if k != "model" and k != "data_list"}
    print(filtered_results, f'\n{params}', '\n')
    return results_tuple

In [9]:
# Initialize variables to track the best parameters
best_params = None
best_result = None

# results param
best_mse = float("inf")
best_avg_acc = float(-1)
best_mae = float("inf")

# result params tuple
best_avg_acc_params = None
best_mae_params = None


# control sheet style of excel written
first_write = True



# Evaluate each parameter combination
for params in parameter_combinations:

    # split dataset to train dataset and test dataset for every params group
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=None
    )
    results_tuple = evaluate_model(params, X_train, y_train, X_test, y_test)

    filtered_results = {
        k: v for k, v in results_tuple.items() if k != "model" and k != "data_list"
    }

    # update best model
    if results_tuple['mse'] < best_mse:
        best_mse = results_tuple["mse"]
        best_params = params
        best_model = results_tuple["model"]
        best_result = filtered_results

    # write result into excel file
    sheet_style = ["num", "jobs", "X_test", "prediction", "difference", "accuracy"]
    avg_acc = results_tuple["avg_acc"]
    data_list = results_tuple['data_list']

    if best_avg_acc < results_tuple['avg_acc']:
        best_avg_acc = results_tuple["avg_acc"]
        best_acc_params = params

    if best_mae > results_tuple['mae']:
        best_mae = results_tuple["mae"]
        best_mae_params = params

    if first_write:
        # sheet 1
        to_excel(
            data=data_list,
            filename=result_output_filename,
            sheet_style=sheet_style,
            result_dir_path=result_dir_path,
            new_sheet=False,
        )
        first_write = False
    else:
        to_excel(
            data=data_list,
            filename=result_output_filename,
            sheet_style=None,
            result_dir_path=result_dir_path,
            new_sheet=False,
        )
    print(f"Best acc {best_avg_acc}")

print("Best Parameters:", best_params)
print("Best MSE:", best_mse)
print("Best MAE:", best_result["mae"])
print("Average:", best_result["avg"])
print("Average accuracy:", best_result["avg_acc"])
print("Results saved finished.")

params_keys_list = list(best_params.keys())
params_values_list = list(best_params.values())

# best mse params
to_excel(
    data=[params_values_list],
    filename=result_output_filename,
    sheet_style=params_keys_list,
    result_dir_path=result_dir_path,
    new_sheet=True,
    sheet_name="best mse params",
)


result_keys_list = list(filtered_results.keys())

# avg_acc sheet
to_excel(
    data=[list(best_acc_params.values())],
    filename=result_output_filename,
    sheet_style=result_keys_list,
    result_dir_path=result_dir_path,
    new_sheet=True,
    sheet_name="best avg acc params",
)


# mae sheet
to_excel(
    data=[list(best_mae_params.values())],
    filename=result_output_filename,
    sheet_style=result_keys_list,
    result_dir_path=result_dir_path,
    new_sheet=True,
    sheet_name="best mae params",
)

{'mse': 718.0167216935415, 'mae': 23.329663565423754, 'avg': 46.45586587323083, 'avg_acc': 49.78101661244256} 
{'max_depth': 5, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.1, 'colsample_bytree': 0.1, 'reg_alpha': 0, 'reg_lambda': 1.0} 

Best acc 49.78101661244256
{'mse': 629.2455158572657, 'mae': 21.82968061765035, 'avg': 40.82756203545465, 'avg_acc': 46.53200061592348} 
{'max_depth': 5, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.1, 'colsample_bytree': 0.1, 'reg_alpha': 0, 'reg_lambda': 1.1} 

Best acc 49.78101661244256
{'mse': 769.1835293595933, 'mae': 24.20462843047248, 'avg': 49.50597881211175, 'avg_acc': 51.10766616223178} 
{'max_depth': 5, 'learning_rate': 0.01, 'n_estimators': 100, 'min_child_weight': 1, 'subsample': 0.1, 'colsample_bytree': 0.1, 'reg_alpha': 0, 'reg_lambda': 1.2} 

Best acc 51.10766616223178
{'mse': 912.9149254163663, 'mae': 27.042403973473444, 'avg': 46.654923322465685, 'avg_acc': 42.0

KeyboardInterrupt: 

## Model save

In [None]:
# from pathlib import Path
# model_file = "xboost_tasks_time" + ".json"
# model_path = str(Path.cwd() / "modelsfile" / model_file)
# best_model.save_model(model_path)