## Data preprocess

In [1]:
from pathlib import Path
from openpyxl import Workbook, load_workbook  # type: ignore
import os


def to_excel(
    data, filename, sheet_style, result_dir_path, new_sheet=False, sheet_name="Sheet1"
):

    if not os.path.exists(result_dir_path):
        os.makedirs(result_dir_path, exist_ok=True)

    file_path = os.path.join(result_dir_path, f"{filename}.xlsx")

    if os.path.exists(file_path):
        workbook = load_workbook(file_path)
        if new_sheet:
            sheet = workbook.create_sheet(title=sheet_name)

        else:
            sheet = workbook.active
    else:
        workbook = Workbook()
        default_sheet = workbook.active
        workbook.remove(default_sheet)
        sheet = workbook.create_sheet(title=sheet_name)
        
    if sheet_style:
        sheet.append(sheet_style)


    if new_sheet:
        print(data)
    # write data into file
    for row in data:
        sheet.append(row)

    workbook.save(filename=f"{result_dir_path}\\{filename}.xlsx")



# read input dataset
filename_prefix = (
    "RANDclient_org-L1-G200-RB48498-DTFriSep132216422024"
)


dataset_read_filename = filename_prefix
training_data_dir = Path.cwd().parent / "training_data"


# Data preprocessing
file_path = f"{training_data_dir}\\{dataset_read_filename}.xlsx"



# set result output filename and path
result_suffix = "result"

result_dir_path = Path.cwd().parent / "results" / "result_processTime_waitTasks_v3"

if not os.path.exists(result_dir_path):
    os.makedirs(result_dir_path)

version_index = len([_ for _ in Path(result_dir_path).iterdir() if _.is_file()])
version = f"_v{version_index}"

result_name = "processTime#waitTasks" + version
result_output_filename = f"{filename_prefix}{result_name}{result_suffix}"

In [2]:

import pandas as pd # type: ignore
import numpy as np # type: ignore


def read_data(filename):
    df = pd.read_excel(filename)
    columns = df.columns.to_list()
    data_dict = {col: df[col].to_list() for col in columns}
    return data_dict


def data_preprocess(filepath):
    data = read_data(filepath)
    # TODO more...

    # to numpy
    for key in data.keys():
        data[key] = np.array(data[key])

    return data

In [3]:
dataset = data_preprocess(file_path)

# if wait time < 1 => no wait [ remove ]
is_remove_no_wait = False

if is_remove_no_wait:
    index_list = list()
    for key, value in dataset.items():
        for v in value:
            if key == 'worker_wait_time' and v < 1:
                index_list.append(list(dataset.get(key)).index(v))

    for key, value in dataset.items():
        dataset[key] = [x for i, x in enumerate(value) if i not in index_list]

## DATA Style View

In [4]:
# # dataset is a dictionary

print(type(dataset))

<class 'dict'>


## XGBOOST - 1

### Data split

In [5]:
from sklearn.model_selection import train_test_split # type: ignore
import cupy as cp                                      # type: ignore

y = cp.array(dataset.get('worker_wait_time'))

## Strandard Scaler

In [6]:
from sklearn.preprocessing import StandardScaler       # type: ignore
from sklearn.preprocessing import MinMaxScaler         # type: ignore
from sklearn.preprocessing import PowerTransformer     # type: ignore
import numpy as np                                     # type: ignore

_X = [dataset.get("request_num"), dataset.get(
    "jobs_on_worker_node")]

for i in range(len(_X)):
    _X[i] = np.array(_X[i]).reshape(-1, 1)
    

# _scaler = "normalization"
_scaler = None

if _scaler == "standard":
    X_scaler = StandardScaler()
    for i in range(len(_X)):
        _X[i] = X_scaler.fit_transform(_X[i])
elif _scaler == "normalization":
    X_scaler = MinMaxScaler()
    for i in range(len(_X)):
        _X[i] = X_scaler.fit_transform(_X[i])
elif _scaler == "log":
    for i in range(len(_X)):
        _X[i] = np.log1p(_X[i])
elif _scaler == "power":
    X_scaler = PowerTransformer(method='yeo-johnson')
    for i in range(len(_X)):
        _X[i] = X_scaler.fit_transform(_X[i])
else:
    pass

for i in range(len(_X)):
    _X[i] = _X[i].reshape(-1)

X = cp.asarray(np.array(
    _X
).T)

## XGBRegressor

In [7]:
import itertools

# Define the parameter ranges
params = {
    "max_depth": [3, 4, 5, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [1000, 1500, 2000],
    "min_child_weight": [1],
    "subsample": [0.7],
    "colsample_bytree": [0.8],
    "reg_alpha": [1],
    "reg_lambda": [1],
}


# Generate all combinations of parameters
parameter_combinations = [
    dict(zip(params.keys(), combination))
    for combination in itertools.product(*params.values())
]

# Print the number of parameter combinations and a few examples
print(f"Number of parameter combinations: {len(parameter_combinations)}")

Number of parameter combinations: 36


In [8]:
from xgboost import XGBRegressor  # type: ignore
import xgboost  # type: ignore
from sklearn.metrics import mean_squared_error, mean_absolute_error  # type: ignore
from typing import Tuple


# Create the XGBoost regressor model
def create_model(params):
    # if use gpu to add these two params
    # - tree_method="hist",
    # - device="cuda",
    
    return XGBRegressor(
        objective="reg:squarederror",
        n_estimators=params["n_estimators"],
        learning_rate=params["learning_rate"],
        max_depth=params["max_depth"],
        subsample=params["subsample"],
        colsample_bytree=params["colsample_bytree"],
        reg_alpha=params["reg_alpha"],
        reg_lambda=params["reg_lambda"],
    )


# Function to evaluate model performance
def evaluate_model(
    params, X_train, y_train, X_test: np.array, y_test: np.array
) -> Tuple[np.float64, xgboost.Booster]:

    model = create_model(params)

    # Convert CuPy arrays to NumPy arrays

    model.fit(X_train, y_train, verbose=True)

    # Make predictions using the trained model
    predictions = model.predict(X_test)


    # typecasting  cp -> np
    y_test_np = cp.asnumpy(y_test)
    X_test_np = cp.asnumpy(X_test)
    predictions_np = cp.asnumpy(predictions)


    # Calculate mean squared error
    mse = float(mean_squared_error(y_test_np, predictions_np))
    mae = float(mean_absolute_error(y_test_np, predictions_np))
    avg = float(np.mean(y_test_np))

    # calculate diffirence and accuracy
    data_list = list()
    for i in range(len(y_test_np)):
        acc = 0
        diff = abs(y_test_np[i] - predictions_np[i])
        if predictions[i] < 0:
            acc = 0
        else:
            rate = diff / y_test_np[i]
            if rate < 1 and rate >= 0:
                acc = 1 - rate
            if rate > 1:
                acc = 0

        data_list.append(
            [
                float(X_test_np[i][0]),
                float(X_test_np[i][1]),
                float(y_test_np[i]),
                float(predictions_np[i]),
                float(diff),
                float(round(acc, 5)),
            ],
        )

    results_tuple = {"mse": mse, "mae": mae, "avg": avg, "model": model, "data_list": data_list}
    filtered_results = {k: v for k, v in results_tuple.items() if k != "model" and k != "data_list"}
    print(filtered_results, f'\n{params}', '\n')
    return results_tuple

In [9]:
# Initialize variables to track the best parameters
best_params = None
best_result = None
best_data = None

# results param
best_mse = float("inf")
best_mae = float("inf")

sheet_style = ["num", "jobs", "X_test", "prediction", "difference", "accuracy"]

# control sheet style of excel written
first_write = True

# Evaluate each parameter combination
for params in parameter_combinations:

    # split dataset to train dataset and test dataset for every params group
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    results_tuple = evaluate_model(params, X_train, y_train, X_test, y_test)

    filtered_results = {
        k: v for k, v in results_tuple.items() if k != "model" and k != "data_list"
    }

    # update best model
    if results_tuple['mse'] < best_mse:
        best_mse = results_tuple["mse"]
        best_params = params
        best_model = results_tuple["model"]
        best_result = filtered_results
        best_data = results_tuple['data_list']

print("Best Parameters:", best_params)
print("Best MSE:", best_mse)
print("Best MAE:", best_result["mae"])
print("Average:", best_result["avg"])
print("Results saved finished.")

params_keys_list = list(best_params.keys())
params_values_list = list(best_params.values())

to_excel(
    data=best_data,
    filename=result_output_filename,
    sheet_style=sheet_style,
    result_dir_path=result_dir_path,
    new_sheet=True,  # Create a new sheet for the best results
)

# best mse params
to_excel(
    data=[params_values_list],
    filename=result_output_filename,
    sheet_style=params_keys_list,
    result_dir_path=result_dir_path,
    new_sheet=True,
    sheet_name="best mse params",
)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




{'mse': 290.4684425087009, 'mae': 12.849734785467694, 'avg': 136.99068983041133} 
{'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 1000, 'min_child_weight': 1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 1, 'reg_lambda': 1} 

{'mse': 289.50316005522063, 'mae': 12.783872580747168, 'avg': 136.99068983041133} 
{'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 1500, 'min_child_weight': 1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 1, 'reg_lambda': 1} 

{'mse': 289.59326740323837, 'mae': 12.78114261497213, 'avg': 136.99068983041133} 
{'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 2000, 'min_child_weight': 1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 1, 'reg_lambda': 1} 

{'mse': 290.0640413711327, 'mae': 12.789791659133067, 'avg': 136.99068983041133} 
{'max_depth': 3, 'learning_rate': 0.05, 'n_estimators': 1000, 'min_child_weight': 1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 1, 'reg_lambda': 1} 

{'mse': 290.5121128

## Model save

In [10]:
from pathlib import Path
model_file = "xgb_tasks_time_fn" + ".json"
model_path = str(Path.cwd() / "modelsfile" / model_file)
best_model.save_model(model_path)