# Fitting XGBoost and linear regression models
## Prepare data

In [1]:
# load data
from pathlib import Path
import json

# The combined measurements are expected at ./combined_data/combined_data.json,
# resolved against the current working directory.
data_path = Path.cwd() / 'combined_data' / 'combined_data.json'

# Decode explicitly as UTF-8: without an encoding argument open() falls back to
# the platform locale encoding, which is not necessarily UTF-8.
with open(data_path, 'r', encoding='utf-8') as f:
    # dict(...) makes a shallow copy of the top-level JSON object.
    data = dict(json.load(f))

# Show which top-level groups the file contains.
print(data.keys())


dict_keys(['CPU', 'HDD', 'MEM'])


In [2]:
data: dict

# Integer label stored in the 'task_class' column for each top-level group.
class_by_group = {'HDD': 1, 'MEM': 2, 'CPU': 3}

for d_key in data.keys():
    # d_key is one of the top-level groups: mem, cpu, hdd
    if d_key not in class_by_group:
        raise ValueError(f"Unknown key {d_key} in data")
    # One label per sample, sized from the 'processing_time' column.
    sample_count = len(data[d_key]['processing_time'])
    data[d_key]['task_class'] = [class_by_group[d_key]] * sample_count

# Report the number of labelled samples per group ...
for group in ('CPU', 'HDD', 'MEM'):
    print(len(data[group]['task_class']))

# ... followed by the columns available in each group.
for group in ('CPU', 'HDD', 'MEM'):
    print(data[group].keys())

print()

6000
4602
4206
dict_keys(['processing_time', 'response_host', 'number', 'memory_limit_mb', 'cpu_limit_cores', 'hdd_limit_mb', 'task_class'])
dict_keys(['processing_time', 'hdd_free_before', 'response_host', 'size_mb', 'memory_limit_mb', 'cpu_limit_cores', 'hdd_limit_mb', 'task_class'])
dict_keys(['processing_time', 'mem_free_before', 'response_host', 'size_mb', 'memory_limit_mb', 'cpu_limit_cores', 'hdd_limit_mb', 'task_class'])



In [3]:
# Combine all groups into one list of records:
# list[dict[feature_name, feature_value]]


features_list = {
    'processing_time', 'task_class', 'number', 'memory_limit_mb',
    'cpu_limit_cores', 'hdd_limit_mb', 'size_mb', 'hdd_free_before',
    'mem_free_before',
}
# A set of strings iterates in an order that changes between interpreter
# starts (string hash randomisation). Fix the order once so that every record
# lists its keys identically from run to run.
ordered_features = sorted(features_list)
combined_data = list()

for d_key in data.keys():
    sub_list = data[d_key]
    # Row count of a group is taken from its 'processing_time' column.
    for i in range(len(sub_list['processing_time'])):
        data_point = dict()
        for f_key in ordered_features:
            if f_key not in sub_list:
                # set NaN for xgboost compatibility
                data_point[f_key] = float('nan')
            else:
                data_point[f_key] = sub_list[f_key][i]
        combined_data.append(data_point)


print(len(combined_data))

14808


In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Lasso, Ridge
from sklearn.svm import SVR as SkSVR
from sklearn.linear_model import LinearRegression as SkLinearRegression

from cuml.linear_model import LinearRegression as CuLinearRegression
from cuml.svm import SVR as CuSVR

import xgboost as xgb
from torch.utils.tensorboard import SummaryWriter

# TensorBoard sink that receives the best hyper-parameters and scores per model.
writer = SummaryWriter("runs/regression_benchmark_pip")

# ================================
# Load your dataset
# ================================
from sklearn.datasets import fetch_california_housing
# Bound to its own name so that the `data` dict loaded by the earlier cells is
# not overwritten and those cells stay re-runnable.
housing = fetch_california_housing()
X = housing.data.astype(np.float32)
y = housing.target.astype(np.float32)

# ================================
# Models list
# ================================
# Maps a display name to (estimator, hyper-parameter grid for GridSearchCV).
# An empty grid means the estimator is fitted with its constructor settings.
models = {
    "LinearRegression_GPU": (
        CuLinearRegression(),
        {}
    ),
    "Lasso_CPU": (
        Lasso(max_iter=5000),
        {
            "alpha": [0.01, 0.1, 1.0]
        }
    ),
    "Ridge_CPU": (
        Ridge(),
        {
            "alpha": [0.1, 1, 10]
        }
    ),
    # NOTE(review): the original author states that cuML SVR does not accept
    # the `kernel` parameter and that only C, epsilon and cache_size can be
    # tuned -- confirm against the installed cuML version.
    "SVR_GPU": (
        CuSVR(),
        {
            "C": [1, 10, 50],
            "epsilon": [0.0, 0.1, 0.2]
        }
    ),
    "XGBoost_GPU": (
        # GPU training is selected with device="cuda" together with the "hist"
        # tree method. The older "gpu_hist" / predictor="gpu_predictor"
        # spelling is deprecated and must not be combined with `device`.
        xgb.XGBRegressor(
            objective="reg:squarederror",
            tree_method="hist",
            device="cuda",
        ),
        {
            "n_estimators": [50, 100, 200],
            "max_depth": [3, 5, 7],
            "learning_rate": [0.01, 0.1, 0.2]
        }
    )
}

# ================================
# GridSearch + TensorBoard
# ================================
try:
    for name, (model, param_grid) in models.items():

        print(f"==== Training {name} ====")

        gscv = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            scoring="neg_mean_squared_error",
            cv=3,
            verbose=1,
            n_jobs=1,   # cuML models must run with n_jobs=1
        )

        gscv.fit(X, y)

        # NOTE: the metrics below are computed on the same X/y that the search
        # was fitted on, i.e. they are in-sample scores.
        best_model = gscv.best_estimator_
        y_pred = best_model.predict(X)

        rmse = np.sqrt(mean_squared_error(y, y_pred))
        mae = mean_absolute_error(y, y_pred)
        r2 = r2_score(y, y_pred)

        # One TensorBoard hparams entry per model, named after the model.
        writer.add_hparams(
            hparam_dict=gscv.best_params_,
            metric_dict={
                f"{name}_RMSE": rmse,
                f"{name}_MAE": mae,
                f"{name}_R2": r2,
            },
            run_name=name
        )

        print(f"{name} Best Params = {gscv.best_params_}")
        print(f"{name} RMSE={rmse:.4f} MAE={mae:.4f} R2={r2:.4f}")
finally:
    # Close the writer even when one of the fits raises part-way through.
    writer.close()


ModuleNotFoundError: No module named 'numpy'

## README
### dashboard