In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.neural_network import MLPRegressor

In [2]:
# Load the training-data CSV into `df`; every split below is derived from it.
# NOTE(review): this is an absolute, machine-specific path — the notebook
# will not run on another machine until it is pointed at a local copy.
DATA_CSV_PATH = '/home/jolivera/Documents/CloudSkin/Time-Series-Library/ccgrid_scripts/data/training_data_scored_pairs.csv'
df = pd.read_csv(DATA_CSV_PATH)

In [3]:
# Split rows into train/val/test frames by app user
def split_by_app_user(df, train_users, val_users, test_users):
    """Partition ``df`` into train/validation/test frames by app user.

    A row goes into a split when its ``"torchserve_app_user"`` value is a
    member of that split's user collection. Rows whose user appears in none
    of the collections are not returned in any frame.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``"torchserve_app_user"`` column.
    train_users, val_users, test_users : list-like
        User identifiers assigned to each split.

    Returns
    -------
    tuple of pd.DataFrame
        ``(train_df, val_df, test_df)``; each is an independent copy, so
        modifying it does not touch ``df``.
    """
    def _rows_for(users):
        # Boolean-mask selection followed by a copy to detach from `df`.
        belongs = df["torchserve_app_user"].isin(users)
        return df[belongs].copy()

    return _rows_for(train_users), _rows_for(val_users), _rows_for(test_users)

# Calculate percentages
def calculate_split_percentages(train_df, val_df, test_df):
    """Summarise how many rows each split holds, in counts and percentages.

    Parameters
    ----------
    train_df, val_df, test_df : sized objects (e.g. pd.DataFrame)
        Only ``len()`` is taken of each argument.

    Returns
    -------
    dict
        Keys ``"train_pct"``, ``"val_pct"``, ``"test_pct"`` (share of the
        combined row count, in percent), ``"train_rows"``, ``"val_rows"``,
        ``"test_rows"`` and ``"total_rows"``. When all three splits are
        empty the percentages are reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    train_rows, val_rows, test_rows = len(train_df), len(val_df), len(test_df)
    total = train_rows + val_rows + test_rows

    def _pct(rows):
        # Guard the empty case: there is no meaningful share of zero rows.
        return rows / total * 100 if total else 0.0

    return {
        "train_pct": _pct(train_rows),
        "val_pct": _pct(val_rows),
        "test_pct": _pct(test_rows),
        "train_rows": train_rows,
        "val_rows": val_rows,
        "test_rows": test_rows,
        "total_rows": total
    }

# Create X and y splits
def make_xy(df):
    """Extract the model inputs ``X`` and the regression targets ``y`` from ``df``.

    ``X`` holds the app-user column, the source and target node ids and the
    eight ``*_src`` metric columns; ``y`` holds the eight ``*_tgt`` metric
    columns. Both are returned as copies, in the column order listed below.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain every column named in this function.

    Returns
    -------
    tuple of pd.DataFrame
        ``(X, y)`` with 11 and 8 columns respectively.
    """
    feature_columns = [
        # identifiers
        "torchserve_app_user",
        "node_id_src",
        "node_id_tgt",
        # metrics observed on the source side
        "torchserve_node_cpu_src",
        "torchserve_node_energy_src",
        "torchserve_node_power_src",
        "torchserve_app_cpu_src",
        "torchserve_app_energy_src",
        "torchserve_app_power_src",
        "torchserve_app_latency_src",
        "torchserve_app_qps_src",
    ]
    target_columns = [
        # the same metrics on the target side
        "torchserve_node_cpu_tgt",
        "torchserve_node_energy_tgt",
        "torchserve_node_power_tgt",
        "torchserve_app_cpu_tgt",
        "torchserve_app_energy_tgt",
        "torchserve_app_power_tgt",
        "torchserve_app_latency_tgt",
        "torchserve_app_qps_tgt",
    ]
    return df[feature_columns].copy(), df[target_columns].copy()

def shuffle_df(X, y, random_state=42):
    """Shuffle the rows of ``X`` and ``y`` with one shared permutation.

    The permutation is drawn with pandas' ``sample(frac=1, random_state=...)``
    over row *positions*, so for a given ``random_state`` and row count the
    resulting order is the same one ``X.sample(frac=1, ...)`` would produce.

    Selecting rows by position rather than by label matters when ``X`` has
    repeated index labels: a label-based ``X.loc[...]`` lookup would return
    every row carrying a repeated label once per occurrence and inflate the
    output.

    Parameters
    ----------
    X, y : pd.DataFrame
        Feature and target frames describing the same rows.
    random_state : int or numpy random state, optional
        Seed forwarded to ``sample``.

    Returns
    -------
    tuple of pd.DataFrame
        ``(X_shuffled, y_shuffled)``, both with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``X`` has duplicate index labels and ``y`` has a different number
        of rows, since rows can then only be paired by position.
    """
    # Permutation of 0..len(X)-1; depends only on the row count and the seed.
    positions = (
        pd.Series(np.arange(len(X)))
        .sample(frac=1, random_state=random_state)
        .to_numpy()
    )
    X_shuffled = X.iloc[positions].reset_index(drop=True)

    if X.index.is_unique:
        # Unique labels: keep pairing y to X by label.
        y_shuffled = y.loc[X.index[positions]].reset_index(drop=True)
    else:
        # Ambiguous labels: pair by row position instead.
        if len(y) != len(X):
            raise ValueError(
                "X has duplicate index labels, so X and y must have the same "
                "number of rows to be shuffled together"
            )
        y_shuffled = y.iloc[positions].reset_index(drop=True)

    return X_shuffled, y_shuffled

def evaluate_model(model, X_val, y_val, plot=False):
    """
    Evaluate a multi-output regression model on validation data.

    For every column of ``y_val`` the function computes R2, MAE, RMSE and
    MAPE (in percent), optionally draws diagnostic plots, appends two
    summary rows, prints the resulting table and returns it.

    Parameters
    ----------
    model : fitted sklearn-like regressor
        Only ``model.predict(X_val)`` is called. The prediction may be a 2-D
        array-like with one column per target (in ``y_val`` column order), a
        list of per-target arrays, or a 1-D array when there is a single
        target.
    X_val : array-like
        Features passed to ``model.predict``.
    y_val : pd.DataFrame
        True target values, one column per target.
    plot : bool, optional
        If True, show a "Predicted vs Actual" and a "Residuals vs Predicted"
        scatter plot for each target.

    Returns
    -------
    pd.DataFrame
        Columns ``Target, R2, MAE, RMSE, MAPE``: one row per target followed
        by ``"Overall (mean)"`` (unweighted mean of each metric across
        targets) and ``"Overall (global)"`` (``r2_score`` on the full
        matrices; its MAE/RMSE/MAPE entries are left empty).
    """

    targets = list(y_val.columns)
    y_pred = model.predict(X_val)

    # Normalise the prediction to a 2-D array with one column per target.
    if isinstance(y_pred, list):
        y_pred = np.column_stack(y_pred)
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 1:
        # Single-target models commonly return a flat vector.
        y_pred = y_pred.reshape(-1, 1)

    results, r2_list, mae_list, rmse_list, mape_list = [], [], [], [], []

    for i, target in enumerate(targets):
        y_true_t = y_val[target].values
        y_pred_t = y_pred[:, i]

        r2 = r2_score(y_true_t, y_pred_t)
        mae = mean_absolute_error(y_true_t, y_pred_t)
        # Take the square root explicitly: the `squared=False` keyword of
        # mean_squared_error is deprecated and absent from newer
        # scikit-learn releases, while this form works on every version.
        rmse = np.sqrt(mean_squared_error(y_true_t, y_pred_t))
        # Zero targets are divided by 1e-8 instead of 0. This avoids inf/NaN,
        # but any such row contributes a very large percentage to the mean.
        safe_denominator = np.where(y_true_t == 0, 1e-8, y_true_t)
        mape = np.mean(np.abs((y_true_t - y_pred_t) / safe_denominator)) * 100

        results.append([target, r2, mae, rmse, mape])
        r2_list.append(r2)
        mae_list.append(mae)
        rmse_list.append(rmse)
        mape_list.append(mape)

        if plot:
            fig, axes = plt.subplots(1, 2, figsize=(10, 4))
            fig.suptitle(target, fontsize=13)

            # Left panel: predictions against truth, with the y = x reference line.
            axes[0].scatter(y_true_t, y_pred_t, alpha=0.6)
            axes[0].plot([y_true_t.min(), y_true_t.max()],
                         [y_true_t.min(), y_true_t.max()], 'r--', lw=1)
            axes[0].set_xlabel("Actual")
            axes[0].set_ylabel("Predicted")
            axes[0].set_title("Predicted vs Actual")

            # Right panel: residuals (actual - predicted) against predictions.
            residuals = y_true_t - y_pred_t
            axes[1].scatter(y_pred_t, residuals, alpha=0.6)
            axes[1].axhline(0, color='red', linestyle='--', lw=1)
            axes[1].set_xlabel("Predicted")
            axes[1].set_ylabel("Residual")
            axes[1].set_title("Residuals vs Predicted")

            plt.tight_layout()
            plt.show()
            # One figure per target: release it so they do not pile up in memory.
            plt.close(fig)

    overall_r2_mean = np.mean(r2_list)
    # NOTE(review): with r2_score's default multi-output setting this appears
    # to be the same uniform average as overall_r2_mean — confirm whether a
    # variance-weighted score was intended for the "global" row.
    overall_r2_global = r2_score(y_val, y_pred)
    overall_mae = np.mean(mae_list)
    overall_rmse = np.mean(rmse_list)
    overall_mape = np.mean(mape_list)

    results_df = pd.DataFrame(results, columns=["Target", "R2", "MAE", "RMSE", "MAPE"])
    overall_row = pd.DataFrame({
        "Target": ["Overall (mean)", "Overall (global)"],
        "R2": [overall_r2_mean, overall_r2_global],
        "MAE": [overall_mae, None],
        "RMSE": [overall_rmse, None],
        "MAPE": [overall_mape, None]
    })
    results_df = pd.concat([results_df, overall_row], ignore_index=True)

    print(results_df)
    return results_df


In [4]:
# Build the train/validation/test sets.
# Splits are defined by app user; the three user lists below are disjoint,
# so all rows of a listed user end up in exactly one split.
train_users = [1, 13, 25, 31, 43, 55]
val_users = [7, 19]
test_users = [37, 49]

train_df, val_df, test_df = split_by_app_user(
    df,
    train_users=train_users,
    val_users=val_users,
    test_users=test_users,
)

# Report how the rows are distributed across the splits.
stats = calculate_split_percentages(train_df, val_df, test_df)
print(stats)

# For each split: select features/targets, then shuffle both with the same
# (default-seeded) permutation.
X_train, y_train = shuffle_df(*make_xy(train_df))
X_val, y_val = shuffle_df(*make_xy(val_df))
X_test, y_test = shuffle_df(*make_xy(test_df))

print("Shapes:")
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:", X_val.shape, "y_val:", y_val.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)


{'train_pct': 58.77706732577508, 'val_pct': 21.147247787406833, 'test_pct': 20.075684886818088, 'train_rows': 153146, 'val_rows': 55100, 'test_rows': 52308, 'total_rows': 260554}
Shapes:
X_train: (153146, 11) y_train: (153146, 8)
X_val: (55100, 11) y_val: (55100, 8)
X_test: (52308, 11) y_test: (52308, 8)


In [5]:
# Settings for the Random Forest section.
# NOTE(review): `encode` is not read by any of the visible cells — confirm it is still needed.
encode = 1
# Filename prefix of the pickled model that the load cell below reads.
model_name = "random_forest"

# First model: Random forest

In [6]:
# rf =RandomForestRegressor(
#         n_estimators=200,
#         max_depth=None,
#         random_state=42,
#         n_jobs=-1
#     )

# rf.fit(X_train, y_train)

# print("Train R^2:", rf.score(X_train, y_train))
# print("Val R^2:", rf.score(X_val, y_val))


In [7]:

# Pin the model name in this cell, as the XGBoost and MLP load cells do.
# `model_name` is reassigned further down the notebook, so without this a
# re-run of this cell after those cells would load a different model's
# pickle into `rf_loaded`.
model_name = "random_forest"

# # save the trained model
# joblib.dump(rf, f"./models/{model_name}_multioutput_scoredpairs.pkl")

# later, load it back
# joblib.load unpickles the file, which can execute arbitrary code — only
# load model files from a trusted source.
rf_loaded = joblib.load(f"./models/{model_name}_multioutput_scoredpairs.pkl")

# check it's working: score the restored model on the validation split
print("Loaded model Val R^2:", rf_loaded.score(X_val, y_val))


Loaded model Val R^2: 0.8393735987692135


In [8]:
# Per-target validation metrics for the restored Random Forest (no plots).
results_df = evaluate_model(model=rf_loaded, X_val=X_val, y_val=y_val, plot=False)


                       Target        R2         MAE        RMSE       MAPE
0     torchserve_node_cpu_tgt  0.879076  246.770442  329.942019  11.052893
1  torchserve_node_energy_tgt  0.992894  214.542408  335.907626   5.597639
2   torchserve_node_power_tgt  0.988198    4.834819    7.109354   6.553331
3      torchserve_app_cpu_tgt  0.892616  272.062274  356.213509  16.923628
4   torchserve_app_energy_tgt  0.904240  238.149185  358.747759  16.686388
5    torchserve_app_power_tgt  0.899360    4.574169    7.058053  17.845120
6  torchserve_app_latency_tgt  0.332441   15.968049   23.495710  16.310640
7      torchserve_app_qps_tgt  0.826164    4.552061    6.048523  18.730480
8              Overall (mean)  0.839374  125.181676  178.065319  13.712515
9            Overall (global)  0.839374         NaN         NaN        NaN


# XGBoost

In [9]:
# # Define model
# base_xgb = XGBRegressor(
#     n_estimators=300,
#     learning_rate=0.01,
#     max_depth=6,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     objective='reg:squarederror',
#     random_state=42,
#     n_jobs=-1,
#     enable_categorical=True
# )

# # Wrap for multi-output regression
# xgb_multi = MultiOutputRegressor(base_xgb)

# # Fit
# xgb_multi.fit(X_train, y_train)


In [10]:
# Filename prefix for the XGBoost multi-output model.
model_name = "xgb_multi"
xgb_model_path = f"./models/{model_name}_multioutput_scoredpairs.pkl"

# The model was trained and pickled once (the training cell above is
# commented out). To regenerate the file, re-enable that cell and run:
#     joblib.dump(xgb_multi, xgb_model_path)

# Restore the persisted model. joblib.load unpickles the file, so only load
# model files from a trusted source.
xgb_multi_loaded = joblib.load(xgb_model_path)

# Sanity check: score the restored model on the validation split.
# NOTE(review): the output stored with this cell shows a negative validation
# R^2 — review the training configuration before relying on this model.
xgb_val_r2 = xgb_multi_loaded.score(X_val, y_val)
print("Loaded model Val R^2:", xgb_val_r2)


Loaded model Val R^2: -0.8868972836132697


In [11]:
# Per-target validation metrics for the restored XGBoost model (no plots).
results_df = evaluate_model(model=xgb_multi_loaded, X_val=X_val, y_val=y_val, plot=False)

                       Target        R2          MAE         RMSE        MAPE
0     torchserve_node_cpu_tgt -0.046240   801.032126   970.502804   31.722424
1  torchserve_node_energy_tgt -1.816044  6111.776858  6687.079668  144.751315
2   torchserve_node_power_tgt -1.839168   100.568310   110.266388  137.350180
3      torchserve_app_cpu_tgt -0.038272   957.221580  1107.632075   47.729577
4   torchserve_app_energy_tgt -1.074274  1433.733047  1669.669400  113.065229
5    torchserve_app_power_tgt -0.946045    26.448835    31.036710  107.510537
6  torchserve_app_latency_tgt -1.850668    39.665199    48.553123   43.963984
7      torchserve_app_qps_tgt  0.515532     9.394704    10.097453   38.701366
8              Overall (mean) -0.886897  1184.980082  1329.354702   83.099327
9            Overall (global) -0.886897          NaN          NaN         NaN


# Neural Networks: MLP
### First: scaling and one-hot encoding

In [12]:
# Column roles. Every column not listed as categorical — including
# "torchserve_app_user" — is treated as numeric and min-max scaled.
cat_features = ["node_id_src", "node_id_tgt"]
num_features = [col for col in X_train.columns if col not in cat_features]

# Both transformers are fitted on the training split only, so no information
# from the validation/test splits leaks into the preprocessing.
# Categories unseen during fit are encoded as all zeros (handle_unknown="ignore").
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoder.fit(X_train[cat_features])

x_scaler = MinMaxScaler()
x_scaler.fit(X_train[num_features])


def one_hot_encode_split(split):
    """Return the one-hot encoded categorical columns of ``split``, keeping its index."""
    return pd.DataFrame(
        encoder.transform(split[cat_features]),
        columns=encoder.get_feature_names_out(cat_features),
        index=split.index,
    )


def min_max_scale_split(split):
    """Return the scaled numeric columns of ``split``, keeping its index and column names."""
    return pd.DataFrame(
        x_scaler.transform(split[num_features]),
        columns=num_features,
        index=split.index,
    )


# --- 1. One-hot encoded categorical features, per split ---
X_train_cat = one_hot_encode_split(X_train)
X_val_cat = one_hot_encode_split(X_val)
X_test_cat = one_hot_encode_split(X_test)

# --- 2. Scaled numeric features, per split ---
X_train_num = min_max_scale_split(X_train)
X_val_num = min_max_scale_split(X_val)
X_test_num = min_max_scale_split(X_test)

# --- 3. Final model inputs: numeric columns first, then the one-hot columns ---
X_train_prepared = pd.concat([X_train_num, X_train_cat], axis=1)
X_val_prepared = pd.concat([X_val_num, X_val_cat], axis=1)
X_test_prepared = pd.concat([X_test_num, X_test_cat], axis=1)


# Model

In [13]:
# # Define the MLP model
# mlp = MLPRegressor(
#     hidden_layer_sizes=(128, 64, 32),  # 3 hidden layers
#     activation='relu',
#     solver='adam',
#     learning_rate_init=0.005,
#     max_iter=1000,
#     early_stopping=True,
#     random_state=42,
#     verbose=True
# )
# mlp.fit(X_train_prepared, y_train)

In [14]:
# Filename prefix for the MLP trained on scaled, one-hot encoded features.
model_name = "mlp"
mlp_model_path = f"./models/{model_name}_multioutput_scoredpairs_scaled_onehotencoded.pkl"

# The model was trained and pickled once (the training cell above is
# commented out). To regenerate the file, re-enable that cell and run:
#     joblib.dump(mlp, mlp_model_path)

# Restore the persisted model. joblib.load unpickles the file, so only load
# model files from a trusted source.
mlp_loaded = joblib.load(mlp_model_path)

# Sanity check: score the restored model on the prepared validation features.
mlp_val_r2 = mlp_loaded.score(X_val_prepared, y_val)
print("Loaded model Val R^2:", mlp_val_r2)


Loaded model Val R^2: 0.9308998725764865


In [15]:
# Per-target validation metrics for the restored MLP, using the prepared features (no plots).
results_df = evaluate_model(model=mlp_loaded, X_val=X_val_prepared, y_val=y_val, plot=False)

                       Target        R2         MAE        RMSE       MAPE
0     torchserve_node_cpu_tgt  0.975790  108.554187  147.630252   4.557416
1  torchserve_node_energy_tgt  0.998808   93.330054  137.606355   2.371705
2   torchserve_node_power_tgt  0.997677    2.415919    3.154028   2.995235
3      torchserve_app_cpu_tgt  0.983586  103.212113  139.268672   5.720266
4   torchserve_app_energy_tgt  0.974583  133.752236  184.823212   9.460074
5    torchserve_app_power_tgt  0.966447    3.001484    4.075375  11.151755
6  torchserve_app_latency_tgt  0.611905   14.510211   17.914838  14.700600
7      torchserve_app_qps_tgt  0.938404    2.904102    3.600452   9.311910
8              Overall (mean)  0.930900   57.710038   79.759148   7.533620
9            Overall (global)  0.930900         NaN         NaN        NaN


In [16]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'hidden_layer_sizes': [(128,64,32), (256,128,64), (128,128,64,32)],
#     'activation': ['relu', 'tanh'],
#     'learning_rate_init': [0.001, 0.003, 0.005],
#     'alpha': [0.0001, 0.001, 0.01]
# }

# mlp_base = MLPRegressor(max_iter=1000, early_stopping=True, random_state=42)

# grid = GridSearchCV(mlp_base, param_grid, cv=3, scoring='r2', n_jobs=-1, verbose=2)
# grid.fit(X_train_prepared, y_train)

# print("Best parameters:", grid.best_params_)
# print("Best CV R2:", grid.best_score_)
