In [1]:
from sklearn.ensemble import RandomForestRegressor
import shap
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from sklearn.model_selection import ParameterGrid
import json

In [2]:
def evaluate(y_true, y_pred, results, solver_name, label):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    rel_rmse = rmse / np.mean(y_true)

    safe_y_true = np.where(y_true == 0, 1e-8, y_true)
    mape = np.mean(np.abs((y_true - y_pred) / safe_y_true)) * 100

    result = {
        "Solver": solver_name,
        "Dataset": label,
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2,
        "Rel_RMSE": rel_rmse,
        "MAPE (%)": mape,
    }
    results.append(result)

In [None]:
def log_rf_shap_and_importance(model, X_val, y_val, pred_val, features, solver_name, target):
   
    background = shap.sample(X_val, min(100, X_val.shape[0]), random_state=42)

    try:
        explainer = shap.KernelExplainer(model.predict, background)
        shap_values = explainer.shap_values(X_val, nsamples=100)

        shap_df = pd.DataFrame(shap_values, columns=features)
        shap_df["predicted_value"] = pred_val
        shap_df["actual_value"] = y_val
        shap_df["target"] = target
        shap_df["solver"] = solver_name

        os.makedirs("./rf/rf_shap_values", exist_ok=True)
        shap_filename = f"./rf/rf_shap_values/shap_{solver_name}_{target}_reg.csv"
        shap_df.to_csv(shap_filename, index=False)

        # Feature importance 
        importance_df = pd.DataFrame({
            "feature": features,
            "shap_importance": np.abs(shap_values).mean(axis=0),
            "model_importance": model.feature_importances_,
            "target": target,
            "solver": solver_name
        })

        os.makedirs("./rf/rf_feature_importance", exist_ok=True)
        importance_file = "./rf/rf_feature_importance/rf_feature_importance_reg.csv"
        importance_df.to_csv(importance_file, mode='a', index=False, header=not os.path.exists(importance_file))

        #top-5
        top5_file = "./rf/rf_feature_importance/rf_top5_feature_importance_reg.csv"
        importance_df.sort_values(by="shap_importance", ascending=False).head(5).to_csv(
            top5_file, mode='a', index=False, header=not os.path.exists(top5_file)
        )

    except Exception as e:
        print(f"SHAP failed for {solver_name}-{target}: {e}")

In [None]:
def train_rf_for_solver(solver_name, train_file, test_file, val_file):
    print(f"\nSolver: {solver_name}")
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)
    df_val = pd.read_csv(val_file)

    df_train.dropna(inplace=True)
    df_test.dropna(inplace=True)
    df_val.dropna(inplace=True)

    target_cols = ["solution_time", "optimality_gap", "peak_memory"]
    features = [
        "number_of_elements", "capacity", "max_weight", "min_weight", "mean_weight",
        "median_weight", "std_weight", "weight_range", "max_profit", "min_profit", "mean_profit",
        "median_profit", "std_profit", "profit_range", "renting_ratio", "mean_weight_profit_ratio",
        "median_weight_profit_ratio", "capacity_mean_weight_ratio", "capacity_median_weight_ratio",
        "capacity_std_weight_ratio", "std_weight_profit_ratio", "weight_profit_correlation",
        "ram", "cpu_cores"
    ]

    for df in [df_train, df_test, df_val]:
        for col in target_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df.dropna(subset=[col], inplace=True)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(df_train[features])
    X_test = scaler.transform(df_test[features])
    X_val = scaler.transform(df_val[features])

    os.makedirs("./rf_tuning", exist_ok=True)
    os.makedirs("./rf_configs", exist_ok=True)
    results = []

    for target in target_cols:
        y_train = df_train[target].values
        y_test = df_test[target].values
        y_val = df_val[target].values

        y_scaler = StandardScaler()
        y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
        y_test_scaled = y_scaler.transform(y_test.reshape(-1, 1)).flatten()
        y_val_scaled = y_scaler.transform(y_val.reshape(-1, 1)).flatten()

        param_grid = {
            "n_estimators": [100, 200, 500],
            "max_depth": [10, 20]
        }

        best_rmse = float("inf")
        best_model = None
        best_config = None
        tuning_logs = []

        for params in ParameterGrid(param_grid):
            model = RandomForestRegressor(**params, random_state=42)
            model.fit(X_train, y_train_scaled)

            pred_val_scaled = model.predict(X_val)
            pred_val = y_scaler.inverse_transform(pred_val_scaled.reshape(-1, 1)).flatten()
            y_val_orig = y_scaler.inverse_transform(y_val_scaled.reshape(-1, 1)).flatten()
            rmse = np.sqrt(mean_squared_error(y_val_orig, pred_val))

            tuning_logs.append({**params, "rmse": rmse})

            if rmse < best_rmse:
                best_rmse = rmse
                best_model = model
                best_config = params
                best_y_test = y_scaler.inverse_transform(y_test_scaled.reshape(-1, 1)).flatten()
                best_y_val = y_val_orig
                best_pred_test = y_scaler.inverse_transform(model.predict(X_test).reshape(-1, 1)).flatten()
                best_pred_val = pred_val

        tuning_df = pd.DataFrame(tuning_logs)
        tuning_df.to_csv(
            f"./rf_tuning/tuning_rf_{solver_name}_{target}_reg.csv", index=False
        )
        with open(f"./rf_configs/best_rf_{solver_name}_{target}_reg.json", "w") as f:
            json.dump(best_config, f, indent=4)

#         print(f"\nTarget: {target.upper()} | Best Params: {best_config}")
        print("[TEST]")
        evaluate(best_y_test, best_pred_test, results, solver_name, f"{target} (Test)")
        print("[VAL]")
        evaluate(best_y_val, best_pred_val, results, solver_name, f"{target} (Val)")
        
        os.makedirs("./rf_models", exist_ok=True)
        model_path = f"./rf_models/rf_model_{solver_name}_{target}.joblib"
        joblib.dump(best_model, model_path)
        print(f"Saved model to {model_path}")

        log_rf_shap_and_importance(best_model, X_val, best_y_val, best_pred_val, features, solver_name, target)

    results_df = pd.DataFrame(results)
    results_file = "./rf_evaluation_results_reg.csv"
    results_df.to_csv(results_file, mode='a', index=False, header=not os.path.exists(results_file))

In [8]:
def run_all_models(base_folder):
    for root, dirs, files in os.walk(base_folder):
        for folder in dirs:
            folder_path = os.path.join(root, folder)
            csv_files = os.listdir(folder_path)

            train_file = [f for f in csv_files if f.endswith("_train.csv")]
            test_file = [f for f in csv_files if f.endswith("_test.csv")]
            val_file = [f for f in csv_files if f.endswith("_val.csv")]

            if train_file and test_file and val_file:
                train_fp = os.path.join(folder_path, train_file[0])
                test_fp = os.path.join(folder_path, test_file[0])
                val_fp = os.path.join(folder_path, val_file[0])

                solver_name = folder
                train_rf_for_solver(solver_name, train_fp, test_fp, val_fp)

In [None]:
base_folder = "./trainingData/final_td_min" #Specify path to training data
run_all_models(base_folder)


Solver: or_min
[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_or_min_solution_time.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_or_min_optimality_gap.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_or_min_peak_memory.joblib


  0%|          | 0/560 [00:00<?, ?it/s]


Solver: gurobi_min
[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_gurobi_min_solution_time.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_gurobi_min_optimality_gap.joblib


  rel_rmse = rmse / np.mean(y_true)
  rel_rmse = rmse / np.mean(y_true)


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_gurobi_min_peak_memory.joblib


  0%|          | 0/560 [00:00<?, ?it/s]


Solver: greedy_min
[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_greedy_min_solution_time.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_greedy_min_optimality_gap.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_greedy_min_peak_memory.joblib


  0%|          | 0/560 [00:00<?, ?it/s]


Solver: ga_min
[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_ga_min_solution_time.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_ga_min_optimality_gap.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_ga_min_peak_memory.joblib


  0%|          | 0/560 [00:00<?, ?it/s]


Solver: dp_min
[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_dp_min_solution_time.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_dp_min_optimality_gap.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_dp_min_peak_memory.joblib


  0%|          | 0/560 [00:00<?, ?it/s]


Solver: bb_min
[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_bb_min_solution_time.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_bb_min_optimality_gap.joblib


  0%|          | 0/560 [00:00<?, ?it/s]

[TEST]
[VAL]
Saved Random Forest model to ./results_min_kp/rf_models/rf_model_bb_min_peak_memory.joblib


  0%|          | 0/560 [00:00<?, ?it/s]