In [None]:
%pip install catboost
import joblib
import json
import numpy as np
import pandas as pd
import os
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
from itertools import combinations
from tensorflow.keras.losses import MeanSquaredError

In [None]:
def evaluate_ensemble(y_true, y_pred, solver_name, target, model_combo, weights):
    """Score one ensemble prediction, log the metrics and return them as a result row.

    Args:
        y_true: Ground-truth target values.
        y_pred: Ensemble predictions aligned with ``y_true``.
        solver_name: Solver label, echoed in the log line and the returned row.
        target: Name of the target being predicted.
        model_combo: Iterable of model-name strings; joined with "_" in the row.
        weights: Weights used to combine the models; stored as ``str(weights)``.

    Returns:
        dict with the keys ``Solver``, ``Target``, ``Models``, ``Weights``,
        ``MAE``, ``RMSE`` and ``R2``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is derived as the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f"\n[ENSEMBLE] Solver: {solver_name}, Target: {target}, Models: {model_combo}, Weights: {weights}")
    print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")

    # Identification columns first, metric columns last (same key order as before).
    summary = dict(Solver=solver_name, Target=target)
    summary["Models"] = "_".join(model_combo)
    summary["Weights"] = str(weights)
    summary.update(MAE=mae, RMSE=rmse, R2=r2)
    return summary

In [None]:
def load_model_for_combination(model_name, solver_name, target, X_test, X_test_cnn):
    """Load the saved model of one family for (solver, target) and predict on the test set.

    Args:
        model_name: One of "rf", "cb", "cnn", "mlp", "svm", "lr", "dt".
        solver_name: Solver label used in the saved-model file name.
        target: Target name used in the saved-model file name.
        X_test: Feature matrix passed to every model except the CNN.
        X_test_cnn: Feature array passed to the CNN model.

    Returns:
        ``(model_name, predictions)`` on success. ``(None, None)`` when the
        model name is not recognised, the model file does not exist, or
        loading/predicting raised (in which case the error is printed).
    """

    def _joblib_predict(artifact):
        # rf, svm, lr and dt are all joblib-serialised estimators.
        return joblib.load(artifact).predict(X_test)

    def _catboost_predict(artifact):
        regressor = CatBoostRegressor()
        regressor.load_model(artifact)
        return regressor.predict(X_test)

    def _keras_predict(artifact, inputs):
        # 'mse' is supplied as a custom object when deserialising the .h5 file.
        network = load_model(artifact, custom_objects={'mse': MeanSquaredError()})
        return network.predict(inputs).flatten()

    try:
        # model key -> (saved-model path, callable producing predictions from that path)
        recipes = {
            "rf": (f"./rf_reg/rf_models/rf_model_{solver_name}_{target}.joblib", _joblib_predict),
            "cb": (f"./cb_reg/cb_models/cb_model_{solver_name}_{target}.cbm", _catboost_predict),
            "cnn": (f"./cnn_reg/cnn_models/cnn_model_{solver_name}_{target}.h5",
                    lambda artifact: _keras_predict(artifact, X_test_cnn)),
            "mlp": (f"./mlp_reg/mlp_models/mlp_model_{solver_name}_{target}.h5",
                    lambda artifact: _keras_predict(artifact, X_test)),
            "svm": (f"./svm_reg/svm_models/svm_model_{solver_name}_{target}.joblib", _joblib_predict),
            "lr": (f"./lr_reg/lr_models/lr_model_{solver_name}_{target}.joblib", _joblib_predict),
            "dt": (f"./dt_reg/dt_models/dt_model_{solver_name}_{target}.joblib", _joblib_predict),
        }
        recipe = recipes.get(model_name)
        if recipe is not None:
            artifact, predictor = recipe
            if os.path.exists(artifact):
                return model_name, predictor(artifact)
    except Exception as e:
        print(f"Failed to load {model_name} for {solver_name}, {target}: {e}")
    # Unknown model name, missing file, or a failure reported above.
    return None, None


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def rank_models_by_performance(solver_name, test_file, target, X_test, X_test_cnn, y_test):
    """Rank every loadable model family by its test RMSE for one target.

    Args:
        solver_name: Solver label used to locate the saved models.
        test_file: Not used by this function; kept in the signature for callers.
        target: Target name used to locate the saved models and in the log line.
        X_test: Feature matrix for the non-CNN models.
        X_test_cnn: Feature array for the CNN model.
        y_test: Ground-truth values the predictions are scored against.

    Returns:
        ``(ranked_models, model_performance)``: model names sorted by ascending
        RMSE, and a dict mapping each evaluated model name to its RMSE. Models
        that could not be loaded appear in neither.
    """
    candidates = ("rf", "cb", "cnn", "mlp", "svm", "lr", "dt")
    model_performance = {}

    for model_name in candidates:
        _, pred = load_model_for_combination(model_name, solver_name, target, X_test, X_test_cnn)
        if pred is None:
            print(f"{model_name} not evaluated.")
            continue
        rmse = np.sqrt(mean_squared_error(y_test, pred))
        model_performance[model_name] = rmse
        print(f"{model_name} RMSE: {rmse:.4f}")

    # Lowest RMSE first.
    ranked_models = sorted(model_performance, key=lambda key: model_performance[key])
    print(f"Models ranked by RMSE for {target}: {ranked_models}")
    return ranked_models, model_performance

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import os

def run_top_k_ensemble_dynamic(solver_name, train_file, test_file,
                               out_path="output.csv", top_ks=(3, 5, 7)):
    """Evaluate equal-weight Top-K ensembles for one solver and append the metrics to a CSV.

    For each target, the available models are ranked by RMSE, the best K are
    averaged with equal weights, and the resulting metrics are collected. All
    rows for this solver are appended to ``out_path``.

    Args:
        solver_name: Solver label used to locate saved models and tag result rows.
        train_file: CSV used only to fit the feature and target scalers.
        test_file: CSV the ensembles are evaluated on; rows containing any
            missing value are dropped.
        out_path: CSV file the result rows are appended to. A header is written
            only when the file does not exist yet or is empty.
        top_ks: Ensemble sizes to evaluate.

    Returns:
        None. Results are written to ``out_path``.
    """
    top_ks = tuple(top_ks)

    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file).dropna()

    if df_test.empty:
        # StandardScaler.transform rejects an array with zero rows, so stop early.
        print(f"No complete test rows for {solver_name}; skipping.")
        return

    features = [
        "number_of_elements","capacity","max_weight","min_weight","mean_weight",
        "median_weight","std_weight","weight_range","max_profit","min_profit","mean_profit",
        "median_profit","std_profit","profit_range","renting_ratio","mean_weight_profit_ratio",
        "median_weight_profit_ratio","capacity_mean_weight_ratio","capacity_median_weight_ratio",
        "capacity_std_weight_ratio","std_weight_profit_ratio","weight_profit_correlation",
        "ram","cpu_cores"
    ]
    targets = ["solution_time","optimality_gap","peak_memory"]

    # Model families whose predictions are mapped back through the target scaler.
    # Presumably these were trained on standardized targets - verify against the
    # training notebooks.
    scaled_output_models = {"cnn","mlp","dt","rf","svm"}

    # Feature scaling is fitted on the training split and applied to the test split.
    feat_scaler = StandardScaler().fit(df_train[features])
    X_test      = feat_scaler.transform(df_test[features])
    # The CNN receives the same features with a trailing axis of length 1.
    X_test_cnn  = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    # One target scaler per target, fitted on the training split.
    y_scalers = {
        t: StandardScaler().fit(df_train[[t]].values)
        for t in targets
    }

    results = []
    max_k = max(top_ks) if top_ks else 0

    for target in targets:
        y_test = df_test[target].values

        # NOTE(review): the ranking scores raw model outputs against unscaled
        # y_test, whereas the ensemble below inverse-transforms the outputs of
        # `scaled_output_models` first. Confirm the ranking is meant to work
        # on raw outputs.
        ranked_models, _ = rank_models_by_performance(
            solver_name, test_file, target, X_test, X_test_cnn, y_test
        )

        # Load and predict each ranked model once per target; every K below
        # reuses these predictions instead of reloading the model from disk.
        # A None entry keeps the rank position of a model that failed to load.
        members = []
        for mname in ranked_models[:max_k]:
            name, pred = load_model_for_combination(
                mname, solver_name, target, X_test, X_test_cnn
            )
            if pred is None:
                members.append(None)
                continue
            if name in scaled_output_models:
                pred = y_scalers[target].inverse_transform(pred.reshape(-1,1)).flatten()
            members.append((name, pred))

        for k in top_ks:
            chosen = [member for member in members[:k] if member is not None]
            if not chosen:
                print(f"No models for Top-{k} on {solver_name}/{target}")
                continue

            names = tuple(name for name, _ in chosen)

            # Equal-weight average across the selected models.
            P = np.vstack([pred for _, pred in chosen])
            avg_pred = P.mean(axis=0)

            row = evaluate_ensemble(
                y_test, avg_pred, solver_name, target, names,
                weights=[1/len(P)]*len(P)
            )
            row["Top_K"] = k
            results.append(row)

    if not results:
        # Writing an empty frame would create a header-less file, and later
        # appends would then never get a header row.
        print(f"No ensemble results for {solver_name}; nothing written to {out_path}")
        return

    df_res = pd.DataFrame(results)

    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    write_header = (not os.path.exists(out_path)) or os.path.getsize(out_path) == 0
    df_res.to_csv(out_path, mode="a", index=False, header=write_header)


In [None]:
def run_ensembles_for_all_solvers(base_folder):
    """Run the Top-K ensemble evaluation for every solver folder under ``base_folder``.

    Every directory found while walking ``base_folder`` is treated as a solver
    folder named after the solver. A folder is processed when it contains both
    a ``*_train.csv`` and a ``*_test.csv`` file; the first match of each (in
    sorted order) is used.

    Args:
        base_folder: Root directory to walk. ``base_folder`` itself is not
            treated as a solver folder, only the directories below it.

    Returns:
        None.
    """
    if not os.path.isdir(base_folder):
        # os.walk yields nothing for a missing directory; say so instead of
        # silently doing no work.
        print(f"Base folder not found: {base_folder}")
        return

    for root, dirs, _files in os.walk(base_folder):
        for folder in dirs:
            folder_path = os.path.join(root, folder)
            # os.listdir returns entries in arbitrary order; sort so the file
            # picked below is deterministic when several match.
            csv_files = sorted(os.listdir(folder_path))

            train_files = [f for f in csv_files if f.endswith("_train.csv")]
            test_files = [f for f in csv_files if f.endswith("_test.csv")]

            if not test_files:
                continue
            if not train_files:
                # Previously this indexed an empty list and raised IndexError.
                print(f"\nSkipping {folder}: no *_train.csv next to {test_files[0]}")
                continue

            train_fp = os.path.join(folder_path, train_files[0])
            test_fp = os.path.join(folder_path, test_files[0])
            solver_name = folder

            print(f"\nRunning for: {solver_name}")
            run_top_k_ensemble_dynamic(solver_name, train_fp, test_fp)

In [None]:
# Root directory whose sub-folders (one per solver) are searched for *_train.csv / *_test.csv files.
base_folder = "./training_data"  # Path to the training data folder
run_ensembles_for_all_solvers(base_folder)