In [None]:
import os
import glob
import json
import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report, accuracy_score
from catboost import CatBoostClassifier
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError 

In [None]:
# Root folder that is walked for per-algorithm sub-folders of *_train/_val/_test CSV files.
BASE_DPI = "./training_data/algorithms"

# Root folder holding one "<solver>_bins" directory (with a single *_bins.json) per algorithm.
# NOTE(review): "trainingData" here vs "training_data" above — confirm both paths are intended.
BINS_BASE = "./trainingData/bins"

# Root folder the trained classifier models are read from and the ensemble CSVs are written to.
# NOTE(review): "outupt" looks like a typo of "output" — confirm against the folder on disk.
CLASS_BASE = "./outupt_folder"

# Ensemble sizes to evaluate: the top-K classifiers by validation macro-F1.
K_LIST = [3, 5, 7]

In [None]:
def _checkpoint_epoch(path):
    """Return the epoch number encoded in a ``..._<N>e.h5`` file name, or None.

    The suffix is removed by slicing rather than ``str.rstrip("e.h5")``:
    ``rstrip`` strips a *set of characters*, so it would also eat trailing
    5s of the epoch number ("25e.h5" -> "2", "5e.h5" -> "").
    """
    suffix = "e.h5"
    fname = os.path.basename(path)
    if not fname.endswith(suffix):
        return None
    digits = fname[:-len(suffix)].rsplit("_", 1)[-1]
    return int(digits) if digits.isdigit() else None


def _latest_checkpoint(pattern):
    """Return the file matching ``pattern`` with the highest epoch number, or None."""
    candidates = []
    for fpath in glob.glob(pattern):
        epoch = _checkpoint_epoch(fpath)
        # Files whose name carries no parseable epoch are ignored instead of
        # aborting the whole lookup.
        if epoch is not None:
            candidates.append((epoch, fpath))
    if not candidates:
        return None
    return max(candidates)[1]


def load_classifier_proba(name, solver, target, X, X_cnn):
    """Load one saved classifier and return its class probabilities.

    Parameters
    ----------
    name : str
        Model family key: "rf", "cb", "cnn", "mlp", "svm", "lr" or "dt".
    solver, target : str
        Used to build the model file name below ``CLASS_BASE``.
    X : array-like
        Feature matrix passed to every model except the CNN.
    X_cnn : array-like
        Feature matrix passed to the CNN model.

    Returns
    -------
    tuple
        ``(name, probabilities, model_path)`` on success.
        ``(name, None, None)`` when no usable "cnn"/"mlp" checkpoint matches.
        ``(None, None, None)`` when the model file is missing, ``name`` is
        unknown, or loading/predicting raised (the error is printed).
    """
    try:
        if name in ("rf", "svm", "lr", "dt"):
            # These four families share the same joblib layout and API.
            p = f"{CLASS_BASE}/{name}_class/{name}_classifier_models/{name}_{solver}_{target}.joblib"
            if os.path.exists(p):
                m = joblib.load(p)
                return name, m.predict_proba(X), p

        elif name == "cb":
            p = f"{CLASS_BASE}/cb_class/cb_classifier_models/cb_{solver}_{target}.cbm"
            if os.path.exists(p):
                m = CatBoostClassifier()
                m.load_model(p)
                return name, m.predict_proba(X), p

        elif name in ("cnn", "mlp"):
            pattern = (
                f"{CLASS_BASE}/{name}_class/{name}_classifier_models/"
                f"{name}_classifier_{solver}_{target}_*e.h5"
            )
            # Several checkpoints may exist; use the one trained for most epochs.
            fpath = _latest_checkpoint(pattern)
            if fpath is None:
                return name, None, None
            m = load_model(fpath, custom_objects={'mse': MeanSquaredError()})
            features = X_cnn if name == "cnn" else X
            return name, m.predict(features), fpath

    except Exception as e:
        # Best effort: a model that fails to load is reported and skipped by the caller.
        print(f"[ERROR] loading {name}: {e}")
    return None, None, None


In [None]:
def _values_to_bins(values, edges):
    """Convert raw target values into 0-based class indices using ``edges``.

    The last edge is left out of the digitize call and the result is clipped
    to ``[0, len(edges) - 2]``, so values below the first edge land in class 0
    and values at or above the second-to-last edge land in the final class.
    """
    return np.clip(np.digitize(values, edges[:-1], right=False) - 1, 0, len(edges) - 2)


def _soft_vote(probas):
    """Average probability matrices, zero-padding narrower ones to the widest class count."""
    n_cls = max(p.shape[1] for p in probas)
    padded = []
    for p in probas:
        missing = n_cls - p.shape[1]
        if missing > 0:
            p = np.concatenate([p, np.zeros((p.shape[0], missing))], axis=1)
        padded.append(p)
    return np.stack(padded, axis=0).mean(axis=0)


def ensemble_classifiers_for_solver(solver, train_file, test_file, val_file):
    """Evaluate top-K soft-voting ensembles of the saved classifiers for one solver.

    For each target ("solution_time", "optimality_gap", "peak_memory") every
    available classifier is scored on the validation split (macro F1), the
    classifiers are ranked by that score, and for each K in ``K_LIST`` the
    averaged probabilities of the top-K members are scored on the test split.
    The results are written to
    ``<CLASS_BASE>/ensembles_fl/<solver>_class_ensembles.csv``.

    Parameters
    ----------
    solver : str
        Solver name; selects the bins directory and the model files.
    train_file, test_file, val_file : str
        CSV paths. The training split is only used to fit the feature scaler.

    Raises
    ------
    AssertionError
        If the solver's bins directory does not contain exactly one
        ``*_bins.json`` file.
    """
    print(f"\n Solver: {solver} ")

    df_tr = pd.read_csv(train_file).dropna()
    df_va = pd.read_csv(val_file).dropna()
    df_te = pd.read_csv(test_file).dropna()

    bins_dir = os.path.join(BINS_BASE, f"{solver}_bins")
    binf = glob.glob(os.path.join(bins_dir, "*_bins.json"))
    assert len(binf)==1, "Need one JSON in "+bins_dir
    # Context manager so the JSON file handle is closed deterministically.
    with open(binf[0]) as fh:
        bin_edges = json.load(fh)

    # scale features
    feats = [
      "number_of_elements","capacity","max_weight","min_weight","mean_weight",
      "median_weight","std_weight","weight_range","max_profit","min_profit","mean_profit",
      "median_profit","std_profit","profit_range","renting_ratio","mean_weight_profit_ratio",
      "median_weight_profit_ratio","capacity_mean_weight_ratio","capacity_median_weight_ratio",
      "capacity_std_weight_ratio","std_weight_profit_ratio","weight_profit_correlation",
      "ram","cpu_cores"
    ]
    # The scaler is fitted on the training split only and reused for val/test.
    scaler = StandardScaler().fit(df_tr[feats])
    X_va = scaler.transform(df_va[feats])
    X_te = scaler.transform(df_te[feats])
    # The CNN input gets a trailing axis of size 1: (rows, features, 1).
    X_va_cnn = X_va.reshape((-1, X_va.shape[1], 1))
    X_te_cnn = X_te.reshape((-1, X_te.shape[1], 1))

    records = []
    for target in ["solution_time","optimality_gap","peak_memory"]:
        # Recover true classes
        edges = bin_edges[target]
        y_va_raw = _values_to_bins(df_va[target].values, edges)
        y_te_raw = _values_to_bins(df_te[target].values, edges)

        # Validation macro-F1 for every classifier that could be loaded.
        perf = {}
        for name in ["rf","cb","cnn","mlp","svm","lr","dt"]:
            nm, pr, _ = load_classifier_proba(name, solver, target, X_va, X_va_cnn)
            if pr is None:
                continue
            # Labels are clipped to the class range this model can output.
            y_va = np.clip(y_va_raw, 0, pr.shape[1] - 1)
            ypred = pr.argmax(axis=1)
            perf[nm] = f1_score(y_va, ypred, average="macro", zero_division=0)
            print(f"  {nm:<3} val-F1 = {perf[nm]:.3f}")

        if not perf:
            print(f"No classifiers for “{target}”, skip.")
            continue
        # Rank by desc F1
        ranked = sorted(perf, key=perf.get, reverse=True)

        # Test-set probabilities are cached so each model is loaded once per
        # target instead of once per K.
        test_proba = {}
        for K in K_LIST:
            chosen = ranked[:K]
            for m in chosen:
                if m not in test_proba:
                    test_proba[m] = load_classifier_proba(m, solver, target, X_te, X_te_cnn)[1]

            # Drop members whose test predictions could not be produced.
            members = [m for m in chosen if test_proba[m] is not None]
            if not members:
                print(f"No test predictions for Top-{K} “{target}”, skip.")
                continue

            P = _soft_vote([test_proba[m] for m in members])
            y_pred = P.argmax(axis=1)

            # Clip the true labels to the class range of *this ensemble*,
            # not to whichever classifier happened to be validated last.
            y_te = np.clip(y_te_raw, 0, P.shape[1] - 1)

            acc = accuracy_score(y_te, y_pred)
            f1 = f1_score(y_te, y_pred, average="macro", zero_division=0)
            print(f"Top-{K} ensemble test-F1 = {f1:.3f}")

            records.append({
                "solver": solver,
                "target": target,
                "Top_K": K,
                "members": ";".join(members),
                "test_accuracy": acc,
                "test_f1": f1
            })

    df_out = pd.DataFrame(records)
    out_fp = os.path.join(CLASS_BASE, "ensembles_fl", f"{solver}_class_ensembles.csv")
    os.makedirs(os.path.dirname(out_fp), exist_ok=True)
    df_out.to_csv(out_fp, index=False)
    

In [None]:
def run_all_models(base_folder):
    """Run the ensemble evaluation for every sub-folder holding a full data split.

    Every directory found below ``base_folder`` (at any depth) is inspected.
    When it contains at least one ``*_train.csv``, one ``*_test.csv`` and one
    ``*_val.csv`` entry, the first match of each is passed to
    ``ensemble_classifiers_for_solver`` with the directory name as the solver
    name. Directories missing any of the three are skipped silently.
    """
    required_suffixes = ("_train.csv", "_test.csv", "_val.csv")

    for parent, subdirs, _ in os.walk(base_folder):
        for solver_name in subdirs:
            solver_dir = os.path.join(parent, solver_name)
            entries = os.listdir(solver_dir)

            # First entry (in listing order) for each required suffix, or None.
            picks = [
                next((entry for entry in entries if entry.endswith(sfx)), None)
                for sfx in required_suffixes
            ]
            if None in picks:
                continue

            train_fp, test_fp, val_fp = (os.path.join(solver_dir, entry) for entry in picks)
            ensemble_classifiers_for_solver(solver_name, train_fp, test_fp, val_fp)

In [None]:
# Entry point: evaluate the classifier ensembles for every solver folder found under BASE_DPI.
run_all_models(BASE_DPI)