In [None]:
%pip install shap
%pip install catboost

from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import shap
import os
import json

In [None]:
def evaluate_classification(y_true, y_pred, results, solver_name, label):
    """Score predictions, print a one-line summary and record the metrics.

    Accuracy is computed directly; precision, recall and F1 are
    macro-averaged with ``zero_division=0``.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        results: List that is mutated in place; one metrics dict is appended
            per call.
        solver_name: Value stored under the ``"Solver"`` key.
        label: Prefix of the printed line and value stored under the
            ``"Dataset"`` key.

    Returns:
        None. The outcome is delivered through stdout and ``results``.
    """
    macro_kwargs = {"average": "macro", "zero_division": 0}

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **macro_kwargs)
    recall = recall_score(y_true, y_pred, **macro_kwargs)
    f1_macro = f1_score(y_true, y_pred, **macro_kwargs)

    print(
        "{} | Acc: {:.4f}, Prec: {:.4f}, Rec: {:.4f}, F1: {:.4f}".format(
            label, accuracy, precision, recall, f1_macro
        )
    )

    row = dict(Solver=solver_name, Dataset=label, Accuracy=accuracy,
               Precision=precision, Recall=recall)
    row["F1-Score"] = f1_macro  # key contains a hyphen, so it cannot be a keyword
    results.append(row)

In [None]:
import shap
import numpy as np
import pandas as pd
import os

def log_shap_and_importance_classification(model,X_val,y_val,pred_val,features,solver_name,target):
    """Write SHAP-based and built-in feature importances for one classifier.

    Three CSV outputs are produced under ``./cb_class``:

    * ``cb_shap_values/shap_<solver>_<target>_classification.csv`` - a single
      row holding the mean absolute SHAP value per feature (file is
      overwritten on every call).
    * ``cb_feature_importance/cb_feature_importance_classification.csv`` -
      ``model.feature_importances_`` for every feature (rows are appended).
    * ``cb_feature_importance/cb_top5_feature_importance_classification.csv``
      - the five largest built-in importances (rows are appended).

    Args:
        model: Fitted tree model accepted by ``shap.TreeExplainer`` and
            exposing ``feature_importances_``.
        X_val: Feature matrix the SHAP values are computed on.
        y_val: Not used by this function.
        pred_val: Not used by this function.
        features: Feature names, in the column order of ``X_val``.
        solver_name: Written to the ``solver`` column and used in file names.
        target: Written to the ``target`` column and used in file names.
    """

    def _append_csv(frame, path):
        # Shared files grow over calls; the header is emitted only when the
        # file does not exist yet.
        frame.to_csv(path, mode='a', index=False, header=not os.path.exists(path))

    raw_shap = shap.TreeExplainer(model).shap_values(X_val)

    # Bring the SHAP output into a 3-D array whose axis 1 is the feature axis
    # (axis 0 = samples, axis 2 = classes per the averaging below).
    if isinstance(raw_shap, list):
        # One 2-D array per list entry: stack the entries along a new last axis.
        shap_cube = np.stack(raw_shap, axis=-1)
    else:
        shap_cube = np.array(raw_shap)
        if shap_cube.ndim == 2:
            # Single 2-D output: add a trailing axis of length one.
            shap_cube = shap_cube[..., np.newaxis]
        elif shap_cube.ndim == 4 and shap_cube.shape[2] == 1:
            shap_cube = np.squeeze(shap_cube, axis=2)

    # Mean |SHAP| per feature, averaged over axes 0 and 2.
    mean_abs_shap = np.abs(shap_cube).mean(axis=(0, 2))

    # One row per solver/target with a column per feature.
    shap_row = pd.DataFrame(mean_abs_shap.reshape(1, -1), columns=features)
    shap_row["target"] = target
    shap_row["solver"] = solver_name
    os.makedirs("./cb_class/cb_shap_values", exist_ok=True)
    shap_path = f"./cb_class/cb_shap_values/shap_{solver_name}_{target}_classification.csv"
    shap_row.to_csv(shap_path, index=False)

    # Importances reported by the model itself.
    builtin_importance = pd.DataFrame(
        {"feature": features, "model_importance": model.feature_importances_}
    ).assign(target=target, solver=solver_name)
    os.makedirs("./cb_class/cb_feature_importance", exist_ok=True)
    _append_csv(
        builtin_importance,
        "./cb_class/cb_feature_importance/cb_feature_importance_classification.csv",
    )

    # Five highest built-in importances.
    _append_csv(
        builtin_importance.nlargest(5, "model_importance"),
        "./cb_class/cb_feature_importance/cb_top5_feature_importance_classification.csv",
    )

    print(f"SHAP & feature importances {solver_name}-{target}.")

In [None]:
def train_catboost_classifier_for_solver(solver_name, train_file, test_file, val_file):
    """Grid-search, evaluate and persist one CatBoost classifier per target.

    For each of ``solution_time``, ``optimality_gap`` and ``peak_memory`` the
    continuous target is discretised with the solver's precomputed bin edges,
    a small hyper-parameter grid is searched (selection by weighted F1 on the
    validation split), and the winning model is evaluated on the test and
    validation splits, saved as ``.cbm`` and handed to
    ``log_shap_and_importance_classification``.

    Args:
        solver_name: Solver identifier; used to locate
            ``<BINS_BASE_DIR>/<solver_name>_bins`` and in every output name.
        train_file: Path to the training CSV.
        test_file: Path to the test CSV.
        val_file: Path to the validation CSV.

    Raises:
        FileNotFoundError: If the solver's bins directory is missing or does
            not contain exactly one ``*_bins.json`` file.

    Notes:
        * Reads the module-level ``BINS_BASE_DIR``, which must be defined
          before this function is called.
        * Writes tuning logs, best-parameter JSON files and models under
          ``./cb_class`` and appends metrics to
          ``./catboost_evaluation_results_classification.csv``.
        * The ``Dataset`` column of the metrics file holds ``<target>_test``
          or ``<target>_val`` so that the two rows written per target can be
          told apart.
        * Nothing is appended to the metrics file when every target was
          skipped; writing an empty frame would create the file without a
          header row and leave all later appends header-less.
    """
    print(f"\nSolver: {solver_name}")
    # dropna() has no subset here, so a NaN in ANY column removes the row.
    df_train = pd.read_csv(train_file).dropna()
    df_test  = pd.read_csv(test_file).dropna()
    df_val   = pd.read_csv(val_file).dropna()

    features = [
        "number_of_elements", "capacity", "max_weight", "min_weight", "mean_weight",
        "median_weight", "std_weight", "weight_range", "max_profit", "min_profit",
        "mean_profit", "median_profit", "std_profit", "profit_range", "renting_ratio",
        "mean_weight_profit_ratio", "median_weight_profit_ratio",
        "capacity_mean_weight_ratio", "capacity_median_weight_ratio",
        "capacity_std_weight_ratio", "std_weight_profit_ratio",
        "weight_profit_correlation", "ram", "cpu_cores"
    ]
    target_cols = ["solution_time", "optimality_gap", "peak_memory"]

    # Load precomputed bin edges for the solver
    bins_dir = os.path.join(BINS_BASE_DIR, f"{solver_name}_bins")
    if not os.path.isdir(bins_dir):
        raise FileNotFoundError(f"No bins directory: {bins_dir}")
    json_files = [f for f in os.listdir(bins_dir) if f.endswith("_bins.json")]
    if len(json_files) != 1:
        raise FileNotFoundError(f"Expected one .json in {bins_dir}, found: {json_files}")
    bin_path = os.path.join(bins_dir, json_files[0])

    with open(bin_path, "r") as f:
        bin_edges_dict = json.load(f)

    # Standardise with statistics from the training split only.
    # NOTE(review): the fitted scaler is not persisted next to the saved
    # models, so anyone loading a .cbm must rebuild it from the training data.
    scaler = StandardScaler().fit(df_train[features])
    X_train = scaler.transform(df_train[features])
    X_test  = scaler.transform(df_test[features])
    X_val   = scaler.transform(df_val[features])

    # Output directories
    os.makedirs("./cb_class/catboost_tuning", exist_ok=True)
    os.makedirs("./cb_class/catboost_configs", exist_ok=True)
    os.makedirs("./cb_class/cb_classifier_models", exist_ok=True)

    def to_bins(arr, edges):
        # Labels run from 0 to len(edges)-2. The last edge is left out of the
        # digitize call so values at or above it fall into the top bin, and
        # values below the first edge are clipped into bin 0.
        labels = np.digitize(arr, edges[:-1], right=False) - 1
        return np.clip(labels, 0, len(edges)-2)

    # The grid does not depend on the target, so build it once.
    param_grid = {
        "depth": [4, 6],
        "learning_rate": [0.01, 0.1],
        "l2_leaf_reg": [3, 5],
    }

    results = []
    for target in target_cols:
        if target not in bin_edges_dict:
            print(f"  No bin edges for '{target}' in {bin_path}, skipping.")
            continue

        edges = bin_edges_dict[target]

        y_train = to_bins(df_train[target].values, edges)
        y_test  = to_bins(df_test [target].values, edges)
        y_val   = to_bins(df_val  [target].values, edges)

        # Cap test/val labels at the highest class present in training.
        max_train = y_train.max()
        y_test = np.clip(y_test,  0, max_train)
        y_val  = np.clip(y_val,   0, max_train)

        # Skip if only one class in train or unseen in val
        cls_train = np.unique(y_train)
        cls_val   = np.unique(y_val)
        if len(cls_train) < 2:
            print(f"  Skipping '{target}': only one class in training {cls_train}")
            continue
        if not set(cls_val).issubset(set(cls_train)):
            print(f"  Skipping '{target}': val classes {cls_val} not in train {cls_train}")
            continue

        best_f1 = -np.inf
        tuning_logs = []
        best_model = None
        best_params = None
        best_pred_val = None
        best_pred_test = None

        for params in ParameterGrid(param_grid):
            model = CatBoostClassifier(
                **params,
                iterations=1000,
                early_stopping_rounds=50,
                verbose=0,
                random_seed=42
            )
            model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)
            # predict() may return a column vector for multiclass models;
            # flatten so the metrics always receive 1-D label arrays.
            pred_val = np.ravel(model.predict(X_val))
            # zero_division=0 matches evaluate_classification and yields the
            # same score as the default without emitting warnings.
            f1 = f1_score(y_val, pred_val, average='weighted', zero_division=0)
            tuning_logs.append({**params, "f1_score": f1})

            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                best_pred_val = pred_val
                best_pred_test = np.ravel(model.predict(X_test))
                best_params = params

        if best_model is None:
            print(f"  No valid CatBoostClassifier found for '{target}'")
            continue

        pd.DataFrame(tuning_logs).to_csv(
            f"./cb_class/catboost_tuning/tuning_{solver_name}_{target}.csv",
            index=False
        )
        with open(f"./cb_class/catboost_configs/best_{solver_name}_{target}.json", "w") as f:
            json.dump(best_params, f, indent=4)

        # Tag each row with its split; with the bare target name the test and
        # validation rows were indistinguishable in the results file.
        print(f"[{target} → Test]")
        evaluate_classification(y_test, best_pred_test, results, solver_name, f"{target}_test")
        print(f"[{target} → Val]")
        evaluate_classification(y_val, best_pred_val, results, solver_name, f"{target}_val")

        # Save the classifier
        model_path = f"./cb_class/cb_classifier_models/cb_{solver_name}_{target}.cbm"
        best_model.save_model(model_path)

        # SHAP & feature importance
        log_shap_and_importance_classification(
            best_model, X_val, y_val, best_pred_val, features, solver_name, target
        )

    results_file = "./catboost_evaluation_results_classification.csv"
    if results:
        pd.DataFrame(results).to_csv(
            results_file,
            mode='a',
            index=False,
            header=not os.path.exists(results_file)
        )
    else:
        # Appending an empty frame would create the file without a header and
        # suppress the header for every solver processed afterwards.
        print(f"  No evaluation results to save for solver '{solver_name}'")

In [None]:
def run_all_models(base_folder):
    """Train classifiers for every solver folder found below ``base_folder``.

    Every directory at any depth under ``base_folder`` is inspected. A
    directory qualifies when it contains at least one file ending in
    ``_train.csv``, one ending in ``_test.csv`` and one ending in
    ``_val.csv``; its base name is used as the solver name and the three
    paths are passed to ``train_catboost_classifier_for_solver``.

    Directories and file listings are sorted because ``os.walk`` and
    ``os.listdir`` return entries in arbitrary order. Sorting makes the
    processing order (and hence the row order of the appended result files)
    reproducible, and makes the choice deterministic when several files share
    a suffix: the first one in sorted order is used.

    Args:
        base_folder: Root directory to search. ``base_folder`` itself is not
            treated as a solver folder, only its descendants are.
    """
    for root, dirs, _files in os.walk(base_folder):
        # In-place sort: fixes the iteration order below and, because the walk
        # is top-down, the order in which os.walk descends.
        dirs.sort()
        for folder in dirs:
            folder_path = os.path.join(root, folder)
            entries = sorted(os.listdir(folder_path))

            train_files = [f for f in entries if f.endswith("_train.csv")]
            test_files = [f for f in entries if f.endswith("_test.csv")]
            val_files = [f for f in entries if f.endswith("_val.csv")]

            # Folders missing any of the three splits are silently skipped.
            if not (train_files and test_files and val_files):
                continue

            train_fp = os.path.join(folder_path, train_files[0])
            test_fp = os.path.join(folder_path, test_files[0])
            val_fp = os.path.join(folder_path, val_files[0])

            solver_name = folder
            train_catboost_classifier_for_solver(solver_name, train_fp, test_fp, val_fp)

In [None]:
# Configuration and entry point. Both paths are relative to the notebook's
# working directory.
# base_folder is searched by run_all_models for per-solver sub-folders.
base_folder = "Dataset/maximization/training_data/algorithms" # Path where the training data (train/test/val files) is saved
# BINS_BASE_DIR is read as a global by train_catboost_classifier_for_solver,
# so it must be assigned before run_all_models is called.
BINS_BASE_DIR = "Dataset/maximization/training_data/bins" # Path where the bins are saved
# Train and evaluate classifiers for every solver folder found.
run_all_models(base_folder)