In [1]:
%pip install shap

from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import shap
import os
import json
import joblib

Note: you may need to restart the kernel to use updated packages.


In [2]:
def evaluate_classification(y_true, y_pred, results, solver_name, label):
    """Score one set of predictions, print the scores and record them.

    Accuracy plus macro-averaged precision, recall and F1 are computed
    (undefined per-class scores count as 0), printed on a single line prefixed
    with ``label``, and appended to ``results`` as a dict.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        results: List that receives the metrics record; mutated in place.
        solver_name: Value stored under the ``"Solver"`` key.
        label: Value stored under the ``"Dataset"`` key and used as print prefix.
    """
    # The three class-averaged metrics share the same averaging settings.
    macro_kwargs = {"average": "macro", "zero_division": 0}

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **macro_kwargs)
    recall = recall_score(y_true, y_pred, **macro_kwargs)
    f1_macro = f1_score(y_true, y_pred, **macro_kwargs)

    print(f"{label} | Acc: {accuracy:.4f}, Prec: {precision:.4f}, Rec: {recall:.4f}, F1: {f1_macro:.4f}")

    record = {
        "Solver": solver_name,
        "Dataset": label,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1_macro,
    }
    results.append(record)

In [None]:
import shap
import numpy as np
import pandas as pd
import os

def log_shap_and_importance_classification(model, X_val, y_val, pred_val, features, solver_name, target):
    """Write SHAP-based and model-native feature importances for one classifier to CSV.

    Three files under ``./dt_class`` are touched:

    * ``dt_shap_values/shap_<solver>_<target>_classification.csv`` -- a single
      row holding the mean absolute SHAP value of each feature (rewritten on
      every call);
    * ``dt_feature_importance/dt_feature_importance_classification.csv`` -- the
      model's ``feature_importances_`` for every feature (appended);
    * ``dt_feature_importance/dt_top5_feature_importance_classification.csv`` --
      the five largest of those importances (appended).

    Args:
        model: Fitted model passed to ``shap.TreeExplainer``; must expose
            ``feature_importances_``.
        X_val: Feature matrix the SHAP values are computed on.
        y_val: Not used by this function.
        pred_val: Not used by this function.
        features: Feature names used as column labels (expected to follow the
            column order of ``X_val``).
        solver_name: Solver identifier stored in the outputs and the SHAP file name.
        target: Target identifier stored in the outputs and the SHAP file name.
    """
    raw_shap = shap.TreeExplainer(model).shap_values(X_val)

    # Bring whatever SHAP returned into a single array intended to be laid out
    # as (n_samples, n_features, n_classes).
    if isinstance(raw_shap, list):
        # List input: stack the elements along a new trailing axis.
        shap_cube = np.stack(raw_shap, axis=-1)
    else:
        shap_cube = np.array(raw_shap)
        if shap_cube.ndim == 2:
            # Single 2-D array: add a trailing axis of length one.
            shap_cube = shap_cube[..., np.newaxis]
        elif shap_cube.ndim == 4 and shap_cube.shape[2] == 1:
            # Drop a singleton third axis so the array becomes 3-D.
            shap_cube = shap_cube.squeeze(2)

    # Average |SHAP| over the first and last axes, leaving one value per feature.
    mean_abs_shap = np.abs(shap_cube).mean(axis=(0, 2))

    shap_dir = "./dt_class/dt_shap_values"
    shap_row = pd.DataFrame(mean_abs_shap.reshape(1, -1), columns=features).assign(
        target=target,
        solver=solver_name,
    )
    os.makedirs(shap_dir, exist_ok=True)
    shap_row.to_csv(f"{shap_dir}/shap_{solver_name}_{target}_classification.csv", index=False)

    def _append_csv(frame, path):
        # Emit the header row only when this write is the one creating the file.
        frame.to_csv(path, mode='a', index=False, header=not os.path.exists(path))

    importance_dir = "./dt_class/dt_feature_importance"
    model_importance = pd.DataFrame({
        "feature": features,
        "model_importance": model.feature_importances_,
        "target": target,
        "solver": solver_name,
    })
    os.makedirs(importance_dir, exist_ok=True)
    _append_csv(model_importance, f"{importance_dir}/dt_feature_importance_classification.csv")

    # Top-5 features by the model's own importance score.
    strongest = model_importance.nlargest(5, "model_importance")
    _append_csv(strongest, f"{importance_dir}/dt_top5_feature_importance_classification.csv")


In [None]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import joblib



def _to_bin_labels(values, edges):
    """Map continuous values to 0-based bin indices using precomputed bin edges.

    The last edge is dropped before digitising, so every value at or above the
    second-to-last edge falls into the final bin; values below the first edge
    are clipped into bin 0. Labels therefore lie in ``[0, len(edges) - 2]``.
    """
    labels = np.digitize(values, edges[:-1], right=False) - 1
    return np.clip(labels, 0, len(edges) - 2)


def train_dt_classifier_for_solver(solver_name, train_file, test_file, val_file):
    """Tune, evaluate and persist one decision-tree classifier per target for a solver.

    For each of ``solution_time``, ``optimality_gap`` and ``peak_memory`` the
    continuous target is discretised with the solver's precomputed bin edges, a
    small grid of ``DecisionTreeClassifier`` settings is scored by weighted F1
    on the validation split, and the best model is evaluated on the test and
    validation splits, saved, and passed to
    ``log_shap_and_importance_classification``.

    The module-level name ``BINS_BASE_DIR`` is read at call time and must be
    defined before this function runs.

    Files written under ``./dt_class``:
        * ``dt_scalers/scaler_<solver>.joblib`` -- the fitted feature scaler;
        * ``dt_tuning/tuning_dt_<solver>_<target>.csv`` -- grid-search log;
        * ``dt_configs/best_dt_<solver>_<target>.json`` -- winning parameters;
        * ``dt_classifier_models/dt_<solver>_<target>.joblib`` -- winning model;
        * ``dt_evaluation_results_classification.csv`` -- appended metrics, two
          rows per target whose ``Dataset`` value is ``<target>_test`` or
          ``<target>_val``.

    Args:
        solver_name: Solver identifier; selects ``<solver_name>_bins`` inside
            ``BINS_BASE_DIR`` and is embedded in every output file name.
        train_file: CSV path of the training split.
        test_file: CSV path of the test split.
        val_file: CSV path of the validation split.

    Raises:
        FileNotFoundError: If the solver's bins directory is missing or does not
            contain exactly one ``*_bins.json`` file.
    """
    print(f"\nSolver: {solver_name}")

    # Rows with a missing value in any column are discarded.
    df_train = pd.read_csv(train_file).dropna()
    df_test = pd.read_csv(test_file).dropna()
    df_val = pd.read_csv(val_file).dropna()

    features = [
        "number_of_elements", "capacity", "max_weight", "min_weight", "mean_weight",
        "median_weight", "std_weight", "weight_range", "max_profit", "min_profit", "mean_profit",
        "median_profit", "std_profit", "profit_range", "renting_ratio", "mean_weight_profit_ratio",
        "median_weight_profit_ratio", "capacity_mean_weight_ratio", "capacity_median_weight_ratio",
        "capacity_std_weight_ratio", "std_weight_profit_ratio", "weight_profit_correlation",
        "ram", "cpu_cores"
    ]
    target_cols = ["solution_time", "optimality_gap", "peak_memory"]

    # Load precomputed bin edges
    bins_dir = os.path.join(BINS_BASE_DIR, f"{solver_name}_bins")
    if not os.path.isdir(bins_dir):
        raise FileNotFoundError(f"No bins directory: {bins_dir}")
    json_files = [f for f in os.listdir(bins_dir) if f.endswith("_bins.json")]
    if len(json_files) != 1:
        raise FileNotFoundError(f"Expected one json in {bins_dir}, found: {json_files}")
    bin_path = os.path.join(bins_dir, json_files[0])
    with open(bin_path, "r") as f:
        bin_edges_dict = json.load(f)

    # The scaler is fitted on the training split only and reused for the others.
    scaler = StandardScaler().fit(df_train[features])
    X_train = scaler.transform(df_train[features])
    X_test = scaler.transform(df_test[features])
    X_val = scaler.transform(df_val[features])

    base_out = "./dt_class"
    os.makedirs(f"{base_out}/dt_tuning", exist_ok=True)
    os.makedirs(f"{base_out}/dt_configs", exist_ok=True)
    os.makedirs(f"{base_out}/dt_classifier_models", exist_ok=True)
    os.makedirs(f"{base_out}/dt_scalers", exist_ok=True)

    # The saved trees split on scaled feature values, so the scaler has to be
    # persisted alongside them for the models to be usable later. It goes into
    # its own folder so the models folder keeps containing classifiers only.
    joblib.dump(scaler, f"{base_out}/dt_scalers/scaler_{solver_name}.joblib")

    results = []
    for target in target_cols:
        if target not in bin_edges_dict:
            print(f" No bin for '{target}', skipping.")
            continue
        edges = bin_edges_dict[target]

        y_train = _to_bin_labels(df_train[target].values, edges)
        y_test = _to_bin_labels(df_test[target].values, edges)
        y_val = _to_bin_labels(df_val[target].values, edges)

        # Labels above the highest class seen in training are folded into it.
        max_train = y_train.max()
        y_test = np.clip(y_test, 0, max_train)
        y_val = np.clip(y_val, 0, max_train)

        # Skip if only one class in train
        cls_train = np.unique(y_train)
        if len(cls_train) < 2:
            print(f"  Skipping '{target}': only one class in train {cls_train}")
            continue

        # Hyperparameter tune
        param_grid = {
            "max_depth": [5, 10, 20, None],
            "min_samples_split": [2, 5, 10]
        }
        best_f1 = -np.inf
        best_model = None
        best_params = None
        best_pred_val = None
        tuning_logs = []

        for params in ParameterGrid(param_grid):
            model = DecisionTreeClassifier(**params, random_state=42)
            model.fit(X_train, y_train)
            pred_val = model.predict(X_val)
            # zero_division=0 matches evaluate_classification and avoids a
            # warning when a class is never predicted.
            f1 = f1_score(y_val, pred_val, average='weighted', zero_division=0)
            tuning_logs.append({**params, "f1_score": f1})

            # Strict comparison: on ties the earliest candidate is kept.
            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                best_params = params
                best_pred_val = pred_val

        if best_model is None:
            print(f" No valid model for '{target}'")
            continue

        # Test predictions are only needed for the winning model.
        best_pred_test = best_model.predict(X_test)

        # Save tuning logs & config
        pd.DataFrame(tuning_logs).to_csv(
            f"{base_out}/dt_tuning/tuning_dt_{solver_name}_{target}.csv", index=False
        )
        with open(f"{base_out}/dt_configs/best_dt_{solver_name}_{target}.json", "w") as f:
            json.dump(best_params, f, indent=4)

        # Evaluate. The split is encoded in the label so that the two rows
        # recorded for this target can be told apart in the results CSV.
        print(f"[{target}  Test]")
        evaluate_classification(y_test, best_pred_test, results,
                                solver_name, f"{target}_test")
        print(f"[{target} Val]")
        evaluate_classification(y_val, best_pred_val, results,
                                solver_name, f"{target}_val")

        # Save the classifier
        model_path = f"{base_out}/dt_classifier_models/dt_{solver_name}_{target}.joblib"
        joblib.dump(best_model, model_path)
        print(f" Saved model -> {model_path}")

        # SHAP & feature importance
        log_shap_and_importance_classification(
            best_model, X_val, y_val, best_pred_val,
            features, solver_name, target
        )

    if not results:
        # Appending an empty frame would create the results file without a real
        # header, and later appends would then never write one.
        print(f" No results to record for solver '{solver_name}'.")
        return

    results_file = f"{base_out}/dt_evaluation_results_classification.csv"
    pd.DataFrame(results).to_csv(
        results_file,
        mode='a', index=False,
        header=not os.path.exists(results_file)
    )

In [5]:
def run_all_models(base_folder):
    """Train classifiers for every directory under ``base_folder`` holding a full data split.

    The tree below ``base_folder`` is walked recursively. Each directory that
    contains at least one ``*_train.csv``, one ``*_test.csv`` and one
    ``*_val.csv`` file is treated as a solver: the directory name becomes the
    solver name and the three files are handed to
    ``train_dt_classifier_for_solver``. Directories missing any of the three
    files are ignored.

    Directories are visited, and candidate files chosen, in sorted name order so
    that repeated runs process solvers in the same sequence and pick the same
    file when several match a suffix.

    Args:
        base_folder: Root directory to scan. A non-existent path yields no work.
    """
    split_suffixes = ("_train.csv", "_test.csv", "_val.csv")

    for root, dirs, _files in os.walk(base_folder):
        # os.walk reports entries in arbitrary order; sorting in place also
        # fixes the order in which the walk descends into sub-directories.
        dirs.sort()
        for folder in dirs:
            folder_path = os.path.join(root, folder)
            entries = sorted(os.listdir(folder_path))

            # First (alphabetically) file for each split, or None when absent.
            chosen = [
                next((name for name in entries if name.endswith(suffix)), None)
                for suffix in split_suffixes
            ]
            if None in chosen:
                continue

            train_fp, test_fp, val_fp = (os.path.join(folder_path, name) for name in chosen)
            solver_name = folder
            train_dt_classifier_for_solver(solver_name, train_fp, test_fp, val_fp)

In [None]:
# Entry point of the notebook: set the input locations, then train for every solver folder.
# base_folder is the root that run_all_models walks looking for *_train/_test/_val CSV files.
base_folder = "./trainingData/final_td_min/td_models" # path to the training data
# BINS_BASE_DIR is read as a module-level global by train_dt_classifier_for_solver,
# so it must be assigned before run_all_models is called.
BINS_BASE_DIR = "./trainingData/final_td_min/td_bindata" # path to the per-solver "<solver>_bins" folders
run_all_models(base_folder)


Solver: or_min
[solution_time → Test]
solution_time | Acc: 0.9750, Prec: 0.5000, Rec: 0.4875, F1: 0.4937
[solution_time → Val]
solution_time | Acc: 1.0000, Prec: 1.0000, Rec: 1.0000, F1: 1.0000
  • saved model → ./binres_min_kp/dt_class/dt_classifier_models/dt_or_min_solution_time.joblib
SHAP & feature importances saved for or_min-solution_time.
[optimality_gap → Test]
optimality_gap | Acc: 1.0000, Prec: 1.0000, Rec: 1.0000, F1: 1.0000
[optimality_gap → Val]
optimality_gap | Acc: 0.9500, Prec: 0.7368, Rec: 0.7368, F1: 0.7368
  • saved model → ./binres_min_kp/dt_class/dt_classifier_models/dt_or_min_optimality_gap.joblib
SHAP & feature importances saved for or_min-optimality_gap.
[peak_memory → Test]
peak_memory | Acc: 0.8804, Prec: 0.5054, Rec: 0.5548, F1: 0.5261
[peak_memory → Val]
peak_memory | Acc: 0.9482, Prec: 0.5362, Rec: 0.5725, F1: 0.5526
  • saved model → ./binres_min_kp/dt_class/dt_classifier_models/dt_or_min_peak_memory.joblib
SHAP & feature importances saved for or_min-peak