In [1]:
%pip install shap

from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
import shap
import os
import json
import joblib

Note: you may need to restart the kernel to use updated packages.


In [2]:
def evaluate_classification(y_true, y_pred, results, solver_name, label):
    """Score one set of predictions, print a summary line and record it.

    Accuracy plus macro-averaged precision, recall and F1 are computed
    (``zero_division=0`` so classes without predictions score 0 instead of
    warning). The metrics are printed and appended to ``results`` as one dict.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        results: List that is mutated in place; one row dict is appended.
        solver_name: Value stored under the ``"Solver"`` key.
        label: Value stored under the ``"Dataset"`` key and used as the
            prefix of the printed line.

    Returns:
        None.
    """
    macro_kwargs = {"average": "macro", "zero_division": 0}

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **macro_kwargs)
    recall = recall_score(y_true, y_pred, **macro_kwargs)
    f1_macro = f1_score(y_true, y_pred, **macro_kwargs)

    summary = (
        f"{label} | Acc: {accuracy:.4f}, Prec: {precision:.4f}, "
        f"Rec: {recall:.4f}, F1: {f1_macro:.4f}"
    )
    print(summary)

    row = {
        "Solver": solver_name,
        "Dataset": label,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1_macro,
    }
    results.append(row)

In [None]:
import os
import numpy as np
import pandas as pd
import shap

def log_shap_and_importance_classification(model, X_val, y_val, pred_val, features, solver_name, target):
    """Compute SHAP values for a fitted linear classifier and persist them.

    Three CSV artefacts are written below ``./lr_class``:

    * ``lr_shap_values/shap_<solver>_<target>_classification.csv`` - one row
      per validation sample with a SHAP value per feature plus the predicted
      label, the actual label, the target and the solver name.
    * ``lr_feature_importance/lr_feature_importance_classification.csv`` -
      appended; one row per feature with its mean absolute SHAP value.
    * ``lr_feature_importance/lr_top5_feature_importance_classification.csv``
      - appended; the five most important features of this run.

    Args:
        model: Fitted model handed to ``shap.LinearExplainer``.
        X_val: 2-D feature matrix to explain; rows are also sampled
            (at most 100) as SHAP background data.
        y_val: Actual labels, one per row of ``X_val``.
        pred_val: Predicted labels, one per row of ``X_val``.
        features: Column names, one per column of ``X_val``.
        solver_name: Solver identifier used in file names and CSV rows.
        target: Target identifier used in file names and CSV rows.

    Returns:
        None. If SHAP computation or shape validation fails, the failure is
        printed and nothing is written (best effort, so that one failing
        explanation does not abort the caller).
    """
    background = shap.sample(X_val, min(100, X_val.shape[0]), random_state=42)

    try:
        # LinearExplainer for logistic regression
        explainer = shap.LinearExplainer(model, background, feature_perturbation="interventional")
        shap_values = explainer.shap_values(X_val)

        # Multi-class output is either a list of per-class (n, features)
        # arrays or a single 3-D array; normalise to one ndarray whose last
        # axis (if present) indexes the class.
        if isinstance(shap_values, list):
            shap_arr = np.stack(shap_values, axis=-1)
        else:
            shap_arr = np.asarray(shap_values)

        # Per-sample values keep the original definition: signed mean over classes.
        # NOTE(review): for multi-class models the signed per-class values tend
        # to cancel each other, so these per-sample numbers can be close to
        # zero - consider storing the predicted class' values instead.
        shap_2d = shap_arr.mean(axis=2) if shap_arr.ndim == 3 else shap_arr

        # Explicit check instead of `assert`, which is removed under `python -O`.
        if shap_2d.ndim != 2 or shap_2d.shape[1] != len(features):
            raise ValueError(
                f"Expected SHAP shape (n, {len(features)}), exists {shap_2d.shape}"
            )

        # Global importance = mean |SHAP|. The absolute value must be taken
        # BEFORE averaging over classes; averaging signed per-class values
        # first lets them cancel and yields importances near zero.
        if shap_arr.ndim == 3:
            feat_imp = np.abs(shap_arr).mean(axis=(0, 2))
        else:
            feat_imp = np.abs(shap_2d).mean(axis=0)

    except Exception as e:
        print(f"Failed for {solver_name}-{target}: {e}")
        return

    shap_df = pd.DataFrame(shap_2d, columns=features)
    shap_df["predicted_value"] = pred_val
    shap_df["actual_value"]    = y_val
    shap_df["target"]          = target
    shap_df["solver"]          = solver_name

    out_dir = "./lr_class/lr_shap_values"
    os.makedirs(out_dir, exist_ok=True)
    shap_df.to_csv(
        f"{out_dir}/shap_{solver_name}_{target}_classification.csv",
        index=False
    )

    importance_df = pd.DataFrame({
        "feature": features,
        "importance": feat_imp,
        "target": target,
        "solver": solver_name
    })

    imp_dir = "./lr_class/lr_feature_importance"
    os.makedirs(imp_dir, exist_ok=True)

    # Both importance files accumulate rows across solver/target runs; the
    # header is written only when the file does not exist yet.
    importance_path = f"{imp_dir}/lr_feature_importance_classification.csv"
    importance_df.to_csv(
        importance_path,
        mode='a', index=False,
        header=not os.path.exists(importance_path)
    )

    # Top-5 features
    top5_path = f"{imp_dir}/lr_top5_feature_importance_classification.csv"
    top5 = importance_df.nlargest(5, "importance")
    top5.to_csv(
        top5_path,
        mode='a', index=False,
        header=not os.path.exists(top5_path)
    )

    print(f"SHAP & feature importances saved for {solver_name}-{target}.")

In [None]:
def _to_bin_labels(values, edges):
    """Map raw target values to 0-based bin indices, clamped to the valid bin range."""
    labels = np.digitize(values, edges[:-1], right=False) - 1
    return np.clip(labels, 0, len(edges) - 2)


def _load_bin_edges(solver_name):
    """Load the single ``*_bins.json`` file from ``<BINS_BASE_DIR>/<solver_name>_bins``."""
    bins_dir = os.path.join(BINS_BASE_DIR, f"{solver_name}_bins")
    if not os.path.isdir(bins_dir):
        raise FileNotFoundError(f"No bins directory: {bins_dir}")
    json_files = [f for f in os.listdir(bins_dir) if f.endswith("_bins.json")]
    if len(json_files) != 1:
        raise FileNotFoundError(f"Expected one json in {bins_dir}, found: {json_files}")
    with open(os.path.join(bins_dir, json_files[0]), "r") as f:
        return json.load(f)


def train_lr_classifier_for_solver(solver_name, train_file, test_file, val_file):
    """Train, tune, evaluate and persist logistic-regression classifiers for one solver.

    For each of the targets ``solution_time``, ``optimality_gap`` and
    ``peak_memory`` the continuous target is discretised with the bin edges
    loaded from ``<BINS_BASE_DIR>/<solver_name>_bins/*_bins.json``, a small
    grid over ``C`` is searched, and the candidate with the best weighted F1
    on the validation split is kept. Tuning logs, the best parameters, the
    fitted model, evaluation rows and SHAP outputs are written below
    ``./lr_class``.

    Args:
        solver_name: Solver identifier; used to locate the bin file and to
            name every output artefact.
        train_file: Path to the training CSV.
        test_file: Path to the test CSV.
        val_file: Path to the validation CSV.

    Raises:
        FileNotFoundError: If the bins directory is missing or does not
            contain exactly one ``*_bins.json`` file.

    Notes:
        Reads the module-level global ``BINS_BASE_DIR``, which must be
        defined before this function is called.
    """
    print(f"\nSolver: {solver_name}")

    # Rows with a missing value in any column are dropped.
    df_train = pd.read_csv(train_file).dropna()
    df_test  = pd.read_csv(test_file).dropna()
    df_val   = pd.read_csv(val_file).dropna()

    features = [
        "number_of_elements","capacity","max_weight","min_weight","mean_weight",
        "median_weight","std_weight","weight_range","max_profit","min_profit","mean_profit",
        "median_profit","std_profit","profit_range","renting_ratio","mean_weight_profit_ratio",
        "median_weight_profit_ratio","capacity_mean_weight_ratio","capacity_median_weight_ratio",
        "capacity_std_weight_ratio","std_weight_profit_ratio","weight_profit_correlation",
        "ram","cpu_cores"
    ]
    target_cols = ["solution_time", "optimality_gap", "peak_memory"]

    # Precomputed bin edges
    bin_edges_dict = _load_bin_edges(solver_name)

    # The scaler is fitted on the training split only and then applied to all splits.
    # NOTE(review): the fitted scaler is not persisted, so the saved models
    # cannot be applied to new raw data without refitting it - consider
    # dumping it next to the models.
    scaler = StandardScaler().fit(df_train[features])
    X_train = scaler.transform(df_train[features])
    X_test  = scaler.transform(df_test[features])
    X_val   = scaler.transform(df_val[features])

    base_out = "./lr_class"
    os.makedirs(f"{base_out}/lr_tuning", exist_ok=True)
    os.makedirs(f"{base_out}/lr_configs", exist_ok=True)
    os.makedirs(f"{base_out}/lr_classifier_models", exist_ok=True)

    # Hyperparameter grid (identical for every target, so built once).
    param_grid = {
        "C": [0.01, 0.1],
        "penalty": ["l2"],
        "solver": ["lbfgs"]
    }

    results = []
    for target in target_cols:
        if target not in bin_edges_dict:
            print(f"No bin for '{target}', skipping.")
            continue
        edges = bin_edges_dict[target]

        y_train = _to_bin_labels(df_train[target].values, edges)
        y_test  = _to_bin_labels(df_test[target].values, edges)
        y_val   = _to_bin_labels(df_val[target].values, edges)

        # Fold test/val labels above the highest class seen in training into that class.
        max_train = y_train.max()
        y_test = np.clip(y_test, 0, max_train)
        y_val  = np.clip(y_val,  0, max_train)

        # Skip if only one class in train
        cls_train = np.unique(y_train)
        if len(cls_train) < 2:
            print(f"Skipping '{target}': only one class {cls_train}")
            continue

        # Hyperparameter tune: candidates are ranked by weighted F1 on the
        # validation split (the reported metrics below are macro-averaged).
        best_f1 = -np.inf
        best_model = None
        best_params = None
        best_pred_val = None
        tuning_logs = []

        for params in ParameterGrid(param_grid):
            model = LogisticRegression(**params, max_iter=1000, random_state=42)
            model.fit(X_train, y_train)
            pred_val = model.predict(X_val)
            f1 = f1_score(y_val, pred_val, average='weighted')
            tuning_logs.append({**params, "f1_score": f1})

            if f1 > best_f1:
                best_f1       = f1
                best_model    = model
                best_params   = params
                best_pred_val = pred_val

        if best_model is None:
            print(f"No model for '{target}'")
            continue

        # Test predictions are only needed for the winning candidate.
        best_pred_test = best_model.predict(X_test)

        # Save tuning logs & config
        pd.DataFrame(tuning_logs).to_csv(
            f"{base_out}/lr_tuning/tuning_lr_{solver_name}_{target}.csv", index=False
        )
        with open(f"{base_out}/lr_configs/best_lr_{solver_name}_{target}.json", "w") as f:
            json.dump(best_params, f, indent=4)

        # Evaluate. The split is encoded in the label so that test and
        # validation rows stay distinguishable in the results CSV.
        print(f"[{target}Test]")
        evaluate_classification(y_test, best_pred_test, results,
                                solver_name, f"{target}_test")
        print(f"[{target}Val]")
        evaluate_classification(y_val, best_pred_val, results,
                                solver_name, f"{target}_val")

        model_path = f"{base_out}/lr_classifier_models/lr_{solver_name}_{target}.joblib"
        joblib.dump(best_model, model_path)
        print(f"Saved model {model_path}")

        # SHAP & feature importance
        log_shap_and_importance_classification(
            best_model, X_val, y_val, best_pred_val,
            features, solver_name, target
        )

    # Append this solver's rows to the shared results file. Nothing is
    # written when every target was skipped: appending an empty frame would
    # create a header-less file and suppress the header on later appends.
    if results:
        results_path = f"{base_out}/lr_evaluation_results_classification.csv"
        write_header = (
            not os.path.exists(results_path) or os.path.getsize(results_path) == 0
        )
        pd.DataFrame(results).to_csv(
            results_path,
            mode='a', index=False,
            header=write_header
        )

In [5]:
def run_all_models(base_folder):
    """Train classifiers for every solver folder found below ``base_folder``.

    The tree is walked recursively. Every directory that contains at least
    one ``*_train.csv``, one ``*_test.csv`` and one ``*_val.csv`` file is
    treated as a solver dataset: the directory name is used as the solver
    name and the files are passed to ``train_lr_classifier_for_solver``.
    Directories without a complete train/test/val triple are ignored.

    Directories and files are visited in sorted order, so the processing
    order (and the file picked when several match a suffix) is the same on
    every run and filesystem.

    Args:
        base_folder: Root directory to search.

    Returns:
        None.
    """
    if not os.path.isdir(base_folder):
        # os.walk would silently yield nothing; make the misconfiguration visible.
        print(f"Training data folder not found: {base_folder}")
        return

    for root, dirs, _files in os.walk(base_folder):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for folder in dirs:
            folder_path = os.path.join(root, folder)
            csv_files = sorted(os.listdir(folder_path))

            train_file = [f for f in csv_files if f.endswith("_train.csv")]
            test_file = [f for f in csv_files if f.endswith("_test.csv")]
            val_file = [f for f in csv_files if f.endswith("_val.csv")]

            if not (train_file and test_file and val_file):
                continue

            train_fp = os.path.join(folder_path, train_file[0])
            test_fp = os.path.join(folder_path, test_file[0])
            val_fp = os.path.join(folder_path, val_file[0])

            # The folder name doubles as the solver name.
            train_lr_classifier_for_solver(folder, train_fp, test_fp, val_fp)

In [None]:
# Root folder searched by run_all_models: each sub-folder holding *_train.csv,
# *_test.csv and *_val.csv files is trained as one solver (folder name = solver name).
base_folder = "./trainingData/final_td_min/td_models" #Path to training data
# Read as a module-level global by train_lr_classifier_for_solver to locate
# "<solver>_bins/*_bins.json"; it must be assigned before the run below starts.
BINS_BASE_DIR = "./trainingData/final_td_min/td_bindata" #Path to bin data
run_all_models(base_folder)


Solver: or_min
[solution_time → Test]
solution_time | Acc: 1.0000, Prec: 1.0000, Rec: 1.0000, F1: 1.0000
[solution_time → Val]
solution_time | Acc: 1.0000, Prec: 1.0000, Rec: 1.0000, F1: 1.0000
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_or_min_solution_time.joblib
SHAP & feature importances saved for or_min-solution_time.
[optimality_gap → Test]
optimality_gap | Acc: 1.0000, Prec: 1.0000, Rec: 1.0000, F1: 1.0000
[optimality_gap → Val]
optimality_gap | Acc: 0.9500, Prec: 0.4750, Rec: 0.5000, F1: 0.4872
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_or_min_optimality_gap.joblib
SHAP & feature importances saved for or_min-optimality_gap.
[peak_memory → Test]
peak_memory | Acc: 0.9179, Prec: 0.3060, Rec: 0.3333, F1: 0.3191
[peak_memory → Val]
peak_memory | Acc: 0.9107, Prec: 0.3114, Rec: 0.3244, F1: 0.3178
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_or_min_peak_memory.joblib
SHAP & feature importances saved for or_min-peak



[solution_time → Test]
solution_time | Acc: 0.7964, Prec: 0.4639, Rec: 0.4661, F1: 0.4650
[solution_time → Val]
solution_time | Acc: 0.8554, Prec: 0.5262, Rec: 0.4570, F1: 0.4828
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_gurobi_min_solution_time.joblib
SHAP & feature importances saved for gurobi_min-solution_time.
  • skipping 'optimality_gap': only one class [0]
[peak_memory → Test]
peak_memory | Acc: 0.2839, Prec: 0.1893, Rec: 0.1850, F1: 0.1723
[peak_memory → Val]
peak_memory | Acc: 0.2946, Prec: 0.3642, Rec: 0.2367, F1: 0.2079
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_gurobi_min_peak_memory.joblib
SHAP & feature importances saved for gurobi_min-peak_memory.

Solver: greedy_min
[solution_time → Test]
solution_time | Acc: 0.9446, Prec: 0.8899, Rec: 0.9651, F1: 0.9102
[solution_time → Val]
solution_time | Acc: 0.7411, Prec: 0.7735, Rec: 0.7602, F1: 0.7415
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_greedy_min_sol



[optimality_gap → Test]
optimality_gap | Acc: 0.7750, Prec: 0.2583, Rec: 0.3333, F1: 0.2911
[optimality_gap → Val]
optimality_gap | Acc: 0.8000, Prec: 0.2735, Rec: 0.3232, F1: 0.2963
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_greedy_min_optimality_gap.joblib
SHAP & feature importances saved for greedy_min-optimality_gap.
  • skipping 'peak_memory': only one class [0]

Solver: ga_min
[solution_time → Test]
solution_time | Acc: 0.8125, Prec: 0.6958, Rec: 0.7859, F1: 0.7080
[solution_time → Val]
solution_time | Acc: 0.6375, Prec: 0.6550, Rec: 0.6508, F1: 0.6482
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_ga_min_solution_time.joblib
SHAP & feature importances saved for ga_min-solution_time.
  • skipping 'optimality_gap': only one class [0]
[peak_memory → Test]
peak_memory | Acc: 0.9286, Prec: 0.4643, Rec: 0.5000, F1: 0.4815
[peak_memory → Val]
peak_memory | Acc: 0.9286, Prec: 0.4643, Rec: 0.5000, F1: 0.4815
  • saved model → ./binres_min_kp/lr



[solution_time → Test]
solution_time | Acc: 0.7446, Prec: 0.6284, Rec: 0.6829, F1: 0.6206
[solution_time → Val]
solution_time | Acc: 0.6589, Prec: 0.7016, Rec: 0.7171, F1: 0.6758
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_dp_min_solution_time.joblib
SHAP & feature importances saved for dp_min-solution_time.
[optimality_gap → Test]
optimality_gap | Acc: 1.0000, Prec: 1.0000, Rec: 1.0000, F1: 1.0000
[optimality_gap → Val]
optimality_gap | Acc: 0.9500, Prec: 0.4750, Rec: 0.5000, F1: 0.4872
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_dp_min_optimality_gap.joblib
SHAP & feature importances saved for dp_min-optimality_gap.
[peak_memory → Test]
peak_memory | Acc: 0.9286, Prec: 0.6410, Rec: 0.6667, F1: 0.6533
[peak_memory → Val]
peak_memory | Acc: 0.9286, Prec: 0.6410, Rec: 0.6667, F1: 0.6533
  • saved model → ./binres_min_kp/lr_class/lr_classifier_models/lr_dp_min_peak_memory.joblib
SHAP & feature importances saved for dp_min-peak_memory.

Solver

