In [1]:
%pip install joblib

import os
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import shap
import json

Note: you may need to restart the kernel to use updated packages.


In [2]:
def evaluate(y_true, y_pred, results, solver_name, label):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    rel_rmse = rmse / np.mean(y_true)

    safe_y_true = np.where(y_true == 0, 1e-8, y_true)
    mape = np.mean(np.abs((y_true - y_pred) / safe_y_true)) * 100

    result = {
        "Solver": solver_name,
        "Dataset": label,
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2,
        "Rel_RMSE": rel_rmse,
        "MAPE (%)": mape,
    }
    results.append(result)

In [None]:
def log_lr_shap_and_importance(model, X_val, y_val, pred_val, features, solver_name, target):

    background = shap.sample(X_val, min(100, X_val.shape[0]), random_state=42)

    try:
        explainer = shap.KernelExplainer(model.predict, background)
        shap_values = explainer.shap_values(X_val, nsamples=100)

        shap_df = pd.DataFrame(shap_values, columns=features)
        shap_df["predicted_value"] = pred_val
        shap_df["actual_value"] = y_val.values
        shap_df["target"] = target
        shap_df["solver"] = solver_name

        os.makedirs("./lr/lr_shap_values", exist_ok=True)
        shap_filename = f"./lr/lr_shap_values/shap_{solver_name}_{target}_reg.csv"
        shap_df.to_csv(shap_filename, index=False)

      
        importance_df = pd.DataFrame({
            "feature": features,
            "importance": np.abs(shap_values).mean(axis=0),
            "coefficient": model.coef_,
            "target": target,
            "solver": solver_name
        })

        os.makedirs("./lr/lr_feature_importance", exist_ok=True)
        importance_file = "./lr/lr_feature_importance/lr_feature_importance_reg.csv"
        importance_df.to_csv(importance_file, mode='a', index=False, header=not os.path.exists(importance_file))

        #  top-5 
        top5_file = "./lr/lr_feature_importance/lr_top5_feature_importance_reg.csv"
        importance_df.sort_values(by="importance", ascending=False).head(5).to_csv(
            top5_file, mode='a', index=False, header=not os.path.exists(top5_file)
        )

    except Exception as e:
        print(f"SHAP failed for {solver_name}-{target}: {e}")

In [None]:
def train_linear_for_solver(solver_name, train_file, test_file, val_file):
    print(f"\nSolver: {solver_name}")
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)
    df_val = pd.read_csv(val_file)
        
    df_train.dropna(inplace=True)
    df_test.dropna(inplace=True)
    df_val.dropna(inplace=True)

    target_cols = ["solution_time", "optimality_gap", "peak_memory"]
    features = [
        "number_of_elements", "capacity", "max_weight", "min_weight", "mean_weight",
        "median_weight", "std_weight", "weight_range", "max_profit", "min_profit", "mean_profit",
        "median_profit", "std_profit", "profit_range", "renting_ratio", "mean_weight_profit_ratio",
        "median_weight_profit_ratio", "capacity_mean_weight_ratio", "capacity_median_weight_ratio",
        "capacity_std_weight_ratio", "std_weight_profit_ratio", "weight_profit_correlation",
        "ram", "cpu_cores"
    ]

    for df in [df_train, df_test, df_val]:
        for col in target_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df.dropna(subset=[col], inplace=True)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(df_train[features])
    X_test = scaler.transform(df_test[features])
    X_val = scaler.transform(df_val[features])
    
    results=[]

    for target in target_cols:
        y_train = df_train[target]
        y_test = df_test[target]
        y_val = df_val[target]

        model = LinearRegression()
        model.fit(X_train, y_train)

        pred_test = model.predict(X_test)
        pred_val = model.predict(X_val)

        print(f"Target: {target.upper()}")
        print("TEST")
        evaluate(y_test, pred_test, results, solver_name, label=f"{target} (Test)")
        print("VAL")
        evaluate(y_val, pred_val, results, solver_name, label=f"{target} (Val)")
        log_lr_shap_and_importance(model, X_val, y_val, pred_val, features, solver_name, target)
        
        
        # Save model
        os.makedirs("./lr_models", exist_ok=True)
        model_path = f"./lr_models/lr_model_{solver_name}_{target}.joblib"
        joblib.dump(model, model_path)
        print(f"Saved model {model_path}")




    results_df = pd.DataFrame(results)
    results_file = "./lr_evaluation_results_reg.csv"

    if os.path.exists(results_file):
        results_df.to_csv(results_file, mode='a', index=False, header=False)
    else:
        results_df.to_csv(results_file, index=False)

In [5]:
def run_all_models(base_folder):
    for root, dirs, files in os.walk(base_folder):
        for folder in dirs:
            folder_path = os.path.join(root, folder)
            csv_files = os.listdir(folder_path)

            train_file = [f for f in csv_files if f.endswith("_train.csv")]
            test_file = [f for f in csv_files if f.endswith("_test.csv")]
            val_file = [f for f in csv_files if f.endswith("_val.csv")]

            if train_file and test_file and val_file:
                train_fp = os.path.join(folder_path, train_file[0])
                test_fp = os.path.join(folder_path, test_file[0])
                val_fp = os.path.join(folder_path, val_file[0])

                solver_name = folder
                train_linear_for_solver(solver_name, train_fp, test_fp, val_fp)

In [None]:
base_folder = "./trainingData/final_td_min"  #Path to training data
run_all_models(base_folder)


Solver: or_min
Target: SOLUTION_TIME
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_or_min_solution_time.joblib
Target: OPTIMALITY_GAP
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_or_min_optimality_gap.joblib
Target: PEAK_MEMORY
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_or_min_peak_memory.joblib

Solver: gurobi_min
Target: SOLUTION_TIME
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_gurobi_min_solution_time.joblib
Target: OPTIMALITY_GAP
 TEST
VAL


  rel_rmse = rmse / np.mean(y_true)
  rel_rmse = rmse / np.mean(y_true)


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_gurobi_min_optimality_gap.joblib
Target: PEAK_MEMORY
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_gurobi_min_peak_memory.joblib

Solver: greedy_min
Target: SOLUTION_TIME
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_greedy_min_solution_time.joblib
Target: OPTIMALITY_GAP
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_greedy_min_optimality_gap.joblib
Target: PEAK_MEMORY
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_greedy_min_peak_memory.joblib

Solver: ga_min
Target: SOLUTION_TIME
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_ga_min_solution_time.joblib
Target: OPTIMALITY_GAP
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_ga_min_optimality_gap.joblib
Target: PEAK_MEMORY
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_ga_min_peak_memory.joblib

Solver: dp_min
Target: SOLUTION_TIME
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_dp_min_solution_time.joblib
Target: OPTIMALITY_GAP
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_dp_min_optimality_gap.joblib
Target: PEAK_MEMORY
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_dp_min_peak_memory.joblib

Solver: bb_min
Target: SOLUTION_TIME
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_bb_min_solution_time.joblib
Target: OPTIMALITY_GAP
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_bb_min_optimality_gap.joblib
Target: PEAK_MEMORY
 TEST
VAL


  0%|          | 0/560 [00:00<?, ?it/s]

Saved Linear Regression model to ./results_min_kp/lr_models/lr_model_bb_min_peak_memory.joblib
