In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from collections import defaultdict
import random
import joblib
import matplotlib.pyplot as plt
import itertools

In [2]:
def discretize_target(y, n_bins=5):
    """Split the value range of ``y`` into ``n_bins`` equal-width bins.

    Parameters
    ----------
    y : array-like of float
        Non-empty collection of target values.
    n_bins : int, default 5
        Number of equal-width bins spanning ``[min(y), max(y)]``.

    Returns
    -------
    y_binned : numpy.ndarray of int
        Zero-based bin index of every value, in ``0 .. n_bins - 1``. The
        indices line up with ``labels`` and can be used directly to index
        ``bins`` (bin ``k`` spans ``bins[k] .. bins[k + 1]``).
    bins : numpy.ndarray
        The ``n_bins + 1`` bin edges.
    labels : list of int
        ``[0, 1, ..., n_bins - 1]``.
    """
    y = np.asarray(y)
    bins = np.linspace(y.min(), y.max(), n_bins + 1)
    labels = list(range(n_bins))
    # Digitize against the left edges only, so max(y) falls into the last bin
    # instead of an extra overflow bin. np.digitize is 1-based for values
    # >= bins[0]; subtract 1 so the result matches ``labels``.
    y_binned = np.digitize(y, bins[:-1], right=False) - 1
    return y_binned, bins, labels

In [3]:
def compute_bin_accuracy(y_true, y_pred_bins, bins):
    """Fraction of samples whose true value falls in the predicted bin.

    Parameters
    ----------
    y_true : array-like of float
        True target values, expressed in the same units as ``bins``.
    y_pred_bins : array-like of int
        Predicted zero-based bin indices (bin ``k`` spans
        ``bins[k] .. bins[k + 1]``).
    bins : array-like of float
        The ``n_bins + 1`` monotonically increasing bin edges.

    Returns
    -------
    float
        Mean of the element-wise "true bin == predicted bin" comparison.
    """
    edges = np.asarray(bins)
    last_bin = len(edges) - 2
    # np.digitize returns 1-based indices for values >= edges[0] (and 0 for
    # values below it). Shift to 0-based so the result is comparable with the
    # predicted bin indices, and clamp out-of-range values into the first /
    # last bin.
    true_bins = np.clip(np.digitize(y_true, edges[:-1], right=False) - 1, 0, last_bin)
    correct = (true_bins == np.asarray(y_pred_bins))
    return np.mean(correct)

In [None]:
def evaluate(y_true, y_pred, results, solver_name, label, pred_bins, bins):
    """Compute regression and bin-accuracy metrics and append them to ``results``.

    Parameters
    ----------
    y_true, y_pred : array-like of float
        True and predicted target values. They are subtracted element-wise,
        so callers are expected to pass numpy arrays.
    results : list
        Mutated in place: one dict of metrics is appended.
    solver_name : str
        Stored under the ``"Solver"`` key.
    label : str
        Stored under the ``"Dataset"`` key.
    pred_bins : sequence of int
        Predicted bin indices, forwarded to ``compute_bin_accuracy``.
    bins : array-like of float
        Bin edges, forwarded to ``compute_bin_accuracy``.

    Returns
    -------
    None
    """
    abs_err = mean_absolute_error(y_true, y_pred)
    sq_err = mean_squared_error(y_true, y_pred)
    determination = r2_score(y_true, y_pred)
    root_sq_err = np.sqrt(sq_err)

    # Exact zeros in the denominator are replaced by a tiny epsilon so the
    # percentage error stays finite.
    denominators = np.where(y_true == 0, 1e-8, y_true)
    pct_err = np.mean(np.abs((y_true - y_pred) / denominators)) * 100

    record = {
        "Solver": solver_name,
        "Dataset": label,
        "MAE": abs_err,
        "MSE": sq_err,
        "RMSE": root_sq_err,
        "R2": determination,
        # RMSE relative to the mean of the true values.
        "Rel_RMSE": root_sq_err / np.mean(y_true),
        "MAPE (%)": pct_err,
        "BinAccuracy": compute_bin_accuracy(y_true, pred_bins, bins),
    }
    results.append(record)

In [None]:
def train_q_learning(X_train, y_train, bins, alpha=0.1, gamma=0.9, epsilon=0.2, episodes=100):
    """Train a tabular Q-learning agent that picks a target bin per sample.

    Each training row is a state (its features rounded to 2 decimals) and each
    bin is an action. Choosing a bin predicts that bin's midpoint, and the
    reward is the negative absolute error against the true target. Every
    sample is a single-step episode, so the bootstrap term uses the Q-values
    of the same state.

    Parameters
    ----------
    X_train : numpy.ndarray, shape (n_samples, n_features)
        Feature rows, indexed as ``X_train[i]``.
    y_train : sequence of float
        Targets in the same units as ``bins``, indexed as ``y_train[i]``.
    bins : sequence of float
        The ``n_actions + 1`` bin edges.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor applied to the max Q-value of the (same) state.
    epsilon : float
        Probability of picking a uniformly random action.
    episodes : int
        Number of full passes over the training set.

    Returns
    -------
    collections.defaultdict
        Maps state tuples to arrays of ``n_actions`` Q-values. Unseen states
        default to all zeros.

    Notes
    -----
    Exploration draws from the global ``np.random`` state; seed it beforehand
    for reproducible results. Progress is printed every 10 episodes.
    """
    n_actions = len(bins) - 1
    q_table = defaultdict(lambda: np.zeros(n_actions))

    # State keys and bin midpoints do not change between episodes, so they are
    # computed once instead of once per sample per episode.
    states = [tuple(np.round(X_train[i], 2)) for i in range(len(X_train))]
    bin_centers = [0.5 * (bins[a] + bins[a + 1]) for a in range(n_actions)]

    for episode in range(episodes):
        total_reward = 0
        for i, state in enumerate(states):
            q_values = q_table[state]  # live reference; updated in place below

            # Epsilon-greedy action selection.
            if np.random.rand() < epsilon:
                action = np.random.randint(0, n_actions)
            else:
                action = np.argmax(q_values)

            reward = -abs(bin_centers[action] - y_train[i])
            total_reward += reward

            q_values[action] += alpha * (
                reward + gamma * np.max(q_values) - q_values[action]
            )

        if episode % 10 == 0 or episode == episodes - 1:
            print(f"Q-Learning Episode {episode}: Total Reward = {total_reward:.4f}")
            # Show the first sample's Q-values as a sanity check (skipped when
            # the training set is empty).
            if states:
                sample_state = states[0]
                print(f"Q[{sample_state}] = {q_table[sample_state]}")

    return q_table

In [None]:
def train_sarsa(X_train, y_train, bins, alpha=0.1, gamma=0.9, epsilon=0.2, episodes=100):
    """Train a tabular SARSA agent that picks a target bin per sample.

    Each training row is a state (its features rounded to 2 decimals) and each
    bin is an action. Choosing a bin predicts that bin's midpoint, and the
    reward is the negative absolute error against the true target. Every
    sample is a single-step episode, so the "next state" is the same state and
    the next action is drawn epsilon-greedily from its Q-values before the
    update.

    Parameters
    ----------
    X_train : numpy.ndarray, shape (n_samples, n_features)
        Feature rows, indexed as ``X_train[i]``.
    y_train : sequence of float
        Targets in the same units as ``bins``, indexed as ``y_train[i]``.
    bins : sequence of float
        The ``n_actions + 1`` bin edges.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor applied to the Q-value of the sampled next action.
    epsilon : float
        Probability of picking a uniformly random action.
    episodes : int
        Number of full passes over the training set.

    Returns
    -------
    collections.defaultdict
        Maps state tuples to arrays of ``n_actions`` Q-values. Unseen states
        default to all zeros.

    Notes
    -----
    Exploration draws from the global ``np.random`` state; seed it beforehand
    for reproducible results. Progress is printed every 10 episodes.
    """
    n_actions = len(bins) - 1
    q_table = defaultdict(lambda: np.zeros(n_actions))

    # State keys and bin midpoints do not change between episodes, so they are
    # computed once instead of once per sample per episode.
    states = [tuple(np.round(X_train[i], 2)) for i in range(len(X_train))]
    bin_centers = [0.5 * (bins[a] + bins[a + 1]) for a in range(n_actions)]

    for episode in range(episodes):
        total_reward = 0
        for i, state in enumerate(states):
            q_values = q_table[state]  # live reference; updated in place below

            # Epsilon-greedy choice of the action that is actually taken.
            if np.random.rand() < epsilon:
                action = np.random.randint(0, n_actions)
            else:
                action = np.argmax(q_values)

            reward = -abs(bin_centers[action] - y_train[i])
            total_reward += reward

            # Single-step task: the next state is the current state, so the
            # on-policy next action is sampled from the same Q-values.
            if np.random.rand() < epsilon:
                next_action = np.random.randint(0, n_actions)
            else:
                next_action = np.argmax(q_values)

            q_values[action] += alpha * (
                reward + gamma * q_values[next_action] - q_values[action]
            )

        if episode % 10 == 0 or episode == episodes - 1:
            print(f"SARSA Episode {episode}: Total Reward = {total_reward:.4f}")
            # Show the first sample's Q-values as a sanity check (skipped when
            # the training set is empty).
            if states:
                sample_state = states[0]
                print(f"Q[{sample_state}] = {q_table[sample_state]}")

    return q_table

In [7]:
def predict_with_q_table(X, q_table, bins):
    """Predict a target value per row by greedy lookup in a Q-table.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
        Feature rows; each is rounded to 2 decimals to form the state key.
    q_table : mapping
        Maps state tuples to arrays of Q-values (one per bin). Both
        ``defaultdict`` and plain ``dict`` are accepted; the table is never
        modified.
    bins : sequence of float
        Bin edges; action ``k`` predicts the midpoint of
        ``bins[k] .. bins[k + 1]``.

    Returns
    -------
    preds : list of float
        Midpoint of the chosen bin for every row.
    pred_bins : list of int
        Zero-based index of the chosen bin for every row.
    """
    preds = []
    pred_bins = []
    for i in range(len(X)):
        state = tuple(np.round(X[i], 2))
        # Use .get() rather than indexing: indexing a defaultdict would insert
        # a new all-zero entry for every unseen state and so mutate the
        # trained table during inference.
        q_values = q_table.get(state)
        # Unseen states fall back to bin 0, which is what argmax over an
        # all-zero Q-vector yields.
        action = 0 if q_values is None else int(np.argmax(q_values))
        preds.append(0.5 * (bins[action] + bins[action + 1]))
        pred_bins.append(action)
    return preds, pred_bins

In [None]:
def _load_clean_frame(path, target_cols):
    """Read a CSV, drop rows with any NaN, and keep only rows whose targets are numeric."""
    frame = pd.read_csv(path).dropna()
    frame = frame.assign(
        **{col: pd.to_numeric(frame[col], errors='coerce') for col in target_cols}
    )
    return frame.dropna(subset=target_cols)


def rl_predict(solver_name, train_file, test_file, val_file, strategy="q_learning"):
    """Tune, train and evaluate a tabular RL regressor for every target column.

    For each of ``solution_time``, ``optimality_gap`` and ``peak_memory`` the
    target is standardized and discretized into 5 bins. Hyperparameters are
    tuned on the validation split, a Q-table is trained with the best
    configuration, and test/validation metrics are appended to
    ``./results_min_kp/rl_{strategy}_eval_results.csv``.

    Parameters
    ----------
    solver_name : str
        Label stored with the results and used in tuning file names.
    train_file, test_file, val_file : str
        Paths to CSV files containing the feature and target columns.
    strategy : str, default "q_learning"
        ``"sarsa"`` trains with SARSA; any other value uses Q-learning.

    Returns
    -------
    None
    """
    print(f"\n[RL - {strategy.upper()} - Algo: {solver_name}]")

    target_cols = ["solution_time", "optimality_gap", "peak_memory"]
    features = [
        "number_of_elements", "capacity", "max_weight", "min_weight", "mean_weight",
        "median_weight", "std_weight", "weight_range", "max_profit", "min_profit", "mean_profit",
        "median_profit", "std_profit", "profit_range", "renting_ratio", "mean_weight_profit_ratio",
        "median_weight_profit_ratio", "capacity_mean_weight_ratio", "capacity_median_weight_ratio",
        "capacity_std_weight_ratio", "std_weight_profit_ratio", "weight_profit_correlation",
        "ram", "cpu_cores"
    ]

    df_train = _load_clean_frame(train_file, target_cols)
    df_test = _load_clean_frame(test_file, target_cols)
    df_val = _load_clean_frame(val_file, target_cols)

    # Feature scaling is fit on the training split only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(df_train[features])
    X_test = scaler.transform(df_test[features])
    X_val = scaler.transform(df_val[features])

    trainer = train_sarsa if strategy == "sarsa" else train_q_learning
    results = []

    for target in target_cols:
        y_scaler = StandardScaler()
        y_train_scaled = y_scaler.fit_transform(df_train[[target]]).flatten()
        y_val_scaled = y_scaler.transform(df_val[[target]]).flatten()

        # Ground truth in original units is taken straight from the frames.
        # A scale / inverse-scale round-trip can turn exact zeros into tiny
        # non-zero values, which defeats the zero guard in the MAPE metric.
        y_test = df_test[target].to_numpy(dtype=float)
        y_val = df_val[target].to_numpy(dtype=float)

        _, bins, _ = discretize_target(y_train_scaled, n_bins=5)
        # Bin edges live in standardized units. Map them back to original
        # units so bin accuracy compares like with like against y_test and
        # y_val. The inverse transform is increasing, so bin membership is
        # unchanged.
        bins_orig = y_scaler.inverse_transform(np.asarray(bins).reshape(-1, 1)).flatten()

        # Tune hyperparameters
        config = tune_rl_hyperparameters(X_train, y_train_scaled, X_val, y_val_scaled, bins, solver_name, target, strategy)
        print(f"Using tuned config for {target}: {config}")

        q_table = trainer(X_train, y_train_scaled, bins,
                          gamma=config["gamma"],
                          epsilon=config["epsilon"],
                          alpha=config["alpha"],
                          episodes=config["episodes"])

        pred_test_scaled, pred_bins_test = predict_with_q_table(X_test, q_table, bins)
        pred_val_scaled, pred_bins_val = predict_with_q_table(X_val, q_table, bins)
        pred_test = y_scaler.inverse_transform(np.array(pred_test_scaled).reshape(-1, 1)).flatten()
        pred_val = y_scaler.inverse_transform(np.array(pred_val_scaled).reshape(-1, 1)).flatten()

        print(f"\nTarget: {target.upper()}")
        print("[Test]")
        evaluate(y_test, pred_test, results, solver_name, f"{target} (Test)", pred_bins_test, bins_orig)
        print("[Val]")
        evaluate(y_val, pred_val, results, solver_name, f"{target} (Val)", pred_bins_val, bins_orig)

        print(f"Accuracy (Test): {compute_bin_accuracy(y_test, pred_bins_test, bins_orig):.4f}")
        print(f"Accuracy (Val) : {compute_bin_accuracy(y_val, pred_bins_val, bins_orig):.4f}")

    results_df = pd.DataFrame(results)
    results_file = f"./results_min_kp/rl_{strategy}_eval_results.csv"
    # Create the output directory on first use; to_csv does not create it.
    os.makedirs(os.path.dirname(results_file), exist_ok=True)
    # Append to an existing file; write the header only when creating it.
    write_header = not os.path.exists(results_file)
    results_df.to_csv(results_file, mode='a', index=False, header=write_header)

In [None]:
def tune_rl_hyperparameters(X_train, y_train, X_val, y_val, bins, solver_name, target, strategy):
    """Grid-search RL hyperparameters and return the best configuration.

    For every combination of gamma, epsilon, episodes and alpha a Q-table is
    trained on the training split and scored by RMSE on the validation split.
    All scores are written to
    ``./rl_tuning/tune_{strategy}_{solver_name}_{target}.csv``.

    Parameters
    ----------
    X_train, X_val : numpy.ndarray
        Feature matrices for the training and validation splits.
    y_train, y_val : array-like of float
        Targets in the same units as ``bins``.
    bins : sequence of float
        Bin edges passed to the trainer and the predictor.
    solver_name, target : str
        Labels stored in the tuning CSV and used in its file name.
    strategy : str
        ``"sarsa"`` trains with SARSA; any other value uses Q-learning.

    Returns
    -------
    dict
        Keys ``"gamma"``, ``"epsilon"``, ``"episodes"`` and ``"alpha"`` of the
        configuration with the lowest validation RMSE.

    Notes
    -----
    RMSE is computed in the units of ``y_train`` / ``y_val`` as passed in. No
    extra scaler is fit here: applying one affine map to both predictions and
    targets only rescales every RMSE by the same factor, so it cannot change
    which configuration wins.
    """
    gammas = [0.9]
    epsilons = [0.1, 0.01]
    episodes_list = [500, 1000]
    alphas = [0.05, 0.1]
    param_grid = list(itertools.product(gammas, epsilons, episodes_list, alphas))

    trainer = train_sarsa if strategy == "sarsa" else train_q_learning
    y_val_arr = np.asarray(y_val, dtype=float)

    best_rmse = float("inf")
    best_config = {}
    all_results = []

    for gamma, epsilon, episodes, alpha in param_grid:
        q_table = trainer(X_train, y_train, bins, gamma=gamma, epsilon=epsilon,
                          alpha=alpha, episodes=episodes)

        pred_val, _ = predict_with_q_table(X_val, q_table, bins)
        rmse = np.sqrt(mean_squared_error(y_val_arr, np.asarray(pred_val, dtype=float)))

        all_results.append({
            "solver": solver_name,
            "target": target,
            "strategy": strategy,
            "gamma": gamma,
            "epsilon": epsilon,
            "episodes": episodes,
            "alpha": alpha,
            "rmse": rmse
        })

        # Strict "<" keeps the first configuration on ties.
        if rmse < best_rmse:
            best_rmse = rmse
            best_config = {
                "gamma": gamma,
                "epsilon": epsilon,
                "episodes": episodes,
                "alpha": alpha
            }

    # Save tuning results
    df_tuning = pd.DataFrame(all_results)
    os.makedirs("./rl_tuning", exist_ok=True)
    df_tuning.to_csv(f"./rl_tuning/tune_{strategy}_{solver_name}_{target}.csv", index=False)

    return best_config

In [10]:
def run_rl_models(base_folder, strategies=("q_learning", "sarsa")):
    """Run ``rl_predict`` for every dataset folder found under ``base_folder``.

    Every directory below ``base_folder`` (at any depth) is inspected. A
    directory counts as a dataset when it contains at least one file ending in
    ``_train.csv``, one ending in ``_test.csv`` and one ending in ``_val.csv``.
    The directory name is used as the solver name.

    Parameters
    ----------
    base_folder : str
        Root directory to walk.
    strategies : iterable of str, default ("q_learning", "sarsa")
        Strategy names passed one by one to ``rl_predict``.

    Returns
    -------
    None
    """
    for root, dirs, _files in os.walk(base_folder):
        # Sorting in place makes both the traversal and the processing order
        # deterministic (os.walk honours in-place changes to ``dirs``).
        dirs.sort()
        for folder in dirs:
            folder_path = os.path.join(root, folder)
            # os.listdir order is arbitrary; sort so "first match" is stable.
            csv_files = sorted(os.listdir(folder_path))

            train_file = [f for f in csv_files if f.endswith("_train.csv")]
            test_file = [f for f in csv_files if f.endswith("_test.csv")]
            val_file = [f for f in csv_files if f.endswith("_val.csv")]

            if not (train_file and test_file and val_file):
                continue

            train_fp = os.path.join(folder_path, train_file[0])
            test_fp = os.path.join(folder_path, test_file[0])
            val_fp = os.path.join(folder_path, val_file[0])

            solver_name = folder
            for strategy in strategies:
                rl_predict(solver_name, train_fp, test_fp, val_fp, strategy=strategy)

In [None]:
# Entry point of the notebook: run every configured RL strategy on each
# dataset folder found below `base_folder`. A dataset folder is one that
# contains *_train.csv, *_test.csv and *_val.csv files.
base_folder = "./trainingData/final_td_min" #Specify path to training data
run_rl_models(base_folder)


[Reinforcement Learning - Q_LEARNING - Solver: or_min]
[Q-Learning] Episode 0: Total Reward = -1877.5997
Q[(np.float64(-1.08), np.float64(-0.49), np.float64(-0.5), np.float64(-0.11), np.float64(-0.43), np.float64(-0.32), np.float64(-0.5), np.float64(-0.5), np.float64(-0.5), np.float64(-0.28), np.float64(-0.43), np.float64(-0.32), np.float64(-0.5), np.float64(-0.5), np.float64(0.68), np.float64(-0.39), np.float64(0.21), np.float64(1.55), np.float64(0.84), np.float64(0.28), np.float64(-0.42), np.float64(0.69), np.float64(-0.76), np.float64(1.0))] = [-0.03927458  0.          0.          0.          0.        ]
[Q-Learning] Episode 10: Total Reward = -1869.5420
Q[(np.float64(-1.08), np.float64(-0.49), np.float64(-0.5), np.float64(-0.11), np.float64(-0.43), np.float64(-0.32), np.float64(-0.5), np.float64(-0.5), np.float64(-0.5), np.float64(-0.28), np.float64(-0.43), np.float64(-0.32), np.float64(-0.5), np.float64(-0.5), np.float64(0.68), np.float64(-0.39), np.float64(0.21), np.float64(1.55