In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import time
from scipy.stats import norm
import matplotlib.pyplot as plt

# Define the device (using GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def instance_generator(n):
    '''
    Function that generates hard Knapsack problem instances.
    Input:
        -n: desired size of set of items I, defaulted at 50,000 as we use this number in our study
    Returns:
        -v: array of values for all i items
        -w: array of weights of all i items
    
    ''' 
    v = np.round(norm.rvs(100, 10, size=n))
    w = np.zeros(n)
    for i in range(n):
        w[i] = round(norm.rvs(v[i], 5))
    return v, w

v, w = instance_generator(5)
# Problem size set-up
N = [10]

# Capacity constraint function based on the problem size
def W(n):
    return 0.45*np.sum(w[0:n])

capacity = W(N[0])
class NeuralNetworkAgent(nn.Module):
    def __init__(self, input_dim=5, hidden_dim=64, n_heads=4, n_layers=2, seq_len=1):
        super(NeuralNetworkAgent, self).__init__()

        # Fully connected layers for initial transformation
        self.fc1_1 = nn.Linear(input_dim, hidden_dim)
        self.fc1_2 = nn.Linear(hidden_dim, hidden_dim)

        # Transformer Encoder setup
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=n_heads, dim_feedforward=hidden_dim
        )
        self.transformer = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=n_layers)

        # Fully connected layers after transformer processing
        self.fc2_1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2_2 = nn.Linear(hidden_dim, 1)  # Output a single Q-value per row

        self.relu = nn.ReLU()

    def forward(self, x):
        # Initial transformation
        x = self.relu(self.fc1_1(x))
        x = self.relu(self.fc1_2(x))

        # Transformer expects input of shape (sequence length, batch size, embedding dimension)
        x = x.unsqueeze(1)  # Add sequence length dimension
        x = self.transformer(x)
        x = x.squeeze(1)  # Remove the sequence length dimension

        # Final transformation to output Q-values
        x = self.relu(self.fc2_1(x))
        qvalues = self.fc2_2(x).squeeze(-1)  # Output one Q-value per row

        return qvalues
class Knapsack_environment:
    def __init__(self, v=v, w=w, capacity=capacity):
        self.v = v
        self.w = w
        self.free = capacity
        self.filled = 0
        self.selected = np.zeros(len(v))
        self.reset(len(v))

    def reset(self, N_items):
        v_new, w_new = instance_generator(N_items)
        W_new = W(N_items)

        self.v = v_new
        self.w = w_new
        self.ratios = self.v / self.w
        self.free = np.full(N_items, W_new)
        self.canbeselected = np.zeros(N_items)
        return self.get_state()
    
    def get_state(self):
        return np.vstack([self.v, self.w, self.ratios, self.free, self.canbeselected]).T


    def step(self, action):
        #print(action)
        if np.all(self.canbeselected == 1):
            # change epi_finished
            return self.get_state(), 0,  True
        
        if self.canbeselected[action] == 1:
            
            return self.get_state(), -5, False
        
        self.canbeselected[action] = 1
        reward = self.v[action]
        self.free = self.free - np.full(len(self.v), self.w[action])

        
        self.canbeselected[self.w > self.free] = 1
        
        done = np.all(self.canbeselected == 1)
        return self.get_state(), reward, done
    
    
    
        

        

env = Knapsack_environment()



torch.set_printoptions(threshold=10000)

episodes = 100
batch_size = 64
gamma = 0.95  # Discount factor
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 10

env = Knapsack_environment()
agent = NeuralNetworkAgent().to(device)
optimizer = optim.Adam(agent.parameters(), lr=1e-4)

Future_agent = NeuralNetworkAgent().to(device)
Future_agent.load_state_dict(agent.state_dict())

batch_index = 0
history_state = []
history_next_state = []
history_action = []
history_reward = []
history_done = []

for episode in range(episodes):
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-1. * episode / epsilon_decay)
    N_items = 100
    
    epi_finished = False
    total_reward = 0

    state_matrix = env.reset(N_items)
    state = torch.FloatTensor(state_matrix).to(device)

    while not epi_finished:
        available_items = np.where(state_matrix[:,4] == 0)[0]
        if random.random() > epsilon:
            with torch.no_grad():
                q_values = agent.forward(state)
                action = q_values.argmax().item()
        else:
            action = random.choice(available_items)
        
        next_state, reward, epi_finished = env.step(action)
        next_state_matrix = next_state
        next_state = torch.FloatTensor(next_state_matrix).to(device)
        
        history_state.append(state)
        history_next_state.append(next_state)
        history_action.append(action)
        history_reward.append(reward)
        history_done.append(float(epi_finished))
        
        batch_index += 1
        state = next_state
        state_matrix = next_state_matrix

        if batch_index >= batch_size:
            q_valuestates = []
            q_values_next_states = []

            # Manually calculate q_values and next_q_values using your approach
            for q_index in range(batch_index - batch_size, batch_index):
                q_value = agent(history_state[q_index])[history_action[q_index]]
                next_q_value = Future_agent(history_next_state[q_index]).max()

                q_valuestates.append(q_value)
                q_values_next_states.append(next_q_value)

            q_valuestates = torch.stack(q_valuestates).to(device)
            q_values_next_states = torch.stack(q_values_next_states).to(device)

            rewards = torch.tensor(history_reward[-batch_size:], dtype=torch.float32).to(device)
            dones = torch.tensor(history_done[-batch_size:], dtype=torch.float32).to(device)

            # Calculate the target using next_q_values
            with torch.no_grad():
                targets = rewards + (1.0 - dones) * gamma * q_values_next_states

            targets = targets.float()
            loss = nn.MSELoss()(q_valuestates, targets)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(agent.parameters(), max_norm=0.1)
            optimizer.step()

            print(f"Episode {episode} completed. Loss: {loss.item()}")
    
    #print(episode)












  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


Episode 29 completed. Loss: 8804.35546875
Episode 29 completed. Loss: 8792.205078125
Episode 30 completed. Loss: 8750.978515625
Episode 30 completed. Loss: 8772.1875
Episode 31 completed. Loss: 8843.6337890625
Episode 31 completed. Loss: 8687.078125
Episode 31 completed. Loss: 8613.693359375
Episode 32 completed. Loss: 8647.7919921875
Episode 32 completed. Loss: 8703.396484375
Episode 33 completed. Loss: 8718.951171875
Episode 33 completed. Loss: 8712.625
Episode 34 completed. Loss: 8714.34375
Episode 34 completed. Loss: 8741.298828125
Episode 35 completed. Loss: 8740.2890625
Episode 35 completed. Loss: 8667.9658203125
Episode 36 completed. Loss: 8702.33984375
Episode 36 completed. Loss: 8541.302734375
Episode 36 completed. Loss: 8552.7802734375
Episode 37 completed. Loss: 8657.1025390625
Episode 37 completed. Loss: 8496.9072265625
Episode 37 completed. Loss: 8450.4716796875
Episode 38 completed. Loss: 8449.732421875
Episode 38 completed. Loss: 8464.67578125
Episode 39 completed. Loss:

GOING FURTHER WITH TRAINING THE SAME AGENT


In [2]:
optimizer.param_groups[0]['lr'] = 1e-3
episodes = 50
batch_size = 256
gamma = 0.95  # Discount factor
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 10

for episode in range(episodes):
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-1. * episode / epsilon_decay)
    N_items = 100
    
    epi_finished = False
    total_reward = 0

    state_matrix = env.reset(N_items)
    state = torch.FloatTensor(state_matrix).to(device)

    while not epi_finished:
        available_items = np.where(state_matrix[:,4] == 0)[0]
        if random.random() > epsilon:
            with torch.no_grad():
                q_values = agent.forward(state)
                action = q_values.argmax().item()
        else:
            action = random.choice(available_items)
        
        next_state, reward, epi_finished = env.step(action)
        next_state_matrix = next_state
        next_state = torch.FloatTensor(next_state_matrix).to(device)
        
        history_state.append(state)
        history_next_state.append(next_state)
        history_action.append(action)
        history_reward.append(reward)
        history_done.append(float(epi_finished))
        
        batch_index += 1
        state = next_state
        state_matrix = next_state_matrix

        if batch_index >= batch_size:
            q_valuestates = []
            q_values_next_states = []

            # Manually calculate q_values and next_q_values using your approach
            for q_index in range(batch_index - batch_size, batch_index):
                q_value = agent(history_state[q_index])[history_action[q_index]]
                next_q_value = Future_agent(history_next_state[q_index]).max()

                q_valuestates.append(q_value)
                q_values_next_states.append(next_q_value)

            q_valuestates = torch.stack(q_valuestates).to(device)
            q_values_next_states = torch.stack(q_values_next_states).to(device)

            rewards = torch.tensor(history_reward[-batch_size:], dtype=torch.float32).to(device)
            dones = torch.tensor(history_done[-batch_size:], dtype=torch.float32).to(device)

            # Calculate the target using next_q_values
            with torch.no_grad():
                targets = rewards + (1.0 - dones) * gamma * q_values_next_states

            targets = targets.float()
            loss = nn.MSELoss()(q_valuestates, targets)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(agent.parameters(), max_norm=0.1)
            optimizer.step()

            print(f"Episode {episode} completed. Loss: {loss.item()}")
    
    #print(episode)


Episode 22 completed. Loss: 8199.025390625
Episode 22 completed. Loss: 8154.6083984375
Episode 23 completed. Loss: 8113.1435546875
Episode 23 completed. Loss: 8076.8544921875
Episode 24 completed. Loss: 8046.501953125
Episode 24 completed. Loss: 8006.2373046875
Episode 25 completed. Loss: 7968.662109375
Episode 25 completed. Loss: 7918.8154296875
Episode 26 completed. Loss: 7886.767578125
Episode 26 completed. Loss: 7837.5888671875
Episode 27 completed. Loss: 7800.60107421875
Episode 27 completed. Loss: 7756.1962890625
Episode 28 completed. Loss: 7715.90283203125
Episode 28 completed. Loss: 7659.564453125
Episode 29 completed. Loss: 7611.90087890625
Episode 29 completed. Loss: 7546.068359375
Episode 30 completed. Loss: 7470.34423828125
Episode 30 completed. Loss: 7421.7099609375
Episode 31 completed. Loss: 7357.3544921875
Episode 31 completed. Loss: 7285.271484375
Episode 32 completed. Loss: 7203.861328125
Episode 32 completed. Loss: 7139.8349609375
Episode 33 completed. Loss: 7065.214

In [3]:
# 1. Greedy Heuristic Function
def greedy_knapsack(env):
    """
    A simple greedy heuristic that selects items based on the value-to-weight ratio.
    It selects the item with the highest value-to-weight ratio that fits in the remaining capacity.
    
    Args:
        env: The Knapsack_environment object.
    
    Returns:
        total_value: The total value collected by the greedy heuristic.
    """
    state_matrix = env.get_state()
    total_value = 0
    remaining_capacity = env.free[0]  # Ensure you are working with the current knapsack capacity
    
    # Compute value-to-weight ratio for each item
    value_to_weight_ratios = env.v / env.w
    
    # Sort items by value-to-weight ratio in descending order
    sorted_items = np.argsort(-value_to_weight_ratios)
    
    for item in sorted_items:
        # Check if the item fits in the remaining capacity
        if env.w[item] <= remaining_capacity:
            total_value += env.v[item]
            remaining_capacity -= env.w[item]
    
    return total_value
def dynamic_programming_knapsack(env):
    """
    A dynamic programming solution for the knapsack problem.
    
    Args:
        env: The Knapsack_environment object.
    
    Returns:
        total_value: The total value collected by the dynamic programming approach.
    """
    v, w = env.v, env.w
    capacity = int(env.free[0])  # The knapsack capacity

    # Initialize DP table
    n = len(v)
    dp = np.zeros((n + 1, capacity + 1))

    # Build DP table
    for i in range(1, n + 1):
        for c in range(capacity + 1):
            if w[i - 1] <= c:
                dp[i][c] = max(dp[i - 1][c], dp[i - 1][c - int(w[i - 1])] + v[i - 1])
            else:
                dp[i][c] = dp[i - 1][c]

    # The result is in dp[n][capacity]
    total_value = dp[n][capacity]
    return total_value


In [4]:
import time
import numpy as np
import torch

# Timing utility
def get_execution_time(func, env):
    start_time = time.perf_counter()  # Start the timer
    result = func(env)
    end_time = time.perf_counter()  # End the timer
    return result, end_time - start_time  # Return both the result and the time taken

def compare_agent_vs_greedy_dp_multiple(agent, env, N_items, runs=100):
    """
    Compares the trained agent's performance against the greedy heuristic and dynamic programming method over multiple runs,
    including timing the execution of each method.
    
    Args:
        agent: The trained neural network agent.
        env: The Knapsack_environment object.
        N_items: The number of items in the knapsack problem.
        runs: The number of times to run the comparison.
    
    Returns:
        agent_rewards: A list of total rewards collected by the agent over all runs.
        greedy_rewards: A list of total rewards collected by the greedy heuristic over all runs.
        dp_rewards: A list of total rewards collected by the dynamic programming approach over all runs.
        agent_times: A list of execution times for the agent over all runs.
        greedy_times: A list of execution times for the greedy heuristic over all runs.
        dp_times: A list of execution times for the dynamic programming approach over all runs.
    """
    agent_rewards = []
    greedy_rewards = []
    dp_rewards = []
    agent_times = []
    greedy_times = []
    dp_times = []

    for _ in range(runs):
        # Run and time the agent
        env.reset(N_items)
        start_time = time.perf_counter()  # Start the timer for the agent
        state_matrix = env.reset(N_items)
        state = torch.FloatTensor(state_matrix).to(device)

        agent_reward = 0
        epi_finished = False
        while not epi_finished:
            with torch.no_grad():
                q_values = agent(state)
                action = q_values.argmax().item()
            next_state_matrix, reward, epi_finished = env.step(action)
            agent_reward += reward
            state = torch.FloatTensor(next_state_matrix).to(device)
        
        end_time = time.perf_counter()  # End the timer for the agent
        agent_time = end_time - start_time

        # Reset the environment for the greedy heuristic and dynamic programming
        env.reset(N_items)

        # Run and time the greedy heuristic
        greedy_reward, greedy_time = get_execution_time(greedy_knapsack, env)

        # Run and time the dynamic programming solution
        dp_reward, dp_time = get_execution_time(dynamic_programming_knapsack, env)

        # Collect rewards and times
        agent_rewards.append(agent_reward)
        greedy_rewards.append(greedy_reward)
        dp_rewards.append(dp_reward)
        agent_times.append(agent_time)
        greedy_times.append(greedy_time)
        dp_times.append(dp_time)

    return agent_rewards, greedy_rewards, dp_rewards, agent_times, greedy_times, dp_times


# Parameters
N_items = 100
runs = 100  # Set the number of runs for comparison

# Compare agent, greedy, and dynamic programming over multiple runs
agent_rewards, greedy_rewards, dp_rewards, agent_times, greedy_times, dp_times = compare_agent_vs_greedy_dp_multiple(agent, env, N_items, runs)

# Calculate and print average rewards
avg_agent_reward = np.mean(agent_rewards)
avg_greedy_reward = np.mean(greedy_rewards)
avg_dp_reward = np.mean(dp_rewards)

# Calculate and print average execution times
avg_agent_time = np.mean(agent_times)
avg_greedy_time = np.mean(greedy_times)
avg_dp_time = np.mean(dp_times)

print(f"Average reward of the agent over {runs} runs: {avg_agent_reward}")
print(f"Average reward of the greedy heuristic over {runs} runs: {avg_greedy_reward}")
print(f"Average reward of the dynamic programming method over {runs} runs: {avg_dp_reward}")
print(f"Average execution time of the agent over {runs} runs: {avg_agent_time:.10f} seconds")
print(f"Average execution time of the greedy heuristic over {runs} runs: {avg_greedy_time:.10f} seconds")
print(f"Average execution time of the dynamic programming method over {runs} runs: {avg_dp_time:.10f} seconds")


Average reward of the agent over 100 runs: 196.04
Average reward of the greedy heuristic over 100 runs: 195.28
Average reward of the dynamic programming method over 100 runs: 232.03
Average execution time of the agent over 100 runs: 0.0064553930 seconds
Average execution time of the greedy heuristic over 100 runs: 0.0000359930 seconds
Average execution time of the dynamic programming method over 100 runs: 0.0132765350 seconds
