In [2]:
#Imports
import time
import random

import numpy as np
from scipy.stats import norm
import gurobipy as gb
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

#Set seed for reproducibility of instances
np.random.seed(25)

print_selections = False

def instance_generator(n=50000):
    '''
    Function that generates hard Knapsack problem instances.
    Input:
        -n: desired size of set of items I, defaulted at 50,000 as we use this number in our study
    Returns:
        -v: array of values for all i items
        -w: array of weights of all i items
    ''' 
    v = np.round(norm.rvs(100, 10, size=n))
    w = np.zeros(n)
    for i in range(n):
        w[i] = round(norm.rvs(v[i], 5))
    return v, w

v, w = instance_generator()

# Problem size set-up
N = [10, 25, 50, 100, 250, 500, 750, 1000, 1500]

# Capacity constraint function based on the problem size
def W(n):
    return round(0.45*np.sum(w[0:n]))

# Binary problem
def binary_method(n, v=v, w=w):
    '''
    Function that runs the Gurobi-binary problem.
    Input:
        -n: Input problem size
    Output:
        -obj_val: Outcome of the optimizatoin
        -running_time: Time to run the algorithm on the problem size
    '''
    # Selecting relevant i for problem size
    W_gb = W(n)

    start_time = time.time()

    m = gb.Model("Binary model")
    x = m.addVars(n, vtype=gb.GRB.BINARY, name="x")

    m.setObjective(gb.quicksum(v[i]*x[i] for i in range(n)), gb.GRB.MAXIMIZE)

    m.addConstr(gb.quicksum(w[i]*x[i] for i in range(n)) <= W_gb)

    m.update()
    m.Params.LogToConsole = 0
    m.optimize()
    
    obj_val = m.objVal
    running_time = time.time() - start_time
    selected_items = [i for i in range(n) if x[i].X == 1] 
    return obj_val, running_time, selected_items

optimal_solution_binary = [binary_method(n)[0] for n in N]
running_time_binary = [binary_method(n)[1] for n in N]
print(f"For n = {N}:")
print(f"Gurobi model optimal solution: {optimal_solution_binary}")
print(f"Gurobi model running times: {running_time_binary}")

if print_selections: 
    print(f"Gurobi model selected items:{[binary_method(n)[2] for n in N]}")

# Dynamic programming
def dyn_prog_method(n, v=v, w=w):
    
    # Selecting relevant i for problem size, and calculate knapsack capacity
    W_dp = int(W(n))

    # Start runtime measurement
    start_time = time.perf_counter()

    # Create table for bottom up dynamic programming
    OPT_table = [[0 for i in range(W_dp+1)] for i in range(n+1)]
    
    # Dynamic programming algorithm
    for i in range(1, n+1):
        for j in range(1, W_dp+1):
            if w[i-1] <= j:
                OPT_table[i][j] = max(OPT_table[i-1][j], v[i-1]+ OPT_table[i-1][int(j-w[i-1])])
            else:
                OPT_table[i][j] = OPT_table[i-1][j]

    # Backtrack to find the selected items
    selected_items = []
    j = W_dp
    for i in range(n, 0, -1):
        if OPT_table[i][j] != OPT_table[i - 1][j]:  # Item i-1 is included
            selected_items.append(i - 1)  # Add the index of the item
            j -= int(w[i - 1])  # Reduce the remaining capacity
    selected_items.reverse()  # Reverse the list to get the correct order

    # End runtime measurement
    running_time = time.perf_counter() - start_time

    # Return the value in the knapsack and the running time
    return OPT_table[n][W_dp], running_time, selected_items

optimal_solution_dp, running_time_dp, selected_items_dp = [], [], []
print("Warning: total DP analysis can be lengthy (around 2 minutes)")
for n in N:
    DP_sol = dyn_prog_method(n)
    optimal_solution_dp.append(DP_sol[0])
    running_time_dp.append(DP_sol[1])
    selected_items_dp.append(DP_sol[2])
    print(f"DP case N={n} analysed!")

print(f"For n = {N}:")
print(f"DP optimal solution: {optimal_solution_dp}")
print(f"DP running times: {running_time_dp}")
if print_selections:
    print(f"DP selected items: {selected_items_dp}")

# Greedy Hueristic
def greedy_heuristic(n, v=v, w=w):
    
    # Selecting relevant i for problem size, and calculate knapsack capacity
    W_gy = W(n)

    # Start runtime measurement
    start_time = time.perf_counter()

    # Calculate ratios
    ratios = [(v[i]/w[i], v[i], w[i]) for i in range(n) ]
    ratios.sort(reverse=True)

    # initialize empty knapsack
    weight_in_knapsack = 0
    value_in_knapsack = 0

    # fill up iteratively over items in ratio order
    for _, value, weight in ratios:
        if weight_in_knapsack + weight <= W_gy:
            value_in_knapsack += value
            weight_in_knapsack += weight

    # End runtime measurement
    running_time = time.perf_counter() - start_time

    # Return the value in the knapsack and the running time
    return value_in_knapsack, running_time

optimal_solution_gh = [greedy_heuristic(n)[0] for n in N]
running_time_gh = [greedy_heuristic(n)[1] for n in N]
print(f"For n = {N}:")
print(f"GH optimal solution: {optimal_solution_gh}")
print(f"GH running times: {running_time_gh}")  

For n = [10, 25, 50, 100, 250, 500, 750, 1000, 1500]:
Gurobi model optimal solution: [453.0, 1173.0, 2337.0, 4679.0, 11673.0, 23493.0, 35247.0, 47151.0, 70656.0]
Gurobi model running times: [0.0009717941284179688, 0.0020089149475097656, 0.003991603851318359, 0.015952110290527344, 0.2173154354095459, 0.03556561470031738, 0.047983646392822266, 0.07061147689819336, 0.3459641933441162]
DP case N=10 analysed!
DP case N=25 analysed!
DP case N=50 analysed!
DP case N=100 analysed!
DP case N=250 analysed!
DP case N=500 analysed!
DP case N=750 analysed!
DP case N=1000 analysed!
DP case N=1500 analysed!
For n = [10, 25, 50, 100, 250, 500, 750, 1000, 1500]:
DP optimal solution: [453.0, 1173.0, 2337.0, 4679.0, 11673.0, 23494.0, 35247.0, 47152.0, 70661.0]
DP running times: [0.001750499999616295, 0.011972600001172395, 0.060263200000918005, 0.21901449999859324, 1.3958136000001105, 5.6041562000027625, 13.154577500001324, 22.928197599998384, 51.19433579999895]
For n = [10, 25, 50, 100, 250, 500, 750, 10

In [115]:
# RL Agent



#Initial training values for capacity and size of item set
max_items = 150
v, w = instance_generator(max_items)
capacity = W(max_items)

# Define the device (using GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class NeuralNetworkAgent(nn.Module):
    def __init__(self, input_dim=5, hidden_dim=64, n_heads=4, n_layers=2, seq_len=1):
        super(NeuralNetworkAgent, self).__init__()

        # Fully connected layers for initial transformation
        self.fc1_1 = nn.Linear(input_dim, hidden_dim)
        self.fc1_2 = nn.Linear(hidden_dim, hidden_dim)

        # Transformer Encoder setup
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=n_heads, dim_feedforward=hidden_dim
        )
        self.transformer = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=n_layers)

        # Fully connected layers after transformer processing
        self.fc2_1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2_2 = nn.Linear(hidden_dim, 1)  # Output a single Q-value per row

        self.relu = nn.ReLU()

    def forward(self, x):
        # Initial transformation
        x = self.relu(self.fc1_1(x))
        x = self.relu(self.fc1_2(x))

        # Transformer expects input of shape (sequence length, batch size, embedding dimension)
        x = x.unsqueeze(1)  # Add sequence length dimension
        x = self.transformer(x)
        x = x.squeeze(1)  # Remove the sequence length dimension

        # Final transformation to output Q-values
        x = self.relu(self.fc2_1(x))
        qvalues = self.fc2_2(x).squeeze(-1)  # Output one Q-value per row

        return qvalues

# Knapsack environment created through class
class Knapsack_environment:
    def __init__(self, v=v, w=w, capacity=capacity, max_items=max_items, new_input=True):
        self.v = v # concatenate allows for later adaptation of amount of training set items
        self.w = w
        self.capacity = capacity
        self.free = np.full(len(v), capacity)
        print(self.free.shape)
        self.selected = np.zeros(len(v))
        self.reset(len(v), new_input = new_input)

    # Re-set knapsack to empty for new problem
    def reset(self, N_items, new_input=True):
        
        if new_input:
            self.v, self.w = instance_generator(N_items)
            self.capacity = W(N_items)

        '''
        self.v = np.concatenate((v_new, np.zeros(max_items-N_items)))
        self.w = np.concatenate((w_new, np.ones(max_items-N_items)))
        print(w_new.shape)
        print(w_new.shape)
        self.ratios =  self.v / self.w
        
        sorted_indices = np.argsort(self.ratios)[::-1]
        self.v = self.v[sorted_indices]
        self.w = self.w[sorted_indices]
        self.ratios = self.ratios[sorted_indices]  # Update ratios after sorting

        self.free =  np.concatenate((np.full(N_items, W_new), np.zeros(max_items-N_items)))
        self.preselected =  np.concatenate((np.zeros(N_items), np.ones(max_items-N_items)))
        '''

        self.ratios = self.v / self.w
        self.free = np.full(N_items, self.capacity)
        self.preselected = np.zeros(N_items)
        self.preselected[self.w > self.free] = 1 # add item to list of pre-selected items
        return self.get_state()
    
    def get_state(self):
        return np.vstack([self.v, self.w, self.ratios, self.free, self.preselected]).T

    def step(self, action):
        if np.all(self.preselected == 1):
            return self.get_state(), 0,  True
        
        if self.preselected[action] == 1: 
            return self.get_state(), -5, False # wanting to include a pre-selected items is penalized
        
        self.preselected[action] = 1
        reward = self.v[action] # add reward as value of addd item
        self.free = self.free - np.full(len(self.v), self.w[action]) # subtract item weight from leftover capacity
        self.preselected[self.w > self.free] = 1 # add item to list of pre-selected items
        
        done = np.all(self.preselected == 1) # termination upon having inspected all items
        return self.get_state(), reward, done


train = True # CHANGE IF YOU WANT TO RE-TRAIN MODEL
model_save_path = "trained_knapsack_agent.pth"

# Training method variables
if train:
    episodes = 20
    batch_size = 64
    gamma = 0.95  # Discount factor
    epsilon_start = 1.0
    epsilon_end = 0.01
    epsilon_decay = 10

    env = Knapsack_environment()
    agent = NeuralNetworkAgent().to(device)
    optimizer = optim.Adam(agent.parameters(), lr=1e-3)

    # Save trained agent
    future_agent = NeuralNetworkAgent().to(device)
    future_agent.load_state_dict(agent.state_dict())

    batch_index = 0
    history_state = []
    history_next_state = []
    history_action = []
    history_reward = []
    history_done = []

    # training episodes
    for episode in range(episodes):
        epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-1. * episode / epsilon_decay)
        N_items = 30 # Amount of items included in training
        
        # Initalise termination status and zero reward
        ep_finished = False
        total_reward = 0

        # Get a new state
        state_matrix = env.reset(N_items)
        state = torch.FloatTensor(state_matrix).to(device)

        while not ep_finished: 
            # Epsilon greedy policy
            if random.random() > epsilon:
                with torch.no_grad():
                    q_values = agent.forward(state)
                    action = q_values.argmax().item()
            else:
                available_items = np.where(state_matrix[:,4] == 0)[0] #only consider selectable items
                action = random.choice(available_items)
            
            next_state, reward, ep_finished = env.step(action)
            next_state_matrix = next_state
            next_state = torch.FloatTensor(next_state_matrix).to(device)
            
            # Add step in episode to history
            history_state.append(state)
            history_next_state.append(next_state)
            history_action.append(action)
            history_reward.append(reward)
            history_done.append(float(ep_finished))
            
            # Update to inspect next step
            batch_index += 1
            state = next_state
            state_matrix = next_state_matrix

            if batch_index >= batch_size:
                q_valuestates = []
                q_values_next_states = []

                # Manually calculate q_values and next_q_values using your approach
                for q_index in range(batch_index - batch_size, batch_index):
                    q_value = agent(history_state[q_index])[history_action[q_index]]
                    next_q_value = future_agent(history_next_state[q_index]).max() # find optimal q-value
                    q_valuestates.append(q_value)
                    q_values_next_states.append(next_q_value)

                q_valuestates = torch.stack(q_valuestates).to(device)
                q_values_next_states = torch.stack(q_values_next_states).to(device)

                rewards = torch.tensor(history_reward[-batch_size:], dtype=torch.float32).to(device)
                dones = torch.tensor(history_done[-batch_size:], dtype=torch.float32).to(device)

                # Calculate the target using next_q_values
                with torch.no_grad():
                    targets = rewards + (1.0 - dones) * gamma * q_values_next_states # Bellman equation

                targets = targets.float()
                loss = nn.MSELoss()(q_valuestates, targets)

                # Update neural network parameters
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(agent.parameters(), max_norm=0.1)
                optimizer.step()

                print(f"Episode {episode} completed. Loss: {loss.item()}")

    # Store agent locally so it won't need to be trained again
    torch.save(agent.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

else:
    env = Knapsack_environment()
    agent = NeuralNetworkAgent().to(device)
    agent.load_state_dict(torch.load(model_save_path))
    agent.eval()
    print(f"Model loaded from {model_save_path}")

(150,)




Episode 4 completed. Loss: 9550.833984375
Episode 4 completed. Loss: 9515.30859375
Episode 4 completed. Loss: 9453.5390625
Episode 4 completed. Loss: 9356.603515625
Episode 4 completed. Loss: 9310.3603515625
Episode 5 completed. Loss: 9235.6513671875
Episode 5 completed. Loss: 9139.126953125
Episode 5 completed. Loss: 9094.5947265625
Episode 5 completed. Loss: 9042.69921875
Episode 5 completed. Loss: 8940.0556640625
Episode 5 completed. Loss: 8856.6376953125
Episode 5 completed. Loss: 8749.71875
Episode 5 completed. Loss: 8548.228515625
Episode 5 completed. Loss: 8354.5576171875
Episode 5 completed. Loss: 8398.2275390625
Episode 5 completed. Loss: 8309.4306640625
Episode 5 completed. Loss: 8298.494140625
Episode 5 completed. Loss: 8247.9248046875
Episode 5 completed. Loss: 8247.9375
Episode 5 completed. Loss: 8241.17578125
Episode 6 completed. Loss: 8181.1376953125
Episode 6 completed. Loss: 8191.525390625
Episode 6 completed. Loss: 8120.73046875
Episode 6 completed. Loss: 8130.1611328

In [48]:
optimizer.param_groups[0]['lr'] = 1e-4

# Training method variables
if train:
    episodes = 20
    batch_size = 128
    gamma = 0.95  # Discount factor
    epsilon_start = 1.0
    epsilon_end = 0.01
    epsilon_decay = 10

    # training episodes
    for episode in range(episodes):
        epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-1. * episode / epsilon_decay)
        N_items = 20 # Amount of items included in training
        
        # Initalise termination status and zero reward
        ep_finished = False
        total_reward = 0

        # Get a new state
        state_matrix = env.reset(N_items)
        state = torch.FloatTensor(state_matrix).to(device)

        while not ep_finished: 
            # Epsilon greedy policy
            if random.random() > epsilon:
                with torch.no_grad():
                    q_values = agent.forward(state)
                    action = q_values.argmax().item()
            else:
                available_items = np.where(state_matrix[:,4] == 0)[0] #only consider selectable items
                action = random.choice(available_items)
            
            next_state, reward, ep_finished = env.step(action)
            next_state_matrix = next_state
            next_state = torch.FloatTensor(next_state_matrix).to(device)
            
            # Add step in episode to history
            history_state.append(state)
            history_next_state.append(next_state)
            history_action.append(action)
            history_reward.append(reward)
            history_done.append(float(ep_finished))
            
            # Update to inspect next step
            batch_index += 1
            state = next_state
            state_matrix = next_state_matrix

            if batch_index >= batch_size:
                q_valuestates = []
                q_values_next_states = []

                # Manually calculate q_values and next_q_values using your approach
                for q_index in range(batch_index - batch_size, batch_index):
                    q_value = agent(history_state[q_index])[history_action[q_index]]
                    next_q_value = future_agent(history_next_state[q_index]).max() # find optimal q-value
                    q_valuestates.append(q_value)
                    q_values_next_states.append(next_q_value)

                q_valuestates = torch.stack(q_valuestates).to(device)
                q_values_next_states = torch.stack(q_values_next_states).to(device)

                rewards = torch.tensor(history_reward[-batch_size:], dtype=torch.float32).to(device)
                dones = torch.tensor(history_done[-batch_size:], dtype=torch.float32).to(device)

                # Calculate the target using next_q_values
                with torch.no_grad():
                    targets = rewards + (1.0 - dones) * gamma * q_values_next_states # Bellman equation

                targets = targets.float()
                loss = nn.MSELoss()(q_valuestates, targets)

                # Update neural network parameters
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(agent.parameters(), max_norm=0.1)
                optimizer.step()

                print(f"Episode {episode} completed. Loss: {loss.item()}")

    # Store agent locally so it won't need to be trained again
    torch.save(agent.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")

else:
    env = Knapsack_environment()
    agent = NeuralNetworkAgent().to(device)
    agent.load_state_dict(torch.load(model_save_path))
    agent.eval()
    print(f"Model loaded from {model_save_path}")

Episode 0 completed. Loss: 1519.0625
Episode 0 completed. Loss: 1536.22802734375
Episode 0 completed. Loss: 1516.185791015625
Episode 0 completed. Loss: 1542.08203125
Episode 0 completed. Loss: 1553.28759765625


KeyboardInterrupt: 

In [114]:
def knapsackSolver(n, W, v, w, return_all=False):
    env = Knapsack_environment(v=v[:n], w=w[:n], capacity=W, new_input=False) # Create appropriate knapsack environment
    agent = NeuralNetworkAgent().to(device) # Initialise empty agent
    agent.load_state_dict(torch.load("trained_knapsack_agent.pth")) # Load pre-trained model into it
    agent.eval() # Set to evaluation mode

    state_matrix = env.reset(n, new_input=False)
    state = torch.FloatTensor(state_matrix).to(device)

    selected_items = []
    total_value = 0
    start_time = time.perf_counter()

    while True:
        with torch.no_grad():
            q_values = agent.forward(state)
            action = q_values.argmax().item() # Choose action with highest Q-value

        next_state, reward, done = env.step(action)
        # update knapsack entries when item is selected
        if reward >= 0:
            selected_items.append(action)
            total_value += reward
        
        state_matrix = next_state
        state = torch.FloatTensor(state_matrix).to(device)
        
        if done:
            break # Terminate when all items inspected

    running_time = time.perf_counter() - start_time
    
    if return_all:
        return selected_items, total_value, running_time
    
    return selected_items

knapsackSolver(150, W(150), v, w)

(150,)
131
131


  agent.load_state_dict(torch.load("trained_knapsack_agent.pth")) # Load pre-trained model into it


[131]