## Test Section : FAILED

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm


#### Encoder

In [4]:
class TaskEncoder(nn.Module):
    def __init__(self, embedding_dim=128):
        super().__init__()
        self.ff_layer = nn.Linear(2, embedding_dim)
        self.gru_encoder = nn.GRU(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)

    def forward(self, input_coords):
        embedded = self.ff_layer(input_coords)
        encoder_outputs, hidden = self.gru_encoder(embedded)
        return embedded, encoder_outputs, hidden
    


#### Example usage of Encoder

In [None]:
# Task point coordinates: 4 tasks, each with (x, y)
input_coords = torch.tensor([
    [2.0, 3.0],
    [5.0, 1.0],
    [6.0, 4.0],
    [1.0, 2.0]
], dtype=torch.float32).unsqueeze(0)  # shape: (1, 4, 2) → batch_size = 1

# Example usage:
encoder = TaskEncoder(embedding_dim=128)
embedded, encoder_outputs, hidden2 = encoder(input_coords)
print("Embedded shape     :", embedded.shape)
print("Encoder output shape:", encoder_outputs.shape)
print("Hidden state shape :", hidden2.shape)


#### Attention

In [5]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.W1 = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.W2 = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs, mask):
        # decoder_hidden: (1, batch, hidden_dim) → (batch, 1, hidden_dim)
        decoder_hidden = decoder_hidden.transpose(0, 1)
        # encoder_outputs: (batch, seq_len, hidden_dim)
        score = self.v(torch.tanh(self.W1(encoder_outputs) + self.W2(decoder_hidden)))  # (batch, seq, 1)
        score = score.squeeze(-1)  # (batch, seq)
        score[mask == 0] = -1e9   # Mask out already visited tasks
        attn_weights = F.softmax(score, dim=-1)
        return attn_weights

#### Decoder

In [6]:
class Decoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super().__init__()
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.attention = Attention(hidden_dim)

    def forward(self, decoder_input, hidden, encoder_outputs, mask):
        output, hidden = self.gru(decoder_input, hidden)  # output: (batch, 1, hidden_dim)
        attn_weights = self.attention(hidden, encoder_outputs, mask)  # (batch, seq_len)
        return attn_weights, hidden

#### Example Usage of Decoder and Attention

In [None]:
batch_size = 1
seq_len = 4
embedding_dim = hidden_dim = 128

decoder_input = torch.randn(batch_size, 1, embedding_dim)      # initial SOS input
encoder_outputs = torch.randn(batch_size, seq_len, hidden_dim) # output of encoder
hidden = torch.randn(1, batch_size, hidden_dim)                # initial decoder hidden
mask = torch.ones(batch_size, seq_len)                         # all tasks unvisited

decoder = Decoder(embedding_dim, hidden_dim)
attn_weights, next_hidden = decoder(decoder_input, hidden, encoder_outputs, mask)

print("Attention Weights:", attn_weights)
print("Next Hidden State:", next_hidden.shape)


#### Critic Network

In [7]:
# --- Critic Network ---
class Critic(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, encoder_outputs):
        # Mean pooling
        pooled = encoder_outputs.mean(dim=1)
        return self.fc(pooled).squeeze(-1)


#### Full Actor Loop Testing

In [None]:
# Parameters
batch_size = 1
seq_len = 6                 # number of tasks
embedding_dim = hidden_dim = 128



# Create random task coordinates within (0, 100)
input_coords = torch.rand(batch_size, seq_len, 2) * 100
# Mask to track visited tasks
mask = torch.ones(batch_size, seq_len)           # 1 = not yet visited
# Initial decoder input: learnable start token
decoder_input = torch.randn(batch_size, 1, embedding_dim)

# Initialize encoder
encoder = TaskEncoder(embedding_dim=embedding_dim)
# Decoder setup (reuse Decoder class from earlier)
decoder = Decoder(embedding_dim, hidden_dim)

# Pass through encoder
embedded, encoder_outputs, encoder_hidden = encoder(input_coords)
# Initialize decoder hidden state
hidden = encoder_hidden


# Store selected task indices
selected_indices = []

print("Input Coords:", input_coords)

# Loop over each decoding step (select 1 task per step)
for step in range(seq_len):
    attn_weights, hidden = decoder(decoder_input, hidden, encoder_outputs, mask)

    # Greedily pick the highest attention score (unvisited task)
    selected = torch.argmax(attn_weights, dim=1).item()
    selected_indices.append(selected)

    # Update mask to mark this task as visited
    mask[0, selected] = 0

    # Get the embedding of the selected task from encoder outputs as next input
    decoder_input = encoder_outputs[0, selected].unsqueeze(0).unsqueeze(1)  # shape: (1, 1, 128)

    #print attention weights for this step
    print(f"Step {step + 1} - Selected Task Index: {selected}, Attention Weights: {attn_weights}")

# Final visiting sequence
print("Generated task visiting order:", selected_indices)

### **Full Code of DRL Training**

In [39]:

# ----- Encoder -----
class TaskEncoder(nn.Module):
    def __init__(self, embedding_dim=128):
        super().__init__()
        self.ff_layer = nn.Linear(2, embedding_dim)
        self.gru_encoder = nn.GRU(input_size=embedding_dim, hidden_size=embedding_dim, batch_first=True)

    def forward(self, input_coords):
        embedded = self.ff_layer(input_coords)
        encoder_outputs, hidden = self.gru_encoder(embedded)
        return embedded, encoder_outputs, hidden

# ----- Attention -----
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.W1 = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.W2 = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs, mask):
        decoder_hidden = decoder_hidden.transpose(0, 1)  # (1, B, H) -> (B, 1, H)
        score = self.v(torch.tanh(self.W1(encoder_outputs) + self.W2(decoder_hidden)))  # (B, S, 1)
        score = score.squeeze(-1)
        score[mask == 0] = -1e9  # Mask visited
        attn_weights = F.softmax(score, dim=-1)
        return attn_weights

# ----- Decoder -----
class Decoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super().__init__()
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.attention = Attention(hidden_dim)

    def forward(self, decoder_input, hidden, encoder_outputs, mask):
        output, hidden = self.gru(decoder_input, hidden)
        attn_weights = self.attention(hidden, encoder_outputs, mask)
        return attn_weights, hidden

# ----- Critic -----
class Critic(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, encoder_outputs):
        pooled = encoder_outputs.mean(dim=1)
        return self.fc(pooled).squeeze(-1)



#### **Training**

In [67]:
   
    
reward_history = []
# ----- Training -----
embedding_dim = hidden_dim = 256
seq_len = 50
batch_size = 1
lr = 5e-4
epochs = 20
n_batches = 1000

encoder = TaskEncoder(embedding_dim)
decoder = Decoder(embedding_dim, hidden_dim)
critic = Critic(hidden_dim)

actor_optim = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=lr)
critic_optim = optim.Adam(critic.parameters(), lr=lr)

for _ in tqdm(range(n_batches)):
    input_coords = torch.rand(batch_size, seq_len, 2)

    for epoch in range(epochs):
        total_reward = 0
        _, encoder_outputs, encoder_hidden = encoder(input_coords)

        mask = torch.ones(batch_size, seq_len)
        decoder_input = torch.randn(batch_size, 1, embedding_dim)
        hidden = encoder_hidden
        log_probs = []
        tour = []

        for _ in range(seq_len):
            attn_weights, hidden = decoder(decoder_input, hidden, encoder_outputs, mask.clone())
            dist = torch.distributions.Categorical(attn_weights)
            selected = dist.sample()
            # selected = torch.argmax(attn_weights, dim=1)
            log_prob = dist.log_prob(selected)
            log_probs.append(log_prob.squeeze())
            idx = selected.item()
            tour.append(idx)
            mask = mask.clone()
            mask.scatter_(1, torch.tensor([[idx]]), 0)  # safe scatter op
            decoder_input = encoder_outputs[0, idx].detach().clone().unsqueeze(0).unsqueeze(1)

        coords = input_coords[0][tour]
        path = torch.cat([coords, coords[0].unsqueeze(0)], dim=0)
        reward = -torch.norm(path[1:] - path[:-1], dim=1).sum()
        total_reward += reward.item()

        value = critic(encoder_outputs)
        advantage = reward.detach() - value
        actor_loss = -torch.stack(log_probs).sum() * advantage.detach()
        critic_loss = (value - reward.detach()).pow(2).mean()

        critic_optim.zero_grad()
        critic_loss.backward(retain_graph=True)
        critic_optim.step()
        
        actor_optim.zero_grad()
        actor_loss.backward()
        actor_optim.step()

        
    # print(f"Epoch {epoch+1}/{epochs} - Actor Loss: {actor_loss.item():.4f}, Critic Loss: {critic_loss.item():.4f}")

# Save the trained models
torch.save({
    'encoder_state_dict': encoder.state_dict(),
    'decoder_state_dict': decoder.state_dict(),
    'critic_state_dict': critic.state_dict(),
    'actor_optimizer_state_dict': actor_optim.state_dict(),
    'critic_optimizer_state_dict': critic_optim.state_dict()
}, 'tsp_actor_critic_02.pth')

# plt.plot(reward_history)
# plt.title("Reward over Epochs (fixed input)")
# plt.show()


  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [57:24<00:00,  3.44s/it]


#### **Evaluation**

In [134]:
# Load the trained models

def test_model(model_path, embedding_dim=128):
    embedding_dim = hidden_dim = embedding_dim
    seq_len = 40
    checkpoint = torch.load(model_path)
    encoder = TaskEncoder(embedding_dim)
    decoder = Decoder(embedding_dim, hidden_dim)
    # critic = Critic(hidden_dim)
    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])
    # critic.load_state_dict(checkpoint['critic_state_dict'])
    encoder.eval()
    decoder.eval()
    # critic.eval()

    # Inference on a new random TSP instance
    with torch.no_grad():
        test_input = torch.rand(1, seq_len, 2)
        # print(test_input)
        _, test_encoder_outputs, test_encoder_hidden = encoder(test_input)
        test_mask = torch.ones(1, seq_len)
        test_decoder_input = torch.randn(1, 1, embedding_dim)
        test_hidden = test_encoder_hidden
        test_tour = []
        for _ in range(seq_len):
            test_attn_weights, test_hidden = decoder(test_decoder_input, test_hidden, test_encoder_outputs, test_mask.clone())
            test_selected = torch.argmax(test_attn_weights, dim=1).item()
            test_tour.append(test_selected)
            test_mask[0, test_selected] = 0
            test_decoder_input = test_encoder_outputs[0, test_selected].unsqueeze(0).unsqueeze(1)
        test_tour_coords = test_input[0][test_tour]
        test_path = torch.cat([test_tour_coords, test_tour_coords[0].unsqueeze(0)], dim=0)
        test_tour_length = torch.norm(test_path[1:] - test_path[:-1], dim=1).sum().item()

    # print("Tour:", test_tour)
    # print("Tour length:", test_tour_length)
    # plot_path(test_path)
    return test_tour_length

M = 0
for _ in range(100):
    l1 = test_model('tsp_actor_critic.pth')
    # test_model('tsp_actor_critic_01.pth')
    l2 = test_model('tsp_actor_critic_02.pth',256)
    
    if l1 > l2:
        M += 1
print("Model 1 better than Model 2:", M)

  checkpoint = torch.load(model_path)


Model 1 better than Model 2: 89


In [43]:
def plot_path(test_path):
    plt.figure(figsize=(8, 8))
    plt.plot(test_path[:, 0], test_path[:, 1], marker='o', linestyle='-', color='b', label='Path')

    # Annotate each point with its visiting order
    for i, (x, y) in enumerate(test_path[:-1]):  # skip the last point (duplicate of start)
        plt.text(x, y + 0.02, str(i), fontsize=10, color='red', ha='center', va='center')

    plt.scatter(test_path[0, 0], test_path[0, 1], color='green', s=100, label='Start')
    plt.scatter(test_path[-2, 0], test_path[-2, 1], color='orange', s=100, label='End')
    plt.title("TSP Path with Visiting Order using DRL")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.legend()
    plt.grid(True)
    plt.show()