In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import matplotlib.pyplot as plt

In [None]:
# Create the FrozenLake environment
random.seed(21)
np.random.seed(21)
torch.manual_seed(21)
env = gym.make("FrozenLake-v1",desc=generate_random_map(size=5), is_slippery=False)
state_space = env.observation_space.n
action_space = env.action_space.n
env.action_space.seed(21)
env.observation_space.seed(21)  # Seed the observation space
env.reset(seed=21)

# Parameters
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.999
batch_size = 64
buffer_size = 10000
target_update_freq = 10
num_episodes = 500
history_size = 15 # Number of past states to consider
backward_steps = 5  # Number of steps for backward planning

In [None]:
# Replay buffer
class ReplayBuffer:
    def __init__(self, capacity, history_size):
        self.buffer = deque(maxlen=capacity)
        self.history_size = history_size

    def add(self, transition):
        """transition = (state, action, reward, next_state, done)."""
        transition = (transition[0], transition[1], transition[2], transition[3], transition[4])
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)  ##Sample a batch of transitions uniformly
        actions, rewards, next_states, dones = zip(*[transition[1:] for transition in batch])

        state_histories, reward_histories = [], []

        for idx in range(batch_size):
            state_history, reward_history = self._get_history(idx)
            state_histories.append(state_history)
            reward_histories.append(reward_history)

        return (np.array(state_histories),  # Histories of states
            np.array(actions),          # Actions
            np.array(rewards),          # Current rewards
            np.array(next_states),      # Next states
            np.array(dones),            # Done flags
            np.array(reward_histories)) # Histories of rewards

    def _get_history(self, index):
        state_history = []
        reward_history = []

        for i in range(self.history_size):
            if index - i >= 0:
                transition = self.buffer[index - i]
                state = transition[0]
                reward = transition[2]

                # Ensuring the state and reward have consistent shapes
                state = np.array(state)
                if state.ndim == 1:  
                    state = state[np.newaxis, :]
                state_history.append(state)

                reward = np.array(reward)
                reward_history.append(reward)
            else:
                # Padding with zeros if insufficient history
                padded_state = np.zeros_like(np.array(self.buffer[0][0]))
                if padded_state.ndim == 1:  
                    padded_state = padded_state[np.newaxis, :]
                
                state_history.append(padded_state)
                reward_history.append(0)
        state_history = np.vstack(state_history)  #  (history_size, state_space)
        reward_history = np.array(reward_history).reshape(-1, 1)  # (history_size, 1)

        return state_history, reward_history


    def __len__(self):
        return len(self.buffer)

In [None]:
replay_buffer = ReplayBuffer(buffer_size, history_size)  #Calling the replaybuffer

In [5]:
# Convert state to one-hot encoding
def one_hot_state(state, num_states):
    one_hot = np.zeros(num_states)
    one_hot[state] = 1
    return one_hot

In [None]:
def custom_reward(state, action, reward, done, goal_state, visited_states):
    if int(state.argmax()) == int(goal_state.argmax()):
        return reward + 100  # High reward for reaching the goal
    elif done:
        return -20  # Heavy penalty for falling
    if tuple(state) in visited_states:
        return reward-5  # Penalty for revisiting a state (looping)
    else:
        return 1

In [None]:
# Attention mechanism
class AttentionMechanism(nn.Module):
    def __init__(self, state_dim, hidden_dim):
        super(AttentionMechanism, self).__init__()
        self.key_layer = nn.Linear(state_dim, hidden_dim)
        self.query_layer = nn.Linear(state_dim, hidden_dim)
        self.value_layer = nn.Linear(1 , hidden_dim)

    def forward(self, query, keys, values):
        query = self.query_layer(query).unsqueeze(1)  # (batch_size, 1, hidden_dim)
        keys = self.key_layer(keys)                   # (batch_size, seq_len, hidden_dim)
        values = self.value_layer(values)             # (batch_size, seq_len, hidden_dim)  (64,10,512)

        # Computing attention scores
        scores = torch.bmm(query, keys.transpose(1, 2)/ (keys.size(-1) ** 0.5))  # Dot product-  relevance score # (batch_size, 1, seq_len)
        

        include_rewards = values[..., 0]   # Remove last dim, shape: (batch_size, seq_len)
        
        include_rewards = include_rewards/ (include_rewards.norm(dim=-1, keepdim=True) + 1e-8)  # Normalize
        include_rewards= include_rewards.unsqueeze(1)
        scores = scores + include_rewards  # Shape: (batch_size, 1, seq_len)

        attention_weights = torch.softmax(scores, dim=-1)  # (batch_size, 1, seq_len)

        context = torch.bmm(attention_weights, values).squeeze(1)  # Weighted sum  # (batch_size, hidden_dim)
        
        return context

In [None]:
# DQN with Attention
class AttentionDQN(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super(AttentionDQN, self).__init__()
        self.attention = AttentionMechanism(state_dim, hidden_dim)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim + state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, action_dim))

    def forward(self, current_state, history, rewards):
        """
        current_state: Current state (batch_size, state_dim)
        history: Past states (batch_size, seq_len, state_dim)
        rewards: Past rewards (batch_size, seq_len, 1)
        """
        context = self.attention(current_state, history, rewards)  # Applying attention
        combined = torch.cat([current_state, context], dim=-1)  # Combine context with current state
        return self.fc(combined)

In [None]:
# Backward Model
class BackwardModel(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super(BackwardModel, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.state_head = nn.Linear(hidden_dim, state_dim)  # Predicts previous state
        self.action_head = nn.Linear(hidden_dim, action_dim)  # Predicts action

    def forward(self, current_state):
        x = torch.relu(self.fc1(current_state))
        x = torch.relu(self.fc2(x))
        previous_state = torch.tanh(self.state_head(x)) 
        action = torch.softmax(self.action_head(x), dim=-1)
        return previous_state, action


In [None]:
# Generate Backward Transitions
def generate_backward_transitions(goal_state, backward_model, steps):
    transitions = []
    current_state = goal_state
    for _ in range(steps):
        with torch.no_grad():
            previous_state, action_probs = backward_model(current_state.unsqueeze(0))
            action = torch.argmax(action_probs).item()  # Selecting the most probable action
            previous_state = previous_state.squeeze(0) 

        
        reward = custom_reward(previous_state, action, 0, done=False, goal_state=goal_state, visited_states=set())
        transitions.append((previous_state.numpy(), action, reward, current_state.numpy(), False))
        current_state = previous_state
    return transitions


In [None]:
policy_net = AttentionDQN(state_space, action_space)
target_net = AttentionDQN(state_space, action_space)
target_net.load_state_dict(policy_net.state_dict())
optimizer_forward = optim.Adam(policy_net.parameters(), lr=0.1)
backward_model = BackwardModel(state_space, action_space)
optimizer_backward = optim.Adam(backward_model.parameters(), lr=0.05)

In [None]:
#Training Loop
total_rewards = []
success_count = 0
steps_to_goal = []

for episode in range(num_episodes):
    state, _ = env.reset()
    state = one_hot_state(state, state_space)
    goal_state = one_hot_state(state_space - 1, state_space) # Assumption - the last state corresponds to the goal
    done = False
    total_reward = 0
    step_count = 0
    visited_states = set()
    history = deque(maxlen=history_size)
    rewards = deque(maxlen=history_size)

    while not done:
        while len(history) < history_size:
            history.append(np.zeros_like(state)) # Padding the history with zeros before enough steps are taken.
            rewards.append(0)

        # Converting histories to tensors
        history_tensor = torch.tensor(np.array(history), dtype=torch.float32).unsqueeze(0)
        rewards_tensor = torch.tensor(np.array(rewards), dtype=torch.float32).unsqueeze(-1).unsqueeze(0)
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = random.choice(range(action_space))  # Exploration
        else:
            with torch.no_grad():
                action = torch.argmax(policy_net(state_tensor, history_tensor, rewards_tensor)).item()  # Exploitation

        next_state, reward, done, _, _ = env.step(action)
        next_state = one_hot_state(next_state, state_space)

        reward = custom_reward(next_state, action, reward, done, goal_state, visited_states=set())
        total_reward += reward
        step_count += 1

        visited_states.add(tuple(next_state))  #updating visiting states for checking looping

        # Update histories
        history.append(state)
        rewards.append(reward)
        replay_buffer.add((state, action, reward, next_state, done))  # Storing transition in replay buffer
        state = next_state

        # Training the Q-Network
        if len(replay_buffer) > batch_size:
            batch = replay_buffer.sample(batch_size)
            state_histories, actions, rewards_batch, next_states, dones, reward_histories = batch

            state_histories = torch.tensor(state_histories, dtype=torch.float32)
            reward_histories = torch.tensor(reward_histories, dtype=torch.float32)#.unsqueeze(-1)
            actions = torch.tensor(actions, dtype=torch.long)
            rewards_batch = torch.tensor(rewards_batch, dtype=torch.float32)
            next_states = torch.tensor(next_states, dtype=torch.float32)
            dones = torch.tensor(dones, dtype=torch.float32)

            # Computing target Q-values
            with torch.no_grad():
                next_state_histories = torch.cat((state_histories[:, 1:, :], next_states.unsqueeze(1)), dim=1)
                next_reward_histories = torch.cat((reward_histories[:, 1:, :], rewards_batch.unsqueeze(1).unsqueeze(-1)), dim=1)
                target_q_values = rewards_batch + gamma * (1 - dones) * torch.max(
                    target_net(next_states, next_state_histories, next_reward_histories), dim=1)[0]

            current_q_values = policy_net(state_histories[:, -1, :], state_histories, reward_histories).gather(1, actions.unsqueeze(1)).squeeze(1)

            loss = nn.MSELoss()(current_q_values, target_q_values)
            optimizer_forward.zero_grad()
            loss.backward()

            optimizer_forward.step()

    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    if episode % target_update_freq == 0:
        target_net.load_state_dict(policy_net.state_dict())

    # Adding backward transitions every 10 episodes
    if episode % 10 == 0:
        backward_transitions = generate_backward_transitions(torch.tensor(goal_state, dtype=torch.float32), backward_model, backward_steps)
        for transition in backward_transitions:
            replay_buffer.add(transition)
        
        # Training the backward model
        for transition in backward_transitions:
            state, action, reward, next_state, done = transition

            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            next_state_tensor = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
            action_tensor = torch.tensor([action], dtype=torch.long)

            # Predicted backward state-action pairs
            predicted_state, predicted_action = backward_model(next_state_tensor)

            state_loss = nn.MSELoss()(predicted_state, state_tensor)
            action_loss = nn.CrossEntropyLoss()(predicted_action, action_tensor)
            backward_loss = state_loss + action_loss

            optimizer_backward.zero_grad()
            backward_loss.backward()
            optimizer_backward.step()

    total_rewards.append(total_reward)
    if done and int(state.argmax()) == int(goal_state.argmax()):
        success_count += 1
        steps_to_goal.append(step_count)

    print(f"Episode {episode + 1}, Total Reward: {total_reward:.2f}, Steps: {step_count}, Goal Reached: {'Yes' if done and int(state.argmax()) == int(goal_state.argmax()) else 'No'}")

# Calculating metrics
avg_reward = np.mean(total_rewards)
success_rate = (success_count / num_episodes) * 100
avg_steps = np.mean(steps_to_goal) if steps_to_goal else float('inf')

print("\n--- Training Metrics ---")
print(f"Average Reward: {avg_reward:.2f}")
print(f"Success Rate: {success_rate:.2f}%")
print(f"Average Steps to Goal (for successful episodes): {avg_steps:.2f}")
print("------------------------")

# Saving the trained model
torch.save(policy_net.state_dict(), "model/Backward_DQN_policy_net.pth")
print("Training complete. Model saved.")

# Saving the backward model
torch.save(backward_model.state_dict(), "model/Backward_DQN_backward_model.pth")
print("Backward model saved successfully.")



Episode 1, Total Reward: 15.00, Steps: 36, Goal Reached: No
Episode 2, Total Reward: -18.00, Steps: 3, Goal Reached: No
Episode 3, Total Reward: -6.00, Steps: 15, Goal Reached: No
Episode 4, Total Reward: 4.00, Steps: 25, Goal Reached: No
Episode 5, Total Reward: -10.00, Steps: 11, Goal Reached: No
Episode 6, Total Reward: -11.00, Steps: 10, Goal Reached: No
Episode 7, Total Reward: -10.00, Steps: 11, Goal Reached: No
Episode 8, Total Reward: 10.00, Steps: 31, Goal Reached: No
Episode 9, Total Reward: -17.00, Steps: 4, Goal Reached: No
Episode 10, Total Reward: 122.00, Steps: 22, Goal Reached: Yes
Episode 11, Total Reward: -11.00, Steps: 10, Goal Reached: No
Episode 12, Total Reward: 15.00, Steps: 36, Goal Reached: No
Episode 13, Total Reward: -7.00, Steps: 14, Goal Reached: No
Episode 14, Total Reward: -12.00, Steps: 9, Goal Reached: No
Episode 15, Total Reward: -11.00, Steps: 10, Goal Reached: No
Episode 16, Total Reward: -11.00, Steps: 10, Goal Reached: No
Episode 17, Total Reward: 

In [None]:
# Testing loop 
def test_model_with_backward(policy_net, backward_model, env, num_test_episodes=100, history_size=10, use_backward=False):
    total_rewards = []
    success_count = 0
    steps_to_goal = []

    policy_net.load_state_dict(torch.load("model/Backward_DQN_policy_net.pth"))
    policy_net.eval()  
    backward_model.load_state_dict(torch.load("model/DQN_trained_backward_model.pth"))
    backward_model.eval()  

    for episode in range(num_test_episodes):
        state, _ = env.reset()
        state = one_hot_state(state, env.observation_space.n)
        goal_state = one_hot_state(env.observation_space.n - 1, env.observation_space.n)
        done = False
        total_reward = 0
        step_count = 0
        visited_states = set()
        history = deque(maxlen=history_size)
        rewards = deque(maxlen=history_size)

        while not done:
            while len(history) < history_size:
                history.append(np.zeros_like(state))
                rewards.append(0)

            history_tensor = torch.tensor(np.array(history), dtype=torch.float32).unsqueeze(0)
            rewards_tensor = torch.tensor(np.array(rewards), dtype=torch.float32).unsqueeze(-1).unsqueeze(0)
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

            if random.random() < 0.1:  # 10% chance to explore
                     action = random.choice(range(action_space))
            else: 
                with torch.no_grad():
                    action = torch.argmax(policy_net(state_tensor, history_tensor, rewards_tensor)).item()

            next_state, reward, done, _, _ = env.step(action)
            next_state = one_hot_state(next_state, env.observation_space.n)

            reward = custom_reward(next_state, action, reward, done, goal_state, visited_states)
            total_reward += reward
            step_count += 1

            visited_states.add(tuple(next_state))
            history.append(state)
            rewards.append(reward)

            state = next_state

            # If backward planning is enabled
            if use_backward and step_count % 10 == 0:
                with torch.no_grad():
                    backward_state_tensor = torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
                    predicted_state, predicted_action = backward_model(backward_state_tensor)


        # Updating metrics
        total_rewards.append(total_reward)
        if int(state.argmax()) == int(goal_state.argmax()):
            success_count += 1
            steps_to_goal.append(step_count)

        print(f"Test Episode {episode + 1}: Reward = {total_reward:.2f}, Steps = {step_count}, Goal Reached: {'Yes' if done and int(state.argmax()) == int(goal_state.argmax()) else 'No'}")

    # Calculate final metrics
    avg_reward = np.mean(total_rewards)
    success_rate = (success_count / num_test_episodes) * 100
    avg_steps = np.mean(steps_to_goal) if steps_to_goal else float('inf')

    print("\n--- Testing Metrics ---")
    print(f"Average Reward: {avg_reward:.2f}")
    print(f"Success Rate: {success_rate:.2f}%")
    print(f"Average Steps to Goal (for successful episodes): {avg_steps:.2f}")
    print("------------------------")

# Return metrics as a dictionary
    metrics = {"Average Reward": avg_reward,
        "Success Rate (%)": success_rate,
        "Average Steps to Goal": avg_steps}
    return metrics




In [None]:
# Run the testing loop
DQN_Backward_planning_metrics = test_model_with_backward(policy_net, backward_model, env, num_test_episodes=200, use_backward=True)


  policy_net.load_state_dict(torch.load("Backward_DQN_policy_net.pth"))
  backward_model.load_state_dict(torch.load("DQN_trained_backward_model.pth"))


Test Episode 1: Reward = -16.00, Steps = 5, Goal Reached: No
Test Episode 2: Reward = -2944.00, Steps = 622, Goal Reached: Yes
Test Episode 3: Reward = -56.00, Steps = 19, Goal Reached: No
Test Episode 4: Reward = -17.00, Steps = 33, Goal Reached: Yes
Test Episode 5: Reward = -908.00, Steps = 216, Goal Reached: Yes
Test Episode 6: Reward = -738.00, Steps = 153, Goal Reached: No
Test Episode 7: Reward = -610.00, Steps = 154, Goal Reached: Yes
Test Episode 8: Reward = -17.00, Steps = 4, Goal Reached: No
Test Episode 9: Reward = -1203.00, Steps = 275, Goal Reached: Yes
Test Episode 10: Reward = -62.00, Steps = 42, Goal Reached: Yes
Test Episode 11: Reward = -436.00, Steps = 118, Goal Reached: Yes
Test Episode 12: Reward = -619.00, Steps = 157, Goal Reached: Yes
Test Episode 13: Reward = -105.00, Steps = 24, Goal Reached: No
Test Episode 14: Reward = -879.00, Steps = 186, Goal Reached: No
Test Episode 15: Reward = -535.00, Steps = 139, Goal Reached: Yes
Test Episode 16: Reward = -913.00, S

Implementation of Metrics

1. Success Rate

In [15]:
def calculate_success_rate(model, env_test, num_episodes=100):
    success_count = 0
    for episode in range(num_episodes):
        state, _ = env.reset()
        state = one_hot_state(state, state_space)
        history = deque([np.zeros(state_space) for _ in range(history_size)], maxlen=history_size)
        rewards = deque([0 for _ in range(history_size)], maxlen=history_size)
        done = False

        while not done:
            # Prepare tensors for history and rewards
            history_tensor = torch.tensor(np.array(history), dtype=torch.float32).unsqueeze(0)
            rewards_tensor = torch.tensor(np.array(rewards), dtype=torch.float32).unsqueeze(-1).unsqueeze(0)
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

            with torch.no_grad():
                action = torch.argmax(model(state_tensor, history_tensor, rewards_tensor)).item()
            next_state, reward, done, _, _ = env.step(action)
            state = one_hot_state(next_state, state_space)

            # Update history and rewards
            history.append(state)
            rewards.append(reward)

            if done and reward > 0:  # Reached the goal
                success_count += 1

    success_rate = (success_count / num_episodes) * 100
    return success_rate



2.Average Reward

In [16]:
def calculate_average_reward(model, env_test, num_episodes=100):
    total_reward = 0
    for episode in range(num_episodes):
        state, _ = env.reset()
        state = one_hot_state(state, state_space)
        history = deque([np.zeros(state_space) for _ in range(history_size)], maxlen=history_size)
        rewards = deque([0 for _ in range(history_size)], maxlen=history_size)
        done = False

        while not done:
            # Prepare tensors for history and rewards
            history_tensor = torch.tensor(np.array(history), dtype=torch.float32).unsqueeze(0)
            rewards_tensor = torch.tensor(np.array(rewards), dtype=torch.float32).unsqueeze(-1).unsqueeze(0)
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

            with torch.no_grad():
                action = torch.argmax(model(state_tensor, history_tensor, rewards_tensor)).item()
            next_state, reward, done, _, _ = env.step(action)
            state = one_hot_state(next_state, state_space)

            # Update history and rewards
            history.append(state)
            rewards.append(reward)

            total_reward += reward

    average_reward = total_reward / num_episodes
    return average_reward



3. Steps taken to goal

In [17]:
def calculate_steps_to_goal(model, env_test, num_episodes=100):
    total_steps = 0
    success_count = 0

    for episode in range(num_episodes):
        state, _ = env.reset()
        state = one_hot_state(state, state_space)
        history = deque([np.zeros(state_space) for _ in range(history_size)], maxlen=history_size)
        rewards = deque([0 for _ in range(history_size)], maxlen=history_size)
        done = False
        steps = 0

        while not done:
            # Prepare tensors for history and rewards
            history_tensor = torch.tensor(np.array(history), dtype=torch.float32).unsqueeze(0)
            rewards_tensor = torch.tensor(np.array(rewards), dtype=torch.float32).unsqueeze(-1).unsqueeze(0)
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

            with torch.no_grad():
                action = torch.argmax(model(state_tensor, history_tensor, rewards_tensor)).item()
            next_state, reward, done, _, _ = env.step(action)
            state = one_hot_state(next_state, state_space)

            # Update history and rewards
            history.append(state)
            rewards.append(reward)
            steps += 1

            if done and reward > 0:  # Reached the goal
                total_steps += steps
                success_count += 1

    if success_count == 0:
        return None  # No successful episodes
    return total_steps / success_count



In [18]:
torch.cuda.empty_cache()  # Clear unused GPU memory before plotting

In [19]:
num_eval_episodes = 100
# Evaluate the Attention-DQN model
success_rate = calculate_success_rate(policy_net, env_test, num_eval_episodes)
average_reward = calculate_average_reward(policy_net, env_test, num_eval_episodes)
steps_to_goal = calculate_steps_to_goal(policy_net, env_test, num_eval_episodes)


NameError: name 'env_test' is not defined

In [None]:
# Visualization of metrics
metrics = ['Success Rate (%)', 'Average Reward', 'Average Steps to Goal']
values = [success_rate, average_reward, steps_to_goal if steps_to_goal is not None else 0]

In [None]:
torch.cuda.empty_cache()  # Clear unused GPU memory before plotting

In [None]:
plt.figure(figsize=(10, 6))
plt.bar(metrics, values, color=['blue', 'green', 'orange'])
plt.title('Performance Metrics of the Attention-Enhanced Model')
plt.ylabel('Values')
plt.ylim(0, max(values) + 10)  # Adjust Y-axis limits based on metrics
for i, value in enumerate(values):
    plt.text(i, value + 1, f'{value:.2f}', ha='center', fontsize=12)
plt.grid(axis='y')
plt.show()

# Print metrics
print(f"Success Rate: {success_rate:.2f}%")
print(f"Average Reward: {average_reward:.2f}")
print(f"Average Steps to Goal: {steps_to_goal if steps_to_goal is not None else 'No successful episodes'}")


In [None]:
# Assuming both models are trained: policy_net (Original DQN) and attention_policy_net (Attention DQN)

# Evaluate both models
# num_episodes = 100  # Number of test episodes

# # Original DQN
# success_rate_dqn = calculate_success_rate(policy_net, env_test, num_episodes)
# average_reward_dqn = calculate_average_reward(policy_net, env_test, num_episodes)
# steps_to_goal_dqn = calculate_steps_to_goal(policy_net, env_test, num_episodes)
# num_episodes = 100
# # # Attention DQN
# success_rate_attention_dqn = calculate_success_rate(policy_net, env_test, num_episodes)
# average_reward_attention_dqn = calculate_average_reward(policy_net, env_test, num_episodes)
# steps_to_goal_attention_dqn = calculate_steps_to_goal(policy_net, env_test, num_episodes)


Visualising the results

In [None]:
# # # Prepare for plotting
# metrics = ['Success Rate (%)', 'Average Reward', 'Steps to Goal']
# # dqn_values = [success_rate_dqn, average_reward_dqn, steps_to_goal_dqn]
# attention_dqn_values = [success_rate_attention_dqn, average_reward_attention_dqn, steps_to_goal_attention_dqn]

# # # Plotting each metric
# # fig, axs = plt.subplots(1, 3, figsize=(18, 5))

# # # Success Rate Comparison
# axs[0].bar(['Original DQN', 'Attention DQN'], [success_rate_dqn, success_rate_attention_dqn], color=['blue', 'green'])
# axs[0].set_title('Success Rate Comparison')
# axs[0].set_ylabel('Success Rate (%)')

# # Average Reward Comparison
# axs[1].bar(['Original DQN', 'Attention DQN'], [average_reward_dqn, average_reward_attention_dqn], color=['blue', 'green'])
# axs[1].set_title('Average Reward Comparison')
# axs[1].set_ylabel('Average Reward')

# # Steps to Goal Comparison
# axs[2].bar(['Original DQN', 'Attention DQN'], [steps_to_goal_dqn, steps_to_goal_attention_dqn], color=['blue', 'green'])
# axs[2].set_title('Steps to Goal Comparison')
# axs[2].set_ylabel('Steps to Goal')

# Show the plot
# plt.tight_layout()
# plt.show()
