In [None]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
import time

In [13]:
# Create the FrozenLake environment 
random.seed(21)
np.random.seed(21)
torch.manual_seed(21)
env = gym.make("FrozenLake-v1",render_mode='human',desc=generate_random_map(size=5), is_slippery = False)  # Disable randomness for simplicity
state_space = env.observation_space.n
action_space = env.action_space.n
env.action_space.seed(21)
env.observation_space.seed(21)  # Seed the observation space
env.reset(seed=21)


# Parameters
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.1
epsilon_decay = 0.999
learning_rate = 0.1
batch_size = 64
buffer_size = 10000
target_update_freq = 10
num_episodes = 500

In [None]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)  #Sample a batch of transitions uniformly
        states, actions, rewards, next_states, dones = zip(*batch)
        return np.array(states), np.array(actions), np.array(rewards), np.array(next_states), np.array(dones)

    def __len__(self):
        return len(self.buffer)


In [15]:
# Replay buffer
replay_buffer = ReplayBuffer(buffer_size)

In [16]:
# Convert state to one-hot encoding
def one_hot_state(state, num_states):
    one_hot = np.zeros(num_states)
    one_hot[state] = 1
    return one_hot

In [None]:
def custom_reward(state, action, reward, done, goal_state, visited_states):
    if int(state.argmax()) == int(goal_state.argmax()):
        return reward+100  # High reward for reaching the goal
    elif done:
        return -20  # Heavy penalty for falling
    if tuple(state) in visited_states:
        return reward-5  # Penalty for revisiting a state
    else:
        return 1

In [18]:
# Neural Network for DQN
class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, output_dim))
    
    def forward(self, x):
        return self.fc(x)

In [None]:
policy_net = DQN(state_space, action_space)  #neural network that approximates the Q-value
target_net = DQN(state_space, action_space)  # provides fixed Q-value targets for several steps
target_net.load_state_dict(policy_net.state_dict())
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)


In [None]:
#Training Loop
total_rewards = []
success_count = 0
steps_to_goal = []

for episode in range(num_episodes):
    state, _ = env.reset()
    state = one_hot_state(state, state_space)
    done = False
    total_reward = 0
    step_count = 0
    goal_state = state_space - 1
    goal_state = one_hot_state(goal_state, state_space)
    visited_states = set()

    while not done:
        if random.random() < epsilon:
            action = random.choice(range(action_space))  # Exploration
        else:
            with torch.no_grad():
                action = torch.argmax(policy_net(torch.tensor(state, dtype=torch.float32))).item()  # Exploitation

        next_state, reward, done, _, _ = env.step(action)
        next_state = one_hot_state(next_state, state_space)

        reward = custom_reward(next_state, action, reward, done, goal_state, visited_states)
        total_reward += reward
        step_count += 1

        visited_states.add(tuple(next_state))  #updating visiting states for checking looping

        replay_buffer.add((state, action, reward, next_state, done))
        state = next_state

        # Training Q-Network
        if len(replay_buffer) > batch_size:
            states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

            states = torch.tensor(states, dtype=torch.float32)
            actions = torch.tensor(actions, dtype=torch.long)
            rewards = torch.tensor(rewards, dtype=torch.float32)
            next_states = torch.tensor(next_states, dtype=torch.float32)
            dones = torch.tensor(dones, dtype=torch.float32)

            # Computing target Q-values
            with torch.no_grad():
                target_q_values = rewards + gamma * (1 - dones) * torch.max(target_net(next_states), dim=1)[0]

            current_q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

            loss = nn.MSELoss()(current_q_values, target_q_values)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    epsilon = max(epsilon * epsilon_decay, epsilon_min)

    # Updating target network every 10 episodes
    if episode % target_update_freq == 0:
        target_net.load_state_dict(policy_net.state_dict())

    total_rewards.append(total_reward)
    if done and int(state.argmax()) == int(goal_state.argmax()):
        success_count += 1
        steps_to_goal.append(step_count)

    print(f"Episode {episode + 1}, Total Reward: {total_reward}, Steps: {step_count}, Goal Reached: {'Yes' if done and int(state.argmax()) == int(goal_state.argmax()) else 'No'}")

avg_reward = np.mean(total_rewards)
success_rate = (success_count / num_episodes) * 100
avg_steps_to_goal = np.mean(steps_to_goal) if steps_to_goal else float('inf')

print("\n--- Training Metrics ---")
print(f"Average Reward: {avg_reward:.2f}")
print(f"Success Rate: {success_rate:.2f}%")
print(f"Average Steps to Goal (for successful episodes): {avg_steps_to_goal:.2f}")
print("------------------------")

# Saving the trained model
torch.save(policy_net.state_dict(), "model/DQN_policy_net.pth")
print("Training complete. Model saved.")


Episode 1, Total Reward: -76.0, Steps: 17, Goal Reached: No
Episode 2, Total Reward: -71.0, Steps: 16, Goal Reached: No
Episode 3, Total Reward: -18, Steps: 3, Goal Reached: No
Episode 4, Total Reward: -19, Steps: 2, Goal Reached: No
Episode 5, Total Reward: -19, Steps: 2, Goal Reached: No
Episode 6, Total Reward: -46.0, Steps: 11, Goal Reached: No
Episode 7, Total Reward: -17, Steps: 4, Goal Reached: No
Episode 8, Total Reward: -33.0, Steps: 6, Goal Reached: No
Episode 9, Total Reward: -29.0, Steps: 4, Goal Reached: No
Episode 10, Total Reward: -19, Steps: 2, Goal Reached: No
Episode 11, Total Reward: -40.0, Steps: 11, Goal Reached: No
Episode 12, Total Reward: -75.0, Steps: 18, Goal Reached: No
Episode 13, Total Reward: -19, Steps: 2, Goal Reached: No
Episode 14, Total Reward: -30.0, Steps: 15, Goal Reached: No
Episode 15, Total Reward: -19, Steps: 2, Goal Reached: No
Episode 16, Total Reward: -48.0, Steps: 9, Goal Reached: No
Episode 17, Total Reward: -19, Steps: 2, Goal Reached: No

In [None]:
# Testing the trained policy
def test_dqn(policy_net, env, num_test_episodes=100):
    total_rewards = []
    success_count = 0
    goal_state = state_space - 1  # Define the goal state
    goal_state = one_hot_state(goal_state,state_space)

    # Load trained policy net
    policy_net.load_state_dict(torch.load("model/DQN_policy_net.pth"))
    policy_net.eval()

    for episode in range(num_test_episodes):
        state, _ = env.reset()
        state = one_hot_state(state, state_space)
        done = False
        episode_reward = 0
        visited_states = set()  # Track visited states
        trajectory = []  

        while not done:
                trajectory.append(state.argmax())  # Log the agent's current state
                
                if random.random() < 0.1:  # 10% chance to explore
                    action = random.choice(range(action_space))
                else:
                    with torch.no_grad():
                        action = torch.argmax(policy_net(torch.tensor(state, dtype=torch.float32))).item()
            
            # Take a step in the environment
                next_state, reward, done, _, _ = env.step(action)
                next_state = one_hot_state(next_state, state_space)
                
                reward = custom_reward(next_state, action, reward, done, goal_state,visited_states)
                episode_reward += reward
                state = next_state

                total_rewards.append(episode_reward)

        # Checking if the goal state was reached
        if done and int(state.argmax()) == int(goal_state.argmax()):
            success_count += 1
            

        print(f"Test Episode {episode + 1}: Reward = {total_reward:.2f}, Steps = {step_count}, Goal Reached: {'Yes' if done and int(state.argmax()) == int(goal_state.argmax()) else 'No'}")

    avg_reward = np.mean(total_rewards)
    success_rate = (success_count / num_test_episodes) * 100
    avg_steps_test = np.mean(steps_to_goal) if steps_to_goal else float('inf')

    print("\n--- Testing Metrics ---")
    print(f"Average Reward: {avg_reward:.2f}")
    print(f"Success Rate: {success_rate:.2f}%")
    print(f"Average Steps to Goal (for successful episodes): {avg_steps_test:.2f}")
    print("------------------------")



In [None]:
# Run the test
print("Testing the trained policy...")
test_dqn(policy_net, env, num_test_episodes=200)

Testing the trained policy...


  policy_net.load_state_dict(torch.load("DQN_policy_net.pth"))


Test Episode 1: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 2: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 3: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 4: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 5: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 6: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 7: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 8: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 9: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 10: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 11: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 12: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 13: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 14: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 15: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 16: Reward = -18.00, Steps = 3, Goal Reached: No
Test Episode 17: 