In [3]:
import gymnasium as gym
import numpy as np
import random
from math import exp
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

# Create a 10x10 deterministic FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=False, desc=generate_random_map(size=10, seed=259))

state_size = env.observation_space.n
action_size = env.action_space.n

desc = env.unwrapped.desc
hole_states = {i for i, row in enumerate(desc.flatten()) if row == b'H'}
goal_state = {i for i, row in enumerate(desc.flatten()) if row == b'G'}

# Initialize Q-table with terminal states set to 0
qtable = np.random.uniform(low=-0.5, high=0.5, size=(state_size, action_size))
qtable[list(hole_states), :] = 0.0  # No future rewards from holes
qtable[list(goal_state), :] = 0.0   # No future rewards from goal

# Training parameters
total_episodes = 20000
max_steps = 200
initial_learning_rate = 0.8
min_learning_rate = 0.1
gamma = 0.95
epsilon_initial = 1.0
min_epsilon = 0.01
decay_factor = (min_epsilon / epsilon_initial) ** (1 / total_episodes)  # Proper decay calculation
temperature_initial = 1.0
min_temperature = 0.01

epsilon = epsilon_initial
temperature = temperature_initial
rewards = []

# Precompute distances to goal for reward shaping
def compute_distances_to_goal(desc, goal_state):
    rows, cols = desc.shape
    goal_row, goal_col = np.unravel_index(list(goal_state)[0], (rows, cols))
    distances = np.zeros((rows, cols))
    for i in range(rows):
        for j in range(cols):
            distances[i, j] = abs(i - goal_row) + abs(j - goal_col)
    return distances.flatten()

distances = compute_distances_to_goal(desc, goal_state)

for episode in range(total_episodes):
    state, _ = env.reset()
    total_rewards = 0
    done = False

    # Anneal learning rate
    learning_rate = max(min_learning_rate, initial_learning_rate * (1 - episode / total_episodes))

    for step in range(max_steps):
        # Boltzmann exploration (softmax action selection)
        if random.uniform(0, 1) > epsilon:
            action_probs = np.exp(qtable[state] / temperature) / np.sum(np.exp(qtable[state] / temperature))
            action = np.random.choice(range(action_size), p=action_probs)
        else:
            action = env.action_space.sample()

        new_state, reward, done, truncated, _ = env.step(action)

        # Reward shaping based on distance to goal
        if new_state in hole_states:
            reward = -1.0
        elif new_state in goal_state:
            reward = 1.0
        else:
            # Encourage moving closer to the goal
            reward = 0.1 * (distances[state] - distances[new_state])

        # Q-learning update
        qtable[state, action] += learning_rate * (
            reward + gamma * np.max(qtable[new_state]) - qtable[state, action]
        )

        total_rewards += reward
        state = new_state

        if done or truncated:
            break

    # Decay exploration parameters
    epsilon = max(min_epsilon, epsilon * decay_factor)
    temperature = max(min_temperature, temperature * decay_factor)
    rewards.append(total_rewards)

# Evaluation phase
test_episodes = 100
successes = 0
for _ in range(test_episodes):
    state, _ = env.reset()
    done = False
    while not done:
        action = np.argmax(qtable[state])
        new_state, reward, done, truncated, _ = env.step(action)
        if done and reward == 1.0:
            successes += 1
    env.close()

print(f"Average Training Reward: {sum(rewards)/total_episodes:.3f}")
print(f"Success Rate ({test_episodes} tests): {successes/test_episodes*100:.1f}%")

Average Training Reward: 1.377
Success Rate (100 tests): 0.0%
