In [2]:
import numpy as np
import gymnasium as gym
import random

# Environment setup
# is_slippery=False selects the deterministic variant: the agent always moves
# in the direction of the chosen action.
env = gym.make('FrozenLake-v1', is_slippery=False)
total_episodes = 10000  # Number of training episodes
max_steps = 100  # Max steps per episode

# Reproducibility: seed every source of randomness used during training so a
# fresh kernel ("Restart & Run All") yields the same score and Q-table.
SEED = 42
random.seed(SEED)            # drives the epsilon-greedy draw (random.uniform)
env.action_space.seed(SEED)  # drives env.action_space.sample()
env.reset(seed=SEED)         # seeds the environment's internal RNG once

# Q-learning parameters
learning_rate = 0.8  # Step size (alpha) applied to each TD error
gamma = 0.95  # Discount factor
# Epsilon-greedy schedule: epsilon starts at max_epsilon and is decayed
# exponentially toward min_epsilon, once per episode, at rate decay_rate.
epsilon = 1.0  # Current exploration rate
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.001

# Q-table initialization: one row per discrete state, one column per action,
# all action values starting at zero.
state_size = env.observation_space.n
action_size = env.action_space.n
qtable = np.zeros((state_size, action_size))

# Total reward collected in each training episode (one entry per episode)
rewards = []

# Training loop: tabular Q-learning with an epsilon-greedy behaviour policy.
for episode in range(total_episodes):
    state, _ = env.reset()  # Gymnasium reset() returns (observation, info)
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # Exploration-exploitation trade-off: draw u ~ U(0, 1); exploit when
        # u exceeds the current epsilon, otherwise explore.
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state])  # Exploit best action
        else:
            action = env.action_space.sample()  # Explore random action

        # Take action and observe result. Gymnasium step() returns
        # (observation, reward, terminated, truncated, info); `done` holds the
        # `terminated` flag here.
        new_state, reward, done, truncated, _ = env.step(action)

        # TD target. A terminating transition has no successor value, so only
        # bootstrap from new_state when the episode has NOT terminated.
        # Truncation (time limit) is not a true terminal state, so it still
        # bootstraps. The original always bootstrapped and was only correct
        # because rows of terminal states are never written and stay at zero;
        # the explicit mask makes that independent of the initial Q-values.
        if done:
            td_target = reward
        else:
            td_target = reward + gamma * np.max(qtable[new_state])

        # Q-learning update rule: move Q(s, a) toward the TD target.
        qtable[state, action] = qtable[state, action] + learning_rate * (
            td_target - qtable[state, action]
        )

        total_rewards += reward
        state = new_state  # Move to new state

        if done or truncated:  # Stop if episode ends
            break

    # Decay epsilon (reduce exploration) exponentially in the episode index,
    # from max_epsilon toward min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

# Report the mean per-episode reward across the whole training run (this
# includes the early, mostly-random episodes), then the learned Q-table.
average_reward = sum(rewards) / total_episodes
print("Score over time:", average_reward)
# sep="\n" puts the header and the table on separate lines, as before.
print("Final Q-Table:", qtable, sep="\n")

Score over time: 0.8835
Final Q-Table:
[[0.73509189 0.77378094 0.77378094 0.73509189]
 [0.73509189 0.         0.81450625 0.77378094]
 [0.77378094 0.857375   0.77378094 0.81450625]
 [0.81450625 0.         0.77375638 0.77378075]
 [0.77378094 0.81450625 0.         0.73509189]
 [0.         0.         0.         0.        ]
 [0.         0.9025     0.         0.81450625]
 [0.         0.         0.         0.        ]
 [0.81450625 0.         0.857375   0.77378094]
 [0.81450625 0.9025     0.9025     0.        ]
 [0.857375   0.95       0.         0.857375  ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9025     0.95       0.857375  ]
 [0.9025     0.95       1.         0.9025    ]
 [0.         0.         0.         0.        ]]
