In [2]:
import numpy as np
import gym

# Initialize FrozenLake environment
env = gym.make("FrozenLake-v1", is_slippery=False, render_mode=None)  

In [3]:
# Define Q-learning parameters
alpha = 0.1   # Learning rate
gamma = 0.99  # Discount factor
epsilon = 1.0 # Initial exploration rate
epsilon_min = 0.01  # 🔥 Correctly defined now
epsilon_decay = 0.995
episodes = 5000


In [4]:
# Initialize Q-table
state_size = env.observation_space.n
action_size = env.action_space.n
Q_table = np.zeros((state_size, action_size))

In [5]:
for episode in range(episodes):
    state, _ = env.reset()  # Reset environment for a new episode
    done = False

    while not done:
        # Choose action using epsilon-greedy policy
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(Q_table[state, :])  # Exploit

        # Take action and observe the outcome
        new_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated  # Fix Gym API change

        # Q-learning update rule (Bellman Equation)
        Q_table[state, action] += alpha * (reward + gamma * np.max(Q_table[new_state, :]) - Q_table[state, action])

        state = new_state  # Move to the next state

    # 🔥 Decay epsilon after every episode
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

    # Print progress every 500 episodes
    if episode % 500 == 0:
        print(f"Episode {episode}, Epsilon: {epsilon:.4f}")

print("Training completed!")



Episode 0, Epsilon: 0.9950
Episode 500, Epsilon: 0.0812
Episode 1000, Epsilon: 0.0100
Episode 1500, Epsilon: 0.0100
Episode 2000, Epsilon: 0.0100
Episode 2500, Epsilon: 0.0100
Episode 3000, Epsilon: 0.0100
Episode 3500, Epsilon: 0.0100
Episode 4000, Epsilon: 0.0100
Episode 4500, Epsilon: 0.0100
Training completed!


In [6]:
test_episodes = 10
for episode in range(test_episodes):
    state, _ = env.reset()
    done = False
    total_reward = 0
    print(f"\nEpisode {episode+1}:")

    while not done:
        action = np.argmax(Q_table[state, :])  # Always exploit learned policy
        new_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        total_reward += reward
        state = new_state

    print(f"Total Reward: {total_reward}")

env.close()



Episode 1:
Total Reward: 0.0

Episode 2:
Total Reward: 0.0

Episode 3:
Total Reward: 0.0

Episode 4:
Total Reward: 0.0

Episode 5:
Total Reward: 0.0

Episode 6:
Total Reward: 0.0

Episode 7:
Total Reward: 0.0

Episode 8:
Total Reward: 0.0

Episode 9:
Total Reward: 0.0

Episode 10:
Total Reward: 0.0
