Implementing a parallel Q-learning algorithm

In [24]:
pip install gym




In [38]:
import gym
import numpy as np

In [39]:
# Create the FrozenLake environment
env = gym.make("FrozenLake-v1")

In [40]:
# Initialize the Q-table with zeros
num_states = env.observation_space.n
num_actions = env.action_space.n
Q = np.zeros((num_states, num_actions))

In [51]:
# Hyperparameters
learning_rate = 0.1
discount_factor = 0.0099
num_episodes = 10000
max_steps_per_episode = 100

In [46]:
# Exploration parameters
epsilon = 1.0
epsilon_decay = 0.995
min_epsilon = 0.01

In [52]:
# Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    for step in range(max_steps_per_episode):
        # Epsilon-greedy policy for action selection
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            action = np.argmax(Q[state, :])  # Exploit

        # Take the selected action and observe the new state and reward
        next_state, reward, done, _ = env.step(action)

        # Update the Q-table
        Q[state, action] = (1 - learning_rate) * Q[state, action] + \
            learning_rate * (reward + discount_factor * np.max(Q[next_state, :]))

        total_reward += reward
        state = next_state

        if done:
            break

    # Decay epsilon to reduce exploration over time
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    if episode % 100 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}")

Episode 0, Total Reward: 0.0
Episode 100, Total Reward: 0.0
Episode 200, Total Reward: 0.0
Episode 300, Total Reward: 0.0
Episode 400, Total Reward: 0.0
Episode 500, Total Reward: 0.0
Episode 600, Total Reward: 0.0
Episode 700, Total Reward: 0.0
Episode 800, Total Reward: 0.0
Episode 900, Total Reward: 0.0
Episode 1000, Total Reward: 0.0
Episode 1100, Total Reward: 0.0
Episode 1200, Total Reward: 0.0
Episode 1300, Total Reward: 0.0
Episode 1400, Total Reward: 0.0
Episode 1500, Total Reward: 0.0
Episode 1600, Total Reward: 0.0
Episode 1700, Total Reward: 0.0
Episode 1800, Total Reward: 1.0
Episode 1900, Total Reward: 0.0
Episode 2000, Total Reward: 0.0
Episode 2100, Total Reward: 0.0
Episode 2200, Total Reward: 0.0
Episode 2300, Total Reward: 0.0
Episode 2400, Total Reward: 0.0
Episode 2500, Total Reward: 0.0
Episode 2600, Total Reward: 0.0
Episode 2700, Total Reward: 0.0
Episode 2800, Total Reward: 0.0
Episode 2900, Total Reward: 0.0
Episode 3000, Total Reward: 0.0
Episode 3100, Total 

In [48]:
# Evaluate the trained Q-table
num_evaluation_episodes = 100
total_rewards = []

for episode in range(num_evaluation_episodes):
    state = env.reset()
    done = False
    episode_reward = 0

    while not done:
        action = np.argmax(Q[state, :])
        state, reward, done, _ = env.step(action)
        episode_reward += reward

    total_rewards.append(episode_reward)

average_reward = np.mean(total_rewards)
print(f"Average Reward Over {num_evaluation_episodes} Episodes: {average_reward}")

Average Reward Over 100 Episodes: 0.75
