In [4]:
import numpy as np
import gym

def train_q_agent(environment, learn_rate, discount_factor, explore_prob, total_episodes):
    q_values = np.random.rand(environment.observation_space.n, environment.action_space.n)

    for episode in range(total_episodes):
        current_state = environment.reset()
        done = False

        while not done:
            if np.random.uniform(0, 1) < explore_prob:
                chosen_action = environment.action_space.sample()
            else:
                chosen_action = np.argmax(q_values[current_state, :])

            new_state, reward, done, _ = environment.step(chosen_action)
            q_values[current_state, chosen_action] = (1 - learn_rate) * q_values[current_state, chosen_action] + \
                                                     learn_rate * (reward + discount_factor * np.max(q_values[new_state, :]))

            current_state = new_state

    return q_values

def agent_performance(environment, q_values, num_trials=1000):
    total_reward = 0
    for _ in range(num_trials):
        current_state = environment.reset()
        done = False
        while not done:
            action = np.argmax(q_values[current_state, :])
            new_state, reward, done, _ = environment.step(action)
            total_reward += reward
            current_state = new_state

    avg_reward = total_reward / num_trials
    return avg_reward

# Create the FrozenLake environment
env = gym.make('FrozenLake-v1', is_slippery=True)

# Hyperparameters
alpha_values = [0.1, 0.5, 0.9]
gamma_values = [0.1, 0.5, 0.9]
epsilon_values = [0.1, 0.5, 0.9]
total_episodes = 10000

# Experiment and evaluate different hyperparameter settings
for alpha in alpha_values:
    for gamma in gamma_values:
        for epsilon in epsilon_values:
            q_values = train_q_agent(env, alpha, gamma, epsilon, total_episodes)
            avg_reward = agent_performance(env, q_values)
            print(f"Alpha: {alpha}, Gamma: {gamma}, Epsilon: {epsilon}, Average Reward: {avg_reward}")

# Close the environment
env.close()


Alpha: 0.1, Gamma: 0.1, Epsilon: 0.1, Average Reward: 0.013
Alpha: 0.1, Gamma: 0.1, Epsilon: 0.5, Average Reward: 0.016
Alpha: 0.1, Gamma: 0.1, Epsilon: 0.9, Average Reward: 0.019
Alpha: 0.1, Gamma: 0.5, Epsilon: 0.1, Average Reward: 0.006
Alpha: 0.1, Gamma: 0.5, Epsilon: 0.5, Average Reward: 0.035
Alpha: 0.1, Gamma: 0.5, Epsilon: 0.9, Average Reward: 0.016
Alpha: 0.1, Gamma: 0.9, Epsilon: 0.1, Average Reward: 0.035
Alpha: 0.1, Gamma: 0.9, Epsilon: 0.5, Average Reward: 0.043
Alpha: 0.1, Gamma: 0.9, Epsilon: 0.9, Average Reward: 0.061
Alpha: 0.5, Gamma: 0.1, Epsilon: 0.1, Average Reward: 0.021
Alpha: 0.5, Gamma: 0.1, Epsilon: 0.5, Average Reward: 0.058
Alpha: 0.5, Gamma: 0.1, Epsilon: 0.9, Average Reward: 0.0
Alpha: 0.5, Gamma: 0.5, Epsilon: 0.1, Average Reward: 0.016
Alpha: 0.5, Gamma: 0.5, Epsilon: 0.5, Average Reward: 0.0
Alpha: 0.5, Gamma: 0.5, Epsilon: 0.9, Average Reward: 0.0
Alpha: 0.5, Gamma: 0.9, Epsilon: 0.1, Average Reward: 0.016
Alpha: 0.5, Gamma: 0.9, Epsilon: 0.5, Average 