In [10]:
import gym
import numpy as np

In [11]:
# Parameters
gamma = 1.0
alpha = 0.25
epsilon = 0.29
no_of_episodes = 14697
seed = 741684
amap = ["SFFFHFFFFFFFFFFG"]


In [15]:
# Initialize the environment
desc = np.array([list(row) for row in amap])
env = gym.make("FrozenLake-v1", desc=desc, is_slippery=False)  # Use is_slippery=False for more deterministic behavior
env = env.unwrapped
env.seed(seed)
np.random.seed(seed)


In [16]:
# Initialize Q-table
q_table = np.zeros((env.observation_space.n, env.action_space.n))

def epsilon_greedy_policy(state, epsilon):
    if np.random.rand() < epsilon:
        return env.action_space.sample()  # Explore: random action
    else:
        return np.argmax(q_table[state])  # Exploit: action with max Q-value


In [17]:
# Train the SARSA agent
for episode in range(no_of_episodes):
    state = env.reset()
    action = epsilon_greedy_policy(state, epsilon)

    done = False
    while not done:
        next_state, reward, done, _ = env.step(action)
        next_action = epsilon_greedy_policy(next_state, epsilon)

        # Update Q-value using SARSA update rule
        q_table[state, action] += alpha * (reward + gamma * q_table[next_state, next_action] - q_table[state, action])

        state = next_state
        action = next_action

    if episode % 1000 == 0:
        print(f"Episode {episode}/{no_of_episodes} completed.")
        print(f"Current Q-table after {episode} episodes:")
        print(q_table)

# Print final Q-table
print("Final Q-table:")
print(q_table)


Episode 0/14697 completed.
Current Q-table after 0 episodes:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Episode 1000/14697 completed.
Current Q-table after 1000 episodes:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Episode 2000/14697 completed.
Current Q-table after 2000 episodes:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
Episode 3000/14697 completed.
Current Q-table after 3000 episodes:
[[0. 0. 0. 0.]


In [19]:
# Evaluate the policy
def evaluate_policy(env, q_table, episodes=100):
    total_rewards = 0
    max_steps_per_episode = 100  # Set a limit to the number of steps per episode to avoid infinite loops
    
    for _ in range(episodes):
        state = env.reset()
        done = False
        steps = 0
        while not done and steps < max_steps_per_episode:
            action = np.argmax(q_table[state])
            next_state, reward, done, _ = env.step(action)
            total_rewards += reward
            state = next_state
            steps += 1
    
    average_reward = total_rewards / episodes
    return average_reward

# Run the evaluation
average_reward = evaluate_policy(env, q_table)
print(f"Average reward over 100 episodes: {average_reward}")


Average reward over 100 episodes: 0.0
