In [4]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pickle


def _first_visit_update(episode, q, returns_sum, returns_count, gamma):
    """Apply one first-visit Monte Carlo update to ``q`` in place.

    Args:
        episode: List of ``(state, action, reward)`` tuples in time order.
        q: Action-value table indexed as ``q[state, action]``; updated in place.
        returns_sum: Running sum of sampled returns per pair; updated in place.
        returns_count: Number of sampled returns per pair; updated in place.
        gamma: Discount factor applied to future rewards.
    """
    # Earliest time step at which each (state, action) pair occurs. Walking the
    # episode backwards while tracking a "seen" set would select the LAST
    # occurrence instead, so the first occurrence is resolved up front.
    first_visit = {}
    for t, (s, a, _) in enumerate(episode):
        first_visit.setdefault((s, a), t)

    G = 0.0
    for t in range(len(episode) - 1, -1, -1):
        s, a, r = episode[t]
        G = gamma * G + r  # return following time step t
        if first_visit[(s, a)] == t:
            returns_sum[s, a] += G
            returns_count[s, a] += 1
            # The count was just incremented, so it is >= 1 here and the
            # division is always safe.
            q[s, a] = returns_sum[s, a] / returns_count[s, a]


def _rolling_sum(values, window):
    """Return the trailing sum of ``values`` over at most ``window`` entries."""
    cumulative = np.concatenate(([0.0], np.cumsum(values)))
    end = np.arange(1, len(values) + 1)
    start = np.maximum(end - window, 0)
    return cumulative[end] - cumulative[start]


def run_mc(episodes, is_training, render, gamma=1.0, seed=None):
    """Train or evaluate a tabular first-visit Monte Carlo agent on FrozenLake 8x8.

    In training mode the Q-table starts at zero, is learned with an
    epsilon-greedy policy (epsilon decays linearly from 1.0 by 0.0001 per
    episode) and is pickled to ``frozen_lake8x8_mc.pkl``. In evaluation mode the
    Q-table is loaded from that file and followed greedily.

    In both modes a plot of the trailing 100-episode success count is written to
    ``frozen_lake8x8_mc.png``. NOTE: an evaluation run therefore overwrites the
    plot produced by a previous training run.

    Args:
        episodes: Number of episodes to run.
        is_training: ``True`` to learn and save a Q-table, ``False`` to load one.
        render: ``True`` to create the environment with ``render_mode='human'``.
        gamma: Discount factor for the return; the default ``1.0`` keeps the
            undiscounted return.
        seed: Optional seed for the exploration RNG and for the environment's
            action-space sampler; ``None`` leaves both unseeded.

    Returns:
        None.
    """
    env = gym.make(
        'FrozenLake-v1',
        map_name="8x8",
        is_slippery=False,
        render_mode='human' if render else None,
    )
    # try/finally guarantees the environment (and any render window) is closed
    # even when an episode, the plot or the pickle step raises.
    try:
        n_states = env.observation_space.n
        n_actions = env.action_space.n

        if is_training:
            q = np.zeros((n_states, n_actions))
            returns_sum = np.zeros((n_states, n_actions))
            # Counts start at zero: starting at one would bias every average
            # towards zero (the first sample would be stored as G / 2).
            returns_count = np.zeros((n_states, n_actions))
        else:
            # SECURITY: pickle.load can execute arbitrary code. Only load a
            # Q-table file that was produced by a trusted training run.
            with open('frozen_lake8x8_mc.pkl', 'rb') as f:
                q = pickle.load(f)

        epsilon = 1.0
        epsilon_decay = 0.0001
        rng = np.random.default_rng(seed)
        if seed is not None:
            # Random actions come from the action space's own sampler, so it
            # has to be seeded separately from ``rng``.
            env.action_space.seed(seed)
        reward_per_episode = np.zeros(episodes)

        for i in range(episodes):
            episode = []
            state = env.reset()[0]
            terminated = False
            truncated = False

            while not terminated and not truncated:
                if is_training and rng.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(q[state, :])

                new_state, reward, terminated, truncated, _ = env.step(action)
                episode.append((state, action, reward))
                state = new_state

            # The final reward of the episode is 1 only when the goal was reached.
            if reward == 1:
                reward_per_episode[i] = 1

            if is_training:
                _first_visit_update(episode, q, returns_sum, returns_count, gamma)

            epsilon = max(epsilon - epsilon_decay, 0)

        # Plot the number of successful episodes in the trailing 100 episodes.
        sum_rewards = _rolling_sum(reward_per_episode, 100)
        plt.plot(sum_rewards)
        plt.xlabel('Episode')
        plt.ylabel('100-episode Reward Sum')
        plt.title('FrozenLake 8x8 Monte Carlo')
        plt.savefig('frozen_lake8x8_mc.png')
        plt.close()

        if is_training:
            with open("frozen_lake8x8_mc.pkl", "wb") as f:
                pickle.dump(q, f)
    finally:
        env.close()


In [None]:
if __name__ == '__main__':
    # Phase 1 - learn a Q-table over 15000 episodes with rendering disabled.
    run_mc(episodes=15000, is_training=True, render=False)

    # Phase 2 - replay the saved Q-table for a single rendered episode.
    run_mc(episodes=1, is_training=False, render=True)
