In [1]:
import gymnasium as gym
import torch


# Build the FrozenLake-v1 environment on the built-in 4x4 map with slippery
# transitions enabled; this `env` object is reused by the cells below.
# NOTE(review): no visible cell calls env.render(), so render_mode="ansi_list"
# looks unused here -- confirm before relying on or removing it.
env = gym.make('FrozenLake-v1', desc=None, map_name="4x4", is_slippery=True, render_mode="ansi_list")

# `n_state` holds the observation *space object* (it prints as Discrete(16)),
# not an integer, so both values printed on the next line are the same object.
# NOTE(review): the second value was presumably meant to be the number of
# states (e.g. the space's `.n`) -- confirm intent before changing the output.
n_state = env.observation_space
print('State matrix:', n_state, 'number of state', n_state)

# Likewise `n_action` is the action space object (it prints as Discrete(4)).
n_action = env.action_space
print('number of action:', n_action)

State matrix: Discrete(16) number of state Discrete(16)
number of action: Discrete(4)


In [2]:
def run_episode(env, policy):
    """Roll out one complete episode while following a tabular policy.

    Unlike dynamic programming, Monte Carlo methods need the sampled
    trajectory itself, so every visited state and every received reward is
    recorded.

    Args:
        env: environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        policy: tensor indexed by state; ``policy[state].item()`` is cast to
            ``int`` and used as the action.

    Returns:
        A ``(states, rewards)`` pair of tensors. ``states`` holds the initial
        state followed by the state reached after each step, so it is one
        element longer than ``rewards``, which holds one reward per step.
    """
    state, _info = env.reset()
    states = [state]
    rewards = []
    terminated = False
    truncated = False
    while not (terminated or truncated):
        action = int(policy[state].item())
        state, reward, terminated, truncated, _info = env.step(action)
        # Keep the full trajectory: the state reached and the reward received.
        states.append(state)
        rewards.append(reward)
    # The environment is deliberately NOT closed here: the caller owns `env`
    # and reuses it for further episodes (mc_prediction_first_visit calls this
    # function once per episode on the same object). Closing is the owner's
    # job, after the last episode.
    return torch.tensor(states), torch.tensor(rewards)

In [3]:
def mc_prediction_first_visit(env, policy, gamma, n_episode):
    """Estimate the state values of ``policy`` with first-visit Monte Carlo.

    Args:
        env: environment passed unchanged to ``run_episode``.
        policy: 1-D tensor; its length defines the number of states and it is
            passed unchanged to ``run_episode``.
        gamma: discount factor applied at every backward step of the return.
        n_episode: number of episodes to sample.

    Returns:
        A 1-D float tensor with one entry per state: the average, over the
        episodes in which the state occurred before the episode's final
        state, of the return that followed its first occurrence. States that
        never occurred keep the value 0.
    """
    num_states = policy.shape[0]
    return_sums = torch.zeros(num_states)
    visit_counts = torch.zeros(num_states)

    for _ in range(n_episode):
        # Sample one full trajectory under the policy.
        trajectory, payoffs = run_episode(env, policy)

        episode_return = torch.zeros(num_states)
        visited = torch.zeros(num_states, dtype=torch.bool)
        running = 0

        # Walk the episode from the end towards the start. The final state is
        # dropped because no reward follows it, which pairs every remaining
        # state with the reward received on leaving it.
        backward_states = torch.flip(trajectory, dims=(0,))[1:]
        backward_rewards = torch.flip(payoffs, dims=(0,))
        for s, r in zip(backward_states, backward_rewards):
            running = gamma * running + r
            # Going backwards, an earlier occurrence of the same state
            # overwrites a later one, so what survives is the return that
            # followed the FIRST visit.
            episode_return[s] = running
            visited[s] = True

        # Accumulate this episode's first-visit returns (averaged at the end).
        return_sums[visited] += episode_return[visited]
        visit_counts[visited] += 1

    # Turn the sums into means, leaving never-visited states at zero.
    seen = visit_counts > 0
    return_sums[seen] = return_sums[seen] / visit_counts[seen]
    return return_sums

In [4]:
# gamma = 1 means returns are plain, undiscounted sums of rewards.
gamma = 1
# Number of sampled episodes averaged by the Monte Carlo estimate.
n_episode = 10000

# Policy under evaluation: one action index per state (16 entries, matching
# the Discrete(16) observation space printed in the first cell). Entries are
# stored as floats; run_episode casts each one to int before stepping.
optimal_policy = torch.tensor([0., 3., 0., 3., 0., 0., 0., 0., 3., 1., 0., 0., 0., 2., 1., 0.])
# NOTE(review): no seed is passed to env.reset() in the visible code and the
# env is created with is_slippery=True, so the printed estimate presumably
# varies from run to run -- confirm whether reproducibility is required.
value = mc_prediction_first_visit(env, optimal_policy, gamma, n_episode)
print('The value function calculated by first-visit MC prediction:\n', value)

The value function calculated by first-visit MC prediction:
 tensor([0.7252, 0.5179, 0.4311, 0.0000, 0.7252, 0.0000, 0.3547, 0.0000, 0.7252,
        0.7261, 0.6491, 0.0000, 0.0000, 0.7964, 0.8873, 0.0000])
