In [1]:
import gymnasium as gym
import torch

In [2]:
env = gym.make("LunarLander-v2", render_mode="rgp_array_list")
state, info = env.reset()

print(state)

state = tuple((state * 10).astype(int))
print(state)

n_action = env.action_space.n
print('Number of actions:', n_action)

[-0.00715313  1.4124196  -0.7245581   0.06662139  0.00829555  0.16412328
  0.          0.        ]
(0, 14, -7, 0, 0, 1, 0, 0)
Number of actions: 4


  logger.warn(


In [3]:
def run_episode(env, Q, epsilon, n_action):
    """
    Run a episode and performs epsilon-greedy policy
    @param env: OpenAI Gym environment
    @param Q: Q-function
    @param epsilon: the trade-off between exploration and exploitation
    @param n_action: action space
    @return: resulting states, actions and rewards for the entire episode
    """
    state, info = env.reset()
    rewards = []
    actions = []
    states = []
    is_done = False
    truncated = False
    while not (is_done or truncated):
        state_tuple = tuple((state * 10).astype(int))  # Convert state to a tuple
        probs = torch.ones(n_action) * epsilon / n_action
        best_action = torch.argmax(Q[state_tuple]).item()  # Use state_tuple as key
        probs[best_action] += 1.0 - epsilon
        action = torch.multinomial(probs, 1).item()
        actions.append(action)
        states.append(state_tuple)  # Append state_tuple
        state, reward, is_done, truncated, info = env.step(action)
        rewards.append(reward)
    return states, actions, rewards

In [4]:
from collections import defaultdict

def mc_control_epsilon_greedy(env, gamma, n_episode, epsilon):
    """
    Obtain the optimal policy with on-policy MC control with epsilon_greedy
    @param env: OpenAI Gym environment
    @param gamma: discount factor
    @param n_episode: number of episodes
    @param epsilon: the trade-off between exploration and exploitation
    @return: the optimal Q-function, and the optimal policy
    """
    n_action = env.action_space.n
    G_sum = defaultdict(float)
    N = defaultdict(int)
    Q = defaultdict(lambda: torch.empty(n_action))
    for episode in range(n_episode):
        if (episode + 1) % 1000 == 0:
            print("Training episode {}".format(episode+1))
        states_t, actions_t, rewards_t = run_episode(env, Q, epsilon, n_action)
        return_t = 0
        G = {}
        for state_t, action_t, reward_t in zip(states_t[::-1], actions_t[::-1], rewards_t[::-1]):
            return_t = gamma * return_t + reward_t
            G[(state_t, action_t)] = return_t
        for state_action, return_t in G.items():
            state, action = state_action

            G_sum[state_action] += return_t
            N[state_action] += 1
            Q[state][action] = G_sum[state_action] / N[state_action]
    policy = {}
    for state, actions in Q.items():
        policy[state] = torch.argmax(actions).item()
    return Q, policy

In [5]:
gamma = 1

n_episode = 100000
epsilon = 0.1

optimal_Q, optimal_policy = mc_control_epsilon_greedy(env, gamma, n_episode, epsilon)

Training episode 1000
Training episode 2000
Training episode 3000
Training episode 4000
Training episode 5000
Training episode 6000
Training episode 7000
Training episode 8000
Training episode 9000
Training episode 10000
Training episode 11000
Training episode 12000
Training episode 13000
Training episode 14000
Training episode 15000
Training episode 16000
Training episode 17000
Training episode 18000
Training episode 19000
Training episode 20000
Training episode 21000
Training episode 22000
Training episode 23000
Training episode 24000
Training episode 25000
Training episode 26000
Training episode 27000
Training episode 28000
Training episode 29000
Training episode 30000
Training episode 31000
Training episode 32000
Training episode 33000
Training episode 34000
Training episode 35000
Training episode 36000
Training episode 37000
Training episode 38000
Training episode 39000
Training episode 40000
Training episode 41000
Training episode 42000
Training episode 43000
Training episode 440

In [6]:
def simulate_episode(env, policy):
    state, info = env.reset()
    is_done = False
    truncated = False
    while not (is_done or truncated):
        state_tuple = tuple((state * 10).astype(int))  # Convert state to a tuple
        action = policy[state_tuple]  # Use state_tuple as key
        state, reward, is_done, truncated, info = env.step(action)
        return reward

In [7]:
n_episode = 50000
n_win_optimal = 0
n_lose_optimal = 0

for episode in range(n_episode):
    if (episode + 1) % 1000 == 0:
        print("Testing episode {}".format(episode+1))
    reward = simulate_episode(env, optimal_policy)
    if reward == 1:
        n_win_optimal += 1
    elif reward == -1:
        n_lose_optimal += 1

Testing episode 1000
Testing episode 2000
Testing episode 3000
Testing episode 4000
Testing episode 5000
Testing episode 6000
Testing episode 7000
Testing episode 8000
Testing episode 9000
Testing episode 10000
Testing episode 11000
Testing episode 12000
Testing episode 13000
Testing episode 14000
Testing episode 15000
Testing episode 16000
Testing episode 17000
Testing episode 18000
Testing episode 19000
Testing episode 20000
Testing episode 21000
Testing episode 22000
Testing episode 23000
Testing episode 24000
Testing episode 25000
Testing episode 26000
Testing episode 27000
Testing episode 28000
Testing episode 29000
Testing episode 30000
Testing episode 31000
Testing episode 32000
Testing episode 33000
Testing episode 34000
Testing episode 35000
Testing episode 36000
Testing episode 37000
Testing episode 38000
Testing episode 39000
Testing episode 40000
Testing episode 41000
Testing episode 42000
Testing episode 43000
Testing episode 44000
Testing episode 45000
Testing episode 460

In [8]:
print('Winning probability under the optimal policy: {}'.format(n_win_optimal/n_episode))

print('Losing probability under the optimal policy: {}'.format(n_lose_optimal/n_episode))

Winning probability under the optimal policy: 0.0
Losing probability under the optimal policy: 0.0
