## WEEK 9: MDP & MONTE CARLO METHODS

In [13]:
import gym

# Build the CliffWalking-v0 environment through gym's registry; the cells below read
# its observation/action space sizes and step through it.
env = gym.make('CliffWalking-v0')


In [14]:
import numpy as np

# Training hyperparameters: episode budget and constant step size.
num_episodes, alpha = 500, 0.1

# Action-value table: one row per state, one column per action, all entries zero.
Q = np.zeros(shape=(env.observation_space.n, env.action_space.n))

# Define a function to choose an action based on the Q-table and the epsilon-greedy policy
def epsilon_greedy(Q, state, epsilon):
    """Pick an action for `state` with an epsilon-greedy rule.

    A uniform draw from [0, 1) is compared against `epsilon`: when the draw is
    smaller, an action is sampled from the global `env`'s action space;
    otherwise the index of the largest entry in row `state` of `Q` is returned
    (np.argmax returns the first index on ties).
    """
    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(Q[state, :])

# Every-visit, constant-alpha Monte Carlo control with a decaying epsilon-greedy policy.
gamma = 0.99  # discount factor applied to future rewards

# Loop over the episodes
for episode in range(num_episodes):
    state = env.reset()
    done = False
    # Exploration decays as 1 / (episode + 1): the first episode is fully random,
    # later episodes are increasingly greedy.
    epsilon = 1.0 / (episode + 1)
    trajectory = []  # (state, action, reward) for every step of this episode

    # Play the episode until termination
    while not done:
        action = epsilon_greedy(Q, state, epsilon)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward))
        state = next_state

    # Walk the episode backwards so the discounted return can be built incrementally:
    # G_t = r_t + gamma * G_{t+1}.
    # The update is applied one step at a time on purpose. A vectorised
    # `Q[states, actions] += ...` keeps only a single update for each repeated
    # (state, action) pair and silently discards the others.
    G = 0.0
    for s, a, r in reversed(trajectory):
        G = r + gamma * G
        Q[s, a] += alpha * (G - Q[s, a])


In [17]:
# Roll out the greedy policy derived from Q and count how many steps it takes.
# A greedy policy can cycle between states without ever ending the episode, so the
# rollout is capped instead of looping on `done` alone.
max_eval_steps = 10000

state = env.reset()
done = False
steps = 0
while not done and steps < max_eval_steps:
    action = np.argmax(Q[state, :])
    state, reward, done, _ = env.step(action)
    steps += 1

if done:
    print("Number of steps to reach the goal state:", steps)
else:
    print("Greedy policy did not finish the episode within", max_eval_steps, "steps")


Number of steps to reach the goal state: 17


In [18]:
# NOTE: this prints the configured episode budget (num_episodes); it is not a measured
# convergence point.
print("Number of episodes needed to learn the optimal policy:", num_episodes)


Number of episodes needed to learn the optimal policy: 500


### On-policy first-visit MC control (for ε-soft policies), with ε = 0.1


In [None]:
import gym
import numpy as np

# Fresh CliffWalking environment for the first-visit MC experiment.
env = gym.make('CliffWalking-v0')

# Hyperparameters
num_episodes = 100  # number of training episodes
epsilon = 0.1       # exploration probability of the epsilon-greedy policy
gamma = 1.0         # discount factor
alpha = 0.1         # step size

# Q holds the action-value estimates and N the per-(state, action) counters;
# both start at zero with one row per state and one column per action.
Q = np.zeros((env.observation_space.n, env.action_space.n))
N = np.zeros(Q.shape)

# define epsilon-greedy policy
def epsilon_greedy_policy(Q, state, epsilon):
    """Return an action for `state` under an epsilon-greedy rule.

    One uniform draw from [0, 1) decides the branch: below `epsilon` the action
    is sampled from the global `env`'s action space, otherwise it is the index
    of the maximum entry of `Q[state]` (first index on ties).
    """
    take_random_action = np.random.uniform(0, 1) < epsilon
    if not take_random_action:
        # exploit: action with the largest Q-value
        return np.argmax(Q[state])
    # explore: uniformly sampled action
    return env.action_space.sample()

# iterate over episodes
for i in range(num_episodes):
    episode = []
    state = env.reset()
    done = False

    # generate an episode by following the epsilon-greedy policy
    while not done:
        action = epsilon_greedy_policy(Q, state, epsilon)
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))
        state = next_state

    # Walk the episode backwards, accumulating the discounted return
    # G_t = r_t + gamma * G_{t+1}. Because later steps are processed first, each
    # dict entry is overwritten until it holds the return that follows the
    # EARLIEST occurrence of that (state, action) pair, i.e. its first visit.
    G = 0.0
    first_visit_return = {}
    for s, a, r in reversed(episode):
        G = r + gamma * G
        first_visit_return[(s, a)] = G

    # First-visit update: each pair seen in the episode contributes exactly one
    # return, folded into Q as an incremental sample average (step size 1 / N).
    for (s, a), G_first in first_visit_return.items():
        N[s][a] += 1
        Q[s][a] += (G_first - Q[s][a]) / N[s][a]
        
# evaluate the learned policy
# The rollout follows the greedy policy from Q. A greedy policy can cycle between
# states without ever ending the episode, so the loop is capped rather than relying
# on `done` alone.
max_eval_steps = 10000

state = env.reset()
steps = 0
done = False
while not done and steps < max_eval_steps:
    action = np.argmax(Q[state])
    next_state, reward, done, _ = env.step(action)
    state = next_state
    steps += 1

if done:
    # NOTE: `steps` is the length of this single greedy rollout.
    print("Number of steps needed to learn optimal policy:", steps)
else:
    print("Greedy policy did not finish the episode within", max_eval_steps, "steps")
print("Number of episodes:", num_episodes)


  deprecation(
  deprecation(
