# Monte Carlo Prediction and Control

## Imports

In [None]:
import gym
import numpy as np
from tqdm.notebook import trange

## Monte Carlo Prediction

In [None]:
def mc_prediction(env, policy, obs_space, num_episodes, alpha, gamma):
    """Estimate the state-value function of ``policy`` with first-visit Monte Carlo.

    Every episode is rolled out until the environment reports ``done``.
    Afterwards each state is updated once per episode, at its first
    occurrence, towards the discounted return observed from that time
    step onwards, using a constant step size.

    Args:
        env: Environment exposing ``reset() -> obs`` and
            ``step(action) -> (next_obs, reward, done, info)``.
        policy: Callable mapping an observation to an action.
        obs_space: Sized collection of observations. Only its length is
            used; observations must be valid integer indices into an
            array of that length.
        num_episodes: Number of episodes to sample.
        alpha: Step size of the incremental update.
        gamma: Discount factor applied per time step.

    Returns:
        1-D ``numpy`` array ``v`` where ``v[obs]`` is the value estimate.
    """
    v = np.zeros(len(obs_space))

    for _ in trange(num_episodes):
        done, obs = False, env.reset()
        obs_trajectory = []
        reward_trajectory = []

        # 1: interact with the environment to generate a trajectory
        while not done:
            action = policy(obs)
            next_obs, reward, done, _info = env.step(action)
            obs_trajectory.append(obs)
            reward_trajectory.append(reward)
            obs = next_obs

        # 2: compute the return G_t that follows every time step. Walking
        # the rewards backwards gives G_t = r_t + gamma * G_{t+1} in one pass.
        returns = np.zeros(len(reward_trajectory))
        running_return = 0.0
        for t in range(len(reward_trajectory) - 1, -1, -1):
            running_return = reward_trajectory[t] + gamma * running_return
            returns[t] = running_return

        # 3: first-visit update -- only the first occurrence of a state in
        # the episode contributes, and it uses the return from that step on
        # (not the return of the whole episode).
        visited = set()
        for step_obs, target in zip(obs_trajectory, returns):
            if step_obs in visited:
                continue
            visited.add(step_obs)
            v[step_obs] += alpha * (target - v[step_obs])

    return v

## Monte Carlo Control

In [None]:
def mc_control(env, obs_space, action_space, num_episodes, alpha, gamma, epsilon):
    """Learn action values and a greedy policy with first-visit Monte Carlo control.

    Episodes are generated with an epsilon-greedy policy over the current
    action-value table. After each episode every (state, action) pair is
    updated once, at its first occurrence, towards the discounted return
    observed from that time step onwards.

    Args:
        env: Environment exposing ``reset() -> obs``,
            ``step(action) -> (next_obs, reward, done, info)`` and
            ``action_space.sample()`` for random exploration.
        obs_space: Sized collection of observations; only its length is
            used. Observations must be valid integer row indices.
        action_space: Sized collection of actions; only its length is
            used. Actions must be valid integer column indices.
        num_episodes: Number of episodes to sample.
        alpha: Step size of the incremental update.
        gamma: Discount factor applied per time step.
        epsilon: Probability of taking a random action while learning.

    Returns:
        Tuple ``(policy, q)``: ``policy`` is a callable mapping an
        observation to the greedy action under the final ``q``, and ``q``
        is the ``(len(obs_space), len(action_space))`` action-value array.
    """
    # Initialization phase
    # ------------------------------------------------------------------------------
    # q as action-value function
    q = np.zeros(shape=(len(obs_space), len(action_space)))

    def behaviour_policy(obs):
        """Epsilon-greedy action selection over the current ``q``."""
        if np.random.rand() < epsilon:
            return env.action_space.sample()
        return q[obs].argmax()

    # Learning phase
    # ------------------------------------------------------------------------------
    for _ in trange(num_episodes):
        done, obs = False, env.reset()
        obs_trajectory = []
        action_trajectory = []
        reward_trajectory = []

        # 1: interact with the environment to generate a trajectory
        while not done:
            action = behaviour_policy(obs)
            next_obs, reward, done, _info = env.step(action)
            action_trajectory.append(action)
            obs_trajectory.append(obs)
            reward_trajectory.append(reward)
            obs = next_obs

        # 2: compute the return G_t that follows every time step. Walking
        # the rewards backwards gives G_t = r_t + gamma * G_{t+1} in one pass.
        returns = np.zeros(len(reward_trajectory))
        running_return = 0.0
        for t in range(len(reward_trajectory) - 1, -1, -1):
            running_return = reward_trajectory[t] + gamma * running_return
            returns[t] = running_return

        # 3: first-visit update, keyed on the (state, action) pair so that a
        # different action taken later in an already-seen state is still
        # credited with its own return.
        visited = set()
        for step_obs, step_action, target in zip(
            obs_trajectory, action_trajectory, returns
        ):
            pair = (step_obs, step_action)
            if pair in visited:
                continue
            visited.add(pair)
            q[step_obs, step_action] += alpha * (target - q[step_obs, step_action])

    # Greedy policy derived from the learned action values.
    policy_mapping = np.argmax(q, axis=1)

    def greedy_policy(obs):
        """Return the greedy action for ``obs`` under the final ``q``."""
        return policy_mapping[obs]

    return greedy_policy, q


## Test using FrozenLake

In [None]:
# Create the FrozenLake-v1 environment shared by the prediction and control runs below.
env = gym.make('FrozenLake-v1')

In [None]:
# Enumerate the discrete observations and actions as sets of integer ids
# 0..n-1, taken from the sizes reported by the environment.
obs_space = set(range(env.observation_space.n))
action_space = set(range(env.action_space.n))

### FrozenLake Monte Carlo Prediction

In [None]:
def policy(state):
    """Return the fixed, hand-picked action for ``state``.

    Only states 0..15 are defined; any other state raises ``KeyError``.
    """
    # One action per state, listed in state order. The rows of four
    # presumably mirror the 4x4 FrozenLake grid -- confirm against the env.
    actions_by_state = (
        2, 2, 1, 0,
        1, 1, 1, 1,
        2, 1, 1, 1,
        2, 2, 2, 2,
    )
    lookup = dict(enumerate(actions_by_state))
    return lookup[state]

In [None]:
# Evaluate the hand-written policy; the returned value array is not stored,
# so it is only visible as the cell's output.
mc_prediction(env=env, policy=policy, obs_space=obs_space, num_episodes=100000, alpha=0.01, gamma=0.99)

### FrozenLake Monte Carlo Control

In [None]:
# Learn action values with epsilon-greedy exploration. Note that this rebinds
# the global name `policy`, replacing the hand-written policy function above
# with the learned greedy policy.
policy, q = mc_control(env=env, 
                       obs_space=obs_space, 
                       action_space=action_space, 
                       num_episodes=100000, 
                       alpha=0.1, 
                       gamma=0.99, 
                       epsilon=0.2)

In [None]:
# Show, for every observation, the learned action values and the greedy action.
for obs in obs_space:
    print(f'Observation: {obs}, q-values: {q[obs]}, action: {policy(obs)}')