# Inleveropgave 2: Model-Free Prediction and Control

## Model-Free Prediction

### Sources
- https://towardsdatascience.com/reinforcement-learning-rl-101-with-python-e1aa0d37d43b

In [1]:
from typing import Tuple, List
from collections import defaultdict
from utils import Maze, show_utility, value_iteration

import random as rd
import numpy as np

In [2]:
start_state = (3, 2)
terminal_states = [(0, 3), (3, 0)]

rewards = np.array([[-1, -1, -1, 40],
                    [-1, -1, -10, -10],
                    [-1, -1, -1, -1],
                    [10, -2, -1, -1]])

# initialize the Maze
maze = Maze(rewards, terminal_states, start_state)

# use the value function to get the utilities
values = value_iteration(maze, discount=0.9, p_action=1.0)
values

array([[30.5   , 35.    , 40.    ,  0.    ],
       [26.45  , 30.5   , 35.    , 40.    ],
       [22.805 , 26.45  , 22.805 , 26.    ],
       [ 0.    , 22.805 , 19.5245, 22.4   ]])

### Generating episodes
Voor het genereren van een episode worden er twee nieuwe functies ge√Øntroduceerd. De eerste functie kan gebruikt worden om een random episode te genereren, terwijl de tweede gebruikt maakt van de eerder uitgewerkte value iteration om zodanig een bepaalde policy te kunnen volgen.

Deze functies hebben dezelfde hoeveelheid parameters. Echter, niet al deze parameters worden gebruikt (dit zorgt voor iets meer consistency).

In [3]:
def generate_episode_random(env: Maze, values: np.ndarray, discount: float, p_action: float):
    """Generates an episode based on the random policy.
    
    Here the p_action is not being used, because the actions are already random and therefore it won't really matter if we take a wrong turn
    Furthermore the discount is not being used.
    Both these parameters are set in the function because API consistency
    """
    steps = []  # holds Tuples with the states, actions and rewards
    pos = env.get_random_position()
    
    # break if the chosen state is a terminal state
    while pos not in env.end_states:

        next_actions = env.get_next_action_positions(pos)
        # choose a random action and get the reward for the action.
        action = rd.choice(next_actions)
        
        reward = env.R[action]
        steps.append((pos, action, reward))
        # update the pos to the taken action
        pos = action
        
    # save the latest pos with all extra data
    steps.append((pos, (), 0))

    return steps

In [4]:
def generate_episode_optimal(env: Maze, values: np.ndarray, discount: float, p_action: float):
    """Generates an episode based on the optimal policy."""
    steps = []  # holds Tuples with the states, actions and rewards
    pos = env.get_random_position()
    
    # break if the chosen state is a terminal state
    while pos not in env.end_states:
    
        # get the next action based on the optimal policy
        next_actions = env.get_next_action_positions(pos)
        action_values = []
        
        for action in next_actions:
            action_values.append(env.R[action] + (discount * values[action]))
        
        # get the index of the max elements
        max_elem = max(action_values)
        policy_actions = [act for i, act in zip(action_values, next_actions) if i == max_elem]
        
        # choose the desired action and check based on the p_action if the action is certain
        action = rd.choice(policy_actions)
        if p_action < rd.random():
            # whoops, the desired action cannot be taken, so choose one of the others
            chosen_index = np.argmax(action_values)
            # remove the earlier chosen action and choose a random action
            action = rd.choice(next_actions[:chosen_index] + next_actions[chosen_index + 1:])
        
        reward = env.R[action]
        steps.append((pos, action, reward))
        # update the pos to the taken action
        pos = action
        
    # save the latest pos with all extra data
    steps.append((pos, (), 0))

    return steps

### Monte-Carlo Policy Evaluation

In [5]:
def monte_carlo_policy_evaluation(env: Maze, values: np.ndarray, policy: callable, discount: float = 0.9,
                              n_episodes: int = 10000, p_action: float = 0.7):
    """"""
    state_values = np.zeros(env.R.shape)
    state_returns = defaultdict(list)

    for _ in range(n_episodes):
        # generate a new episode with a certain policy
        episode = policy(env, values, discount, p_action)

        G = 0
        visited_states = []
        # looping over each step and 
        for pos, action, reward in episode[::-1]:
            G = discount * G + reward
            
            if pos not in visited_states:
                # update the the current state with the new return
                state_returns[pos].append(G)
                # calculate the average value
                state_values[pos] = np.mean(state_returns[pos])
                # update visited states
                visited_states.append(pos)
    
    return state_values

#### MC Random policy

In [6]:
random_values1 = monte_carlo_policy_evaluation(maze, values, policy=generate_episode_random, discount=1.0)
show_utility(random_values1)

-------------------------------------
| 0.08   | 5.63   | 16.84  | 0.0    | 
-------------------------------------
| 1.78   | 5.35   | 15.25  | 19.95  | 
-------------------------------------
| 3.64   | 3.09   | 3.83   | 2.16   | 
-------------------------------------
| 0.0    | 2.83   | -0.74  | -4.19  | 
-------------------------------------


In [7]:
random_values2 = monte_carlo_policy_evaluation(maze, values, policy=generate_episode_random, discount=0.9)
show_utility(random_values2)

-------------------------------------
| -0.4   | 4.83   | 17.22  | 0.0    | 
-------------------------------------
| 0.11   | 1.18   | 10.37  | 19.96  | 
-------------------------------------
| 3.57   | 0.8    | -0.6   | 0.24   | 
-------------------------------------
| 0.0    | 3.33   | -2.1   | -3.93  | 
-------------------------------------


#### MC Optimal policy

In [8]:
optim_values1 = monte_carlo_policy_evaluation(maze, values, policy=generate_episode_optimal, discount=1.0)
show_utility(optim_values1)

-------------------------------------
| 34.45  | 37.12  | 39.54  | 0.0    | 
-------------------------------------
| 31.59  | 34.16  | 36.36  | 37.93  | 
-------------------------------------
| 27.69  | 31.49  | 28.85  | 26.69  | 
-------------------------------------
| 0.0    | 27.13  | 26.24  | 24.76  | 
-------------------------------------


In [9]:
optim_values1 = monte_carlo_policy_evaluation(maze, values, policy=generate_episode_optimal, discount=0.9)
show_utility(optim_values1)

-------------------------------------
| 24.56  | 32.18  | 39.52  | 0.0    | 
-------------------------------------
| 18.98  | 24.25  | 29.25  | 36.56  | 
-------------------------------------
| 14.21  | 18.29  | 14.27  | 19.76  | 
-------------------------------------
| 0.0    | 12.84  | 10.62  | 14.18  | 
-------------------------------------


### Temporal Difference Learning

In [10]:
def get_random_step(env: Maze, values: np.ndarray, pos: Tuple[int, int], discount: float, p_action: float):
    """Picks the next action based on the current state and the random policy."""
    next_actions = env.get_next_action_positions(pos)
    
    # choose a random action
    action = rd.choice(next_actions)
    reward = env.R[action]
    
    # return the current state, the action taken and the reward of the state after the action
    return pos, action, reward


def get_optimal_step(env: Maze, values: np.ndarray, pos: Tuple[int, int], discount: float, p_action: float):
    """Picks the next action based on the current state and the optimal policy."""
    
    # get the next action based on the optimal policy
    next_actions = env.get_next_action_positions(pos)
    action_values = []
    
    # calculate the value of the next actions based on the values calculated during the value iteration step
    for action in next_actions:
        action_values.append(env.R[action] + (discount * values[action]))

    # get the index of the max elements 
    max_elem = max(action_values)
    policy_actions = [act for i, act in zip(action_values, next_actions) if i == max_elem]
    
    # choose the desired action and check based on the p_action if the action is certain
    action = rd.choice(policy_actions)
    if p_action < rd.random():
        # whoops, the desired action cannot be taken, so choose one of the others
        chosen_index = np.argmax(action_values)
        # remove the earlier chosen action and choose a random action
        action = rd.choice(next_actions[:chosen_index] + next_actions[chosen_index + 1:])

    # gather the reward of the taken action
    reward = env.R[action]
    
    return pos, action, reward

In [11]:
def temporal_difference_learning(env: Maze, values: np.ndarray, policy: callable, step_size: float = 0.1,
                                 discount: float = 0.9, n_episodes: int = 10000, p_action: float = 0.7):
    """"""
    state_values = np.zeros(env.R.shape)

    for _ in range(n_episodes):
        # get the random first position
        state = env.get_random_position()

        while state not in env.end_states:
            
            # choose an action based on the policy
            state, action, reward = policy(env, values, state, discount, p_action)
            
            # update the value of the current_state
            state_values[state] = state_values[state] + step_size * (reward + discount * state_values[action] - state_values[state])
            
            # update the current state
            state = action

    return state_values   

#### TD Random policy

In [12]:
random_values1 = temporal_difference_learning(maze, values, policy=get_random_step, discount=1.0, p_action=0.7)
show_utility(random_values1)

-------------------------------------
| -14.3  | -9.42  | 2.55   | 0.0    | 
-------------------------------------
| -13.17 | -16.5  | -15.0  | -12.26 | 
-------------------------------------
| -5.68  | -12.05 | -19.58 | -22.13 | 
-------------------------------------
| 0.0    | -5.49  | -16.37 | -21.3  | 
-------------------------------------


In [13]:
random_values2 = temporal_difference_learning(maze, values, policy=get_random_step, discount=0.9, p_action=0.7)
show_utility(random_values2)

-------------------------------------
| -5.9   | -5.26  | 9.49   | 0.0    | 
-------------------------------------
| -5.29  | -10.28 | -6.25  | -4.22  | 
-------------------------------------
| -2.07  | -7.98  | -10.52 | -11.14 | 
-------------------------------------
| 0.0    | -3.36  | -8.2   | -9.58  | 
-------------------------------------


#### TD Optimal policy

In [14]:
optim_values1 = temporal_difference_learning(maze, values, policy=get_optimal_step, discount=1.0, p_action=0.7)
show_utility(optim_values1)

-------------------------------------
| 33.81  | 34.59  | 34.38  | 0.0    | 
-------------------------------------
| 29.91  | 31.19  | 32.73  | 29.18  | 
-------------------------------------
| 26.83  | 29.13  | 25.95  | 22.56  | 
-------------------------------------
| 0.0    | 25.21  | 25.11  | 24.19  | 
-------------------------------------


In [15]:
optim_values2 = temporal_difference_learning(maze, values, policy=get_optimal_step, discount=0.9, p_action=0.7)
show_utility(optim_values2)

-------------------------------------
| 23.23  | 26.64  | 34.47  | 0.0    | 
-------------------------------------
| 15.81  | 18.11  | 26.54  | 32.73  | 
-------------------------------------
| 12.13  | 12.42  | 11.51  | 18.41  | 
-------------------------------------
| 0.0    | 10.28  | 9.65   | 15.24  | 
-------------------------------------
