# Inleveropgave 2: Model-Free Prediction and Control

## Model-Free Prediction

### Sources
- https://towardsdatascience.com/reinforcement-learning-rl-101-with-python-e1aa0d37d43b

In [1]:
from typing import Tuple, List
from collections import defaultdict
from utils import Maze, show_utility, value_iteration

import random as rd
import numpy as np

In [2]:
start_state = (3, 2)
terminal_states = [(0, 3), (3, 0)]

rewards = np.array([[-1, -1, -1, 40],
                    [-1, -1, -10, -10],
                    [-1, -1, -1, -1],
                    [10, -2, -1, -1]])

# initialize the Maze
maze = Maze(rewards, terminal_states, start_state)

# use the value function to get the utilities
values = value_iteration(maze, discount=0.9, p_action=1.0)
values

array([[30.5   , 35.    , 40.    ,  0.    ],
       [26.45  , 30.5   , 35.    , 40.    ],
       [22.805 , 26.45  , 22.805 , 26.    ],
       [ 0.    , 22.805 , 19.5245, 22.4   ]])

### Generating episodes
Voor het genereren van een episode worden er twee nieuwe functies geïntroduceerd. De eerste functie kan gebruikt worden om een random episode te genereren, terwijl de tweede gebruikt maakt van de eerder uitgewerkte value iteration om zodanig een bepaalde policy te kunnen volgen.

Deze functies hebben dezelfde hoeveelheid parameters. Echter, niet al deze parameters worden gebruikt (dit zorgt voor iets meer consistency).

In [3]:
def generate_episode_random(env: Maze, values: np.ndarray, discount: float):
    """"""
    steps = []  # holds Tuples with the states, actions and rewards
    pos = env.get_random_position()
    
    # break if the chosen state is a terminal state
    while pos not in env.end_states:

        next_actions = env.get_next_action_positions(pos)
        # choose a random action and gather
        action = rd.choice(next_actions)
        
        reward = env.R[action]
        steps.append((pos, action, reward))
        # update the pos to the taken action
        pos = action
        
    # save the latest pos with all extra data
    steps.append((pos, (), 0))

    return steps

In [4]:
def generate_episode_optimal(env: Maze, values: np.ndarray, discount: float):
    """"""
    steps = []  # holds Tuples with the states, actions and rewards
    pos = env.get_random_position()
    
    # break if the chosen state is a terminal state
    while pos not in env.end_states:
    
        # get the next action based on the optimal policy
        next_actions = env.get_next_action_positions(pos)
        action_values = []
        
        for action in next_actions:
            action_values.append(env.R[action] + (discount * values[action]))
        
        # get the index of the max elements 
        max_elem = max(action_values)
        policy_actions = [act for i, act in zip(action_values, next_actions) if i == max_elem]
        
        # choose a random action and gather
        action = rd.choice(policy_actions)
        
        reward = env.R[action]
        steps.append((pos, action, reward))
        # update the pos to the taken action
        pos = action
        
    # save the latest pos with all extra data
    steps.append((pos, (), 0))

    return steps

### Monte-Carlo Policy Evaluation

In [5]:
def monte_carlo_policy_evaluation(env: Maze, values: np.ndarray, policy: callable, discount: float = 0.9,
                              n_episodes: int = 1000, p_action: float = 0.7):
    """"""
    state_values = np.zeros(env.R.shape)
    state_returns = defaultdict(list)

    for _ in range(n_episodes):
        # generate a new episode with a certain policy
        episode = policy(env, values, discount)

        G = 0
        visited_states = []
        # looping over each step and 
        for pos, action, reward in episode[::-1]:
            G = discount * G + reward
            
            if pos not in visited_states:
                # update the the current state with the new return
                state_returns[pos].append(G)
                # calculate the average value
                state_values[pos] = np.mean(state_returns[pos])
                # update visited states
                visited_states.append(pos)
    
    return state_values

#### MC Random policy

In [6]:
random_values1 = monte_carlo_policy_evaluation(maze, values, policy=generate_episode_random, discount=1.0)
show_utility(random_values1)

-------------------------------------
| -2.71  | 5.85   | 14.75  | 0.0    | 
-------------------------------------
| 1.09   | 5.37   | 14.17  | 19.12  | 
-------------------------------------
| 4.77   | 4.26   | 4.53   | -0.1   | 
-------------------------------------
| 0.0    | 1.11   | -1.66  | -5.94  | 
-------------------------------------


In [7]:
random_values2 = monte_carlo_policy_evaluation(maze, values, policy=generate_episode_random, discount=0.9)
show_utility(random_values2)

-------------------------------------
| -1.3   | 6.12   | 18.82  | 0.0    | 
-------------------------------------
| -0.38  | 1.88   | 11.51  | 19.54  | 
-------------------------------------
| 3.56   | 0.82   | 0.07   | -0.96  | 
-------------------------------------
| 0.0    | 4.88   | -1.12  | -4.08  | 
-------------------------------------


#### MC Optimal policy

In [8]:
optim_values1 = monte_carlo_policy_evaluation(maze, values, policy=generate_episode_optimal, discount=1.0, n_episodes=100)
show_utility(optim_values1)

-------------------------------------
| 38.0   | 39.0   | 40.0   | 0.0    | 
-------------------------------------
| 37.0   | 38.0   | 39.0   | 40.0   | 
-------------------------------------
| 36.0   | 37.0   | 36.0   | 30.0   | 
-------------------------------------
| 0.0    | 36.0   | 35.0   | 29.0   | 
-------------------------------------


In [9]:
optim_values1 = monte_carlo_policy_evaluation(maze, values, policy=generate_episode_optimal, discount=0.9, n_episodes=100)
show_utility(optim_values1)

-------------------------------------
| 30.5   | 35.0   | 40.0   | 0.0    | 
-------------------------------------
| 26.45  | 30.5   | 35.0   | 40.0   | 
-------------------------------------
| 22.81  | 26.45  | 22.8   | 26.0   | 
-------------------------------------
| 0.0    | 22.8   | 19.52  | 22.4   | 
-------------------------------------


### Temporal Difference Learning

In [10]:
def temporal_difference_learning(env: Maze, values: np.ndarray, policy: callable, step_size: float = 0.01,
                                 discount: float = 0.9, n_episodes: int = 1000):
    """"""
    state_values = np.zeros(env.R.shape)

    for _ in range(n_episodes):
        # generate a new episode with a certain policy
        episode = policy(env, values, discount)
        
        for pos, action, reward in episode:
            if pos in env.end_states:
                break
            else:
                value = state_values[pos]
                state_values[pos] = value + step_size * (reward + discount * state_values[action] - value)
    
    return state_values

#### TD Random policy

In [11]:
random_values1 = temporal_difference_learning(maze, values, step_size=0.01, policy=generate_episode_random, discount=1.0)
show_utility(random_values1)

-------------------------------------
| -7.34  | -4.2   | 6.41   | 0.0    | 
-------------------------------------
| -7.34  | -9.59  | -5.68  | 5.77   | 
-------------------------------------
| -1.28  | -7.34  | -11.12 | -9.89  | 
-------------------------------------
| 0.0    | -2.56  | -9.34  | -11.45 | 
-------------------------------------


In [12]:
random_values2 = temporal_difference_learning(maze, values, step_size=0.01, policy=generate_episode_random, discount=0.9)
show_utility(random_values2)

-------------------------------------
| -4.28  | -2.05  | 7.36   | 0.0    | 
-------------------------------------
| -4.06  | -6.2   | -4.01  | 2.1    | 
-------------------------------------
| 0.55   | -4.2   | -7.67  | -7.39  | 
-------------------------------------
| 0.0    | -0.53  | -5.53  | -6.88  | 
-------------------------------------


#### TD Optimal policy

In [13]:
optim_values1 = temporal_difference_learning(maze, values, step_size=0.01, policy=generate_episode_optimal, discount=1.0)
show_utility(optim_values1)

-------------------------------------
| 21.12  | 38.46  | 39.96  | 0.0    | 
-------------------------------------
| 9.5    | 33.63  | 14.72  | 33.58  | 
-------------------------------------
| 1.67   | 21.93  | 6.61   | 10.72  | 
-------------------------------------
| 0.0    | 3.5    | 0.38   | 1.38   | 
-------------------------------------


In [14]:
optim_values2 = temporal_difference_learning(maze, values, step_size=0.01, policy=generate_episode_optimal, discount=0.9)
show_utility(optim_values2)

-------------------------------------
| 13.99  | 34.37  | 39.95  | 0.0    | 
-------------------------------------
| 3.83   | 26.41  | 16.3   | 34.01  | 
-------------------------------------
| 1.02   | 15.27  | 4.13   | 9.27   | 
-------------------------------------
| 0.0    | 1.55   | -0.01  | 0.72   | 
-------------------------------------
