# Value Iteration example

### Sources:
- http://www.cs.uu.nl/docs/vakken/b2ki/LastYear/Docs/Slides/mdps-handouts.pdf

In [None]:
from typing import Tuple, List
import numpy as np
import copy

In [None]:
class Maze:
    """Grid-world Markov decision process solved with value iteration.

    Every cell of the 2-D reward grid ``R`` is a state.  From a cell the agent
    can try to move up, right, left or down; a move that would leave the grid
    keeps the agent in its current cell.  The reward collected for a move is
    the ``R`` entry of the cell the agent ends up in.
    """

    def __init__(self, R: np.ndarray, end: List[Tuple[int, int]],
                 noise: float = 0.7, discount: float = 0.9, threshold: float = 0.0001):
        """Store the model parameters and start with an all-zero utility grid.

        Args:
            R: 2-D array of rewards, indexed as ``R[row, col]``.
            end: ``(row, col)`` positions of the terminal states; their
                utility is never updated and therefore stays 0.
            noise: probability that the chosen move is the one actually
                executed.  The remaining ``1 - noise`` is split evenly over
                the other moves.
            discount: discount factor applied to the utility of the next state.
            threshold: value iteration stops once the largest utility change
                in a sweep is no greater than this value.
        """
        self.R = R
        self.utility = np.zeros(R.shape)
        self.end_states = end

        self.noise = noise
        self.discount = discount
        self.threshold = threshold

    def __calc_action_value(self, action: Tuple[int, int],
                            other_actions: Tuple[Tuple[int, int], ...]) -> float:
        """Return the expected value of attempting to move to ``action``.

        Args:
            action: position reached when the intended move succeeds.
            other_actions: positions reached by the remaining moves, each of
                which is taken with an equal share of ``1 - noise``.
        """
        total_value = self.noise * (self.R[action] + (self.discount * self.utility[action]))

        dist_chance = (1 - self.noise) / len(other_actions)

        for noise_action in other_actions:
            total_value += dist_chance * (self.R[noise_action] + (self.discount * self.utility[noise_action]))
        return total_value

    def __get_action_positions(self, current_pos: Tuple[int, int]) -> Tuple[Tuple[int, int], ...]:
        """Return the positions reached by moving up, right, left and down.

        A move that would cross the border of the grid maps back to
        ``current_pos``, i.e. the agent stays where it is.
        """
        row, col = current_pos

        up = (row - 1, col) if row - 1 >= 0 else current_pos
        right = (row, col + 1) if col + 1 < self.R.shape[1] else current_pos
        left = (row, col - 1) if col - 1 >= 0 else current_pos
        down = (row + 1, col) if row + 1 < self.R.shape[0] else current_pos
        return up, right, left, down

    def value_iteration(self) -> None:
        """Run value iteration until the utilities have converged.

        Each sweep computes a complete new utility grid from the previous one
        (synchronous update) and replaces ``self.utility`` with it.  Sweeps
        are repeated until the largest absolute change of any cell is no
        greater than ``self.threshold``.
        """
        # Build the terminal-state lookup once instead of scanning the list
        # for every cell in every sweep.
        terminal = {tuple(state) for state in self.end_states}
        # All non-terminal positions, in row-major order.
        positions = [
            (i, j)
            for i in range(self.R.shape[0])
            for j in range(self.R.shape[1])
            if (i, j) not in terminal
        ]

        delta = np.inf
        while delta > self.threshold:
            delta = 0
            # Terminal states are never written, so they keep utility 0.
            new_utility = np.zeros(self.utility.shape)
            for pos in positions:
                # Positions reachable from here, one per possible move.
                actions = self.__get_action_positions(pos)
                # Value of the best move; for each candidate the other three
                # outcomes act as the noise outcomes.
                highest_utility = max(
                    self.__calc_action_value(action, actions[:index] + actions[index + 1:])
                    for index, action in enumerate(actions)
                )
                new_utility[pos] = highest_utility
                # Track the largest change of this sweep.
                delta = max(delta, abs(self.utility[pos] - highest_utility))

            # new_utility is freshly allocated every sweep, so it can be
            # adopted directly without copying.
            self.utility = new_utility

    def show_utility(self) -> None:
        """Print the utility grid as a table, values rounded to 2 decimals."""
        # Every cell is printed as 6 characters plus ' | ' (9 in total); the
        # separator spans all cells plus the leading '|'.
        separator = "-" * (1 + 9 * self.utility.shape[1])
        for row_values in self.utility:
            print(separator)
            print("| " + "".join(str(round(value, 2)).ljust(6) + " | " for value in row_values))
        print(separator)

    def show_policy(self):
        """Placeholder for printing the policy; not implemented yet."""
        ...

In [None]:
class Agent:
    """Pairs a grid position with the maze model it belongs to."""

    def __init__(self, pos: Tuple[int, int], model: Maze):
        """Remember the agent's position and its environment.

        Args:
            pos: ``(row, col)`` position of the agent, stored as ``self.pos``.
            model: the maze the agent lives in, stored as ``self.env``.
        """
        self.pos, self.env = pos, model

In [None]:
# Reward grid; Maze reads the reward of a move from the cell the move ends in.
rewards = np.array([[-1, -1, -1, 40],
                    [-1, -1, -10, -10],
                    [-1, -1, -1, -1],
                    [10, -2, -1, -1]])

# (row, col) positions of the terminal cells and of the agent's start cell
terminal_states = [(0, 3), (3, 0)]
start_state = (3, 2)

# build maze and agent
maze = Maze(R=rewards, end=terminal_states, discount=0.9)
agent = Agent(pos=start_state, model=maze)

# solve the maze and print the resulting utilities
maze.value_iteration()
maze.show_utility()
# maze.show_policy()

### Unittest