# Implement a 4x4 gridworld, find terminal with DP


<u>Group:</u><br>
$383321$<br>
$2373769$ <br>
$2376182$<br>
$0709942R$

In [4]:
from copy import copy
from enum import Enum
import numpy as np
import matplotlib.pyplot as plt

## Gridworld
Gridworld for a simple finite MDP. Only actions with movements in cardinal directions are allowed.<br>
And the gird boundaries and obstacles are inaccessible by the agent.

In [27]:
class CellType(Enum):
    SPACE = 0
    OBSTACLE = 1
    
class Actions(Enum):
    UP = 0
    DOWN = 1
    LEFT = 2
    RIGHT = 3

class State(object):
    def __init__(self, row, col):
        self.row = row
        self.col = col
        
    def __eq__(self, other):
        return self.row == other.row and self.col == other.col
    
    def __str__(self):
        return str((self.row, self.col))
    
        
class GridWorld(object):
        
    def __init__(self, height=4, width=4):
        self.start = State(height-1, 0)
        self.goal  = State(0, 0)
        self.environment = self._init_environment(height, width, True)
        self.rewards = self._init_rewards(self.environment)
        self.shape = (len(self.environment),len(self.environment[0]))
        self.agent_state = self.start 
        
    # Add agent to environment using 4 as agen't position
    def __str__(self):
        environment = self.environment.copy()
        environment[self.agent_state.row, self.agent_state.col] = 4
        return str(environment)
        
        
    def _init_rewards(self, environment):
        rewards = {}
        for row in range(len(environment)):
            for col in range(len(environment[row,:])):
                if environment[row, col] != CellType.OBSTACLE.value:
                    cardinal_states = self.get_cardinal_states(State(row, col))
                    rewards[row, col] = {}
                    for neighbour in cardinal_states:
                        reward = -1
                        if neighbour == self.goal:
                            reward = 0
                        rewards[row, col][neighbour.row, neighbour.col] = reward
        return rewards
    
                    
    def _init_environment(self, height, width, no_obstacles = False):
        environment = []
        for row in range(height):
            new_col = []
            for col in range(width):         
                new_col.append(CellType.SPACE.value)
            environment.append(new_col)
        return np.flip(np.array(environment),axis=0)
    
    def get_agent_state(self, agent):
        return self.agent_state
    
    def get_cardinal_states(self, state):
        return [State(state.row + 1, state.col), State(state.row - 1, state.col),
                State(state.row, state.col + 1), State(state.row, state.col - 1)]

    
    def get_available_actions(self):
        return [Actions.DOWN, Actions.UP, Actions.LEFT, Actions.RIGHT]
    
    def state_reward_from_action(self, state, action): 
        from_state = state
        if action == Actions.DOWN:
            new_state = State(state.row + 1, state.col)
        if action == Actions.UP:
            new_state = State(state.row - 1, state.col)
        if action == Actions.LEFT:
            new_state = State(state.row, state.col - 1)
        if action == Actions.RIGHT:
            new_state = State(state.row, state.col + 1)
        return new_state, self.rewards[from_state.row, from_state.col][new_state.row, new_state.col]
        
    def take_action(self, action): 
        from_state = self.agent_state
        if action == Actions.DOWN:
            self.agent_state = State(self.agent_state.row + 1, self.agent_state.col)
        if action == Actions.UP:
            self.agent_state = State(self.agent_state.row - 1, self.agent_state.col)
        if action == Actions.LEFT:
            self.agent_state = State(self.agent_state.row, self.agent_state.col - 1)
        if action == Actions.RIGHT:
            self.agent_state = State(self.agent_state.row, self.agent_state.col + 1)
        return self.rewards[from_state.row, from_state.col][self.agent_state.row, self.agent_state.col]
        

## Policy evalution
Version 1: 2-array version <br>
Version 2: In-place version <br><br>
$v_0$ is intialized arbitrarily, except the terminal state, must be given value 0.

In [103]:
def two_array_policy_evaluation(grid_world, gamma= 1):
    #Arbitraraly initialisased value function
    v = np.zeros(grid_world.shape)
    v_prev = np.zeros(grid_world.shape)
    #Loop through all states (Env cells)
    for row in range (grid_world.shape[0]): 
        for col in range(grid_world.shape[1]):
            if grid_world.agent_state == grid_world.goal:
                return Codehere

            for action in grid_world.get_available_actions():
                new_state, reward = grid_world.state_reward_from_action(State(row, col), action)
                discounted_value = 0
                # if state is within the domain
                if new_state.row < len(v_prev) and new_state.col < len(v_prev):
                    discounted_value = v_prev[new_state.row, new_state.col]
                    
                # Uniform random policy = 0.25 for 4 cardinal positions   
                v[row, col] = 0.25 * (reward + gamma * discounted_value)
                v_prev[row, col] = v[row, col]
                
    print(v)         

In [104]:
two_array_policy_evaluation(GridWorld())

[[-0.25 -0.25 -0.25 -0.25]
 [-0.25 -0.25 -0.25 -0.25]
 [-0.25 -0.25 -0.25 -0.25]
 [-0.25 -0.25 -0.25 -0.25]]
