In [37]:
! pip install numpy pandas



In [1]:
import numpy as np
import random
import sys          # We use sys to get the max value of a float
import pandas as pd # We only use pandas for displaying tables nicely
pd.options.display.float_format = '{:,.3f}'.format



# World class and globals

This is how we generate a world and the global variables we use

In [39]:
# Globals:
ACTIONS = ("up", "down", "left", "right")

# Rewards, terminals and obstacles are characters:
REWARDS = {" ": -1, ".": 2, "+": 10, "-": -10}
TERMINALS = ("+", "-") # Note a terminal should also have a reward assigned
OBSTACLES = ("#")

# Discount factor
gamma = 1

# The probability of a random move:
rand_move_probability = 0

class World:
  def __init__(self, width, height):
    self.width = width
    self.height = height
    # Create an empty world where the agent can move to all cells
    self.grid = np.full((width, height), ' ', dtype='U1')

  def add_obstacle(self, start_x, start_y, end_x=None, end_y=None):
    """
    Create an obstacle in either a single cell or rectangle.
    """
    if end_x == None: end_x = start_x
    if end_y == None: end_y = start_y

    self.grid[start_x:end_x + 1, start_y:end_y + 1] = OBSTACLES[0]

  def add_reward(self, x, y, reward):
    assert reward in REWARDS, f"{reward} not in {REWARDS}"
    self.grid[x, y] = reward

  def add_terminal(self, x, y, terminal):
    assert terminal in TERMINALS, f"{terminal} not in {TERMINALS}"
    self.grid[x, y] = terminal

  def is_obstacle(self, x, y):
    if x < 0 or x >= self.width or y < 0 or y >= self.height:
      return True
    else:
      return self.grid[x ,y] in OBSTACLES

  def is_terminal(self, x, y):
    return self.grid[x ,y] in TERMINALS

  def get_reward(self, x, y):
    """
    Return the reward associated with a given location
    """
    return REWARDS[self.grid[x, y]]

  def get_next_state(self, current_state, action):
    """
    Get the next state given a current state and an action. The outcome can be
    stochastic  where rand_move_probability determines the probability of
    ignoring the action and performing a random move.
    """
    assert action in ACTIONS, f"Unknown acion {action} must be one of {ACTIONS}"

    x, y = current_state

    # If our current state is a terminal, there is no next state
    if self.grid[x, y] in TERMINALS:
      return None

    # Check of a random action should be performed:
    if np.random.rand() < rand_move_probability:
      action = np.random.choice(ACTIONS)

    if action == "up":      y -= 1
    elif action == "down":  y += 1
    elif action == "left":  x -= 1
    elif action == "right": x += 1

    # If the next state is an obstacle, stay in the current state
    return (x, y) if not self.is_obstacle(x, y) else current_state




In [40]:
world = World(5,5)

world.add_reward(2,3, "+")

def equiprobable_random_policy(x, y):
  return { k:1/len(ACTIONS) for k in ACTIONS }

print(world.grid.T)
display(pd.DataFrame(world.grid.T))

[[' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ']
 [' ' ' ' '+' ' ' ' ']
 [' ' ' ' ' ' ' ' ' ']]


Unnamed: 0,0,1,2,3,4
0,,,,,
1,,,,,
2,,,,,
3,,,+,,
4,,,,,


In [41]:
def iterative_policy_evaluation(world, policy, theta=1e-5, max_iterations=1e3):

  # Our initial estimates for all states in the world is 0:
  V = np.full((world.width, world.height), 0.0)

  while True:
    # delta keeps track of the largest change in one iteration, so we set it to
    # 0 at the start of each iteration:
    delta = 0

    # Loop over all states (x,y)
    for y in range(world.height):
      for x in range(world.width):
        if not world.is_obstacle(x, y):
          # Get action probabilities for the current state:
          actions = policy(x, y)

          # v is the new estimate that will be updated in the loop:
          v = 0

          # loop over all actions that our policy says that we can perform
          # in the current state:
          for action, action_prob in actions.items():
            # For each action, get state transition probabilities and
            # accumulate in v rewards weighted with action and state transition
            # probabilities:
            for (xi, yi), state_prob in world.get_state_transition_probabilities((x, y), action).items():
              v += action_prob * state_prob * (world.get_reward(xi, yi) + gamma * V[xi, yi])

          # update delta (largest change in estimate so far)
          delta = max(delta, abs(v - V[x, y]))
          V[x, y] = v

    # check if current state value estimates are close enought to end:
    if delta <= theta:
      break

    max_iterations -= 1
    if max_iterations == 0:
      break

  # Return the state value estimates
  return V


V = iterative_policy_evaluation(world, equiprobable_random_policy)

display(pd.DataFrame(V.T))

AttributeError: 'World' object has no attribute 'get_state_transition_probabilities'