In [2]:
! pip install numpy pandas



In [3]:
import numpy as np
import random
import sys          # We use sys to get the max value of a float
import pandas as pd # We only use pandas for displaying tables nicely
pd.options.display.float_format = '{:,.3f}'.format

# World class and globals

This is how we generate a world and the global variables we use

In [154]:
# Globals:
ACTIONS = ("up", "down", "left", "right")

# Rewards, terminals and obstacles are characters:
REWARDS = {" ": -1, ".": 2, "+": 30, "-": -10}
OBSTACLES = ("#")

# Discount factor
gamma = 1

# The probability of a random move:
rand_move_probability = 0

class World:
  def __init__(self, width, height):
    self.width = width
    self.height = height
    # Create an empty world where the agent can move to all cells
    self.grid = np.full((width, height), '.', dtype='U1')

  def add_obstacle(self, start_x, start_y, end_x=None, end_y=None):
    """
    Create an obstacle in either a single cell or rectangle.
    """
    if end_x == None: end_x = start_x
    if end_y == None: end_y = start_y

    self.grid[start_x:end_x + 1, start_y:end_y + 1] = OBSTACLES[0]

  def add_reward(self, x, y, reward):
    assert reward in REWARDS, f"{reward} not in {REWARDS}"
    self.grid[x, y] = reward

  def is_obstacle(self, x, y):
    if x < 0 or x >= self.width or y < 0 or y >= self.height:
      return True
    else:
      return self.grid[x ,y] in OBSTACLES


  def get_reward(self, x, y):
    """
    Return the reward associated with a given location
    """
    return REWARDS[self.grid[x, y]]

  def get_next_state(self, current_state, action):
    """
    Get the next state given a current state and an action. The outcome can be
    stochastic  where rand_move_probability determines the probability of
    ignoring the action and performing a random move.
    """
    assert action in ACTIONS, f"Unknown acion {action} must be one of {ACTIONS}"

    x, y = current_state

    # Check of a random action should be performed:
    if np.random.rand() < rand_move_probability:
      action = np.random.choice(ACTIONS)

    if action == "up":      y -= 1
    elif action == "down":  y += 1
    elif action == "left":  x -= 1
    elif action == "right": x += 1

    # If the next state is an obstacle, stay in the current state
    return (x, y) if not self.is_obstacle(x, y) else current_state

In [192]:

def equiprobable_random_policy(x, y):
  return { k:1/len(ACTIONS) for k in ACTIONS }

def epsilon_greedy_policy(q, x, y, epsilon = 0.5):
    if np.random.rand() < epsilon:  # choose random action with probability epsilon
        return random.randrange(len(ACTIONS))
    else:
        return np.argmax(q[x,y,:])  # return the action with the highest q value

# Our step size / learing rate 
alpha = 0.05 

# Discount factor
gamma = 0.9

epsilon = 0.1

# Episodes to run 
EPISODES = 300


def SARSA_policy(x,y):
  actions = {a : epsilon/len(ACTIONS) for a in ACTIONS}                       # her sætter den alle til 0.075 for alle 4 actions
  actions[ACTIONS[np.argmax(Q[x,y,:])]] = 1 - epsilon + epsilon/ len(ACTIONS) # Her bliver den action med størst værdi og sættes til en større sandsynlighed end 0.075 --> den bliver sat til 0.775
  return actions


def Expected_SARSA_run_episode(world, policy, start_state, time_steps=75):
    # Initialise S:
    current_state = start_state

    Q = np.random.rand(world.width, world.height, len(ACTIONS))

    path = []

    cummulative_reward = 0

    steps = 0

    # Chose action from S using policy
    a_possibles = policy(*current_state)
    current_action = random.choices(population=list(a_possibles.keys()), weights=a_possibles.values(),k=1)


    # Loop for each step of episode
    while time_steps >= 1:
        # Take Action A observe R, S'
        next_state = world.get_next_state(current_state, *current_action)

        # Chose action from S using policy
        a_possibles = policy(*next_state)
        next_action = random.choices(population=list(a_possibles.keys()), weights=a_possibles.values(),k=1)

        reward = world.get_reward(*next_state)
        # Q(S,A) <- Q(S,A) + alpha(R + gamme * sum(pi(a|S')*Q(S'|a))) -Q(S,A)))
        sum = 0
        for a in ACTIONS:
            # Policy(actions for next state)*Q(Action probability for next state)
            sum += a_possibles[a]*Q[next_state][ACTIONS.index(a)]

        Q[current_state][ACTIONS.index(*current_action)] += alpha*(reward + gamma * sum - Q[current_state][ACTIONS.index(*current_action)])

        world.add_reward(current_state[0], current_state[1], " ")

        cummulative_reward += reward

        # S <- S'
        current_state = next_state
        current_action = next_action
        time_steps -= 1
        steps += 1

    return Q

In [200]:
world = World(5,5)

world.add_reward(2,3, "+")
world.add_reward(1,2, "+")
world.add_reward(3,2, "+")
world.add_reward(4,3, "+")
display(pd.DataFrame(world.grid.T))

Q = Expected_SARSA_run_episode(world, SARSA_policy, (0,0))


display(pd.DataFrame(Q[:,:,0].T))
best_actions = np.array(ACTIONS)[np.argmax(Q, axis=2)]
display(pd.DataFrame(best_actions.T))
display(pd.DataFrame(world.grid.T))
print(reward)


Unnamed: 0,0,1,2,3,4
0,.,.,.,.,.
1,.,.,.,.,.
2,.,+,.,+,.
3,.,.,+,.,+
4,.,.,.,.,.


Unnamed: 0,0,1,2,3,4
0,0.169,0.333,0.675,0.94,0.188
1,0.02,-0.194,-0.345,0.612,0.749
2,0.173,0.028,0.591,0.24,0.75
3,0.578,0.392,0.267,0.681,0.951
4,0.478,0.224,0.907,0.979,0.643


Unnamed: 0,0,1,2,3,4
0,down,left,left,down,left
1,left,right,down,down,up
2,down,left,left,right,down
3,down,left,left,down,up
4,down,down,up,up,left


Unnamed: 0,0,1,2,3,4
0,,,,.,.
1,,,,,.
2,,,,,.
3,.,.,+,.,+
4,.,.,.,.,.


NameError: name 'reward' is not defined