In [1]:
import itertools
import numpy as np
import pandas as pd
import random

  ## Grid world
  It has N x M states. The Agent can move from one state to another state depending on the state it is currently in and the action it takes.

  Here, there are a total of 5*5 = 25 states representing the "environment".
  
  |   |  0  |  1  |  2  |  3  |  4  |
  |---|-----|-----|-----|-----|-----|
  | 0 |(0,0)|(0,1)|(0,2)|(0,3)|(0,4)|
  | 1 |(1,0)|(1,1)|(1,2)|(1,3)|(1,4)|
  | 2 |(2,0)|(2,1)|(2,2)|(2,3)|(2,4)|
  | 3 |(3,0)|(3,1)|(3,2)|(3,3)|(3,4)|
  | 4 |(4,0)|(4,1)|(4,2)|(4,3)|(4,4)|
  
____________________________
### Example
**agent is at (2,2)**:


  |   |  0  |  1  |  2  |  3  |  4  |
  |---|-----|-----|-----|-----|-----|
  | 0 |     |     |     |     |     |
  | 1 |     |     |     |     |     |
  | 2 |     |     |[ A ]|     |     |
  | 3 |     |     |     |     |     |
  | 4 |     |     |     |     | END |


Takes an action to move ***right***. Action: (0,1)

next_state = current_state + action = (2,2) + (0,1) = (2,3)

  |   |  0  |  1  |  2  |  3  |  4  |
  |---|-----|-----|-----|-----|-----|
  | 0 |     |     |     |     |     |
  | 1 |     |     |     |     |     |
  | 2 |     |     |     |[ A ]|     |
  | 3 |     |     |     |     |     |
  | 4 |     |     |     |     | END |


Takes an action to move ***down***. Action: (1,0)

next_state = current_state + action = (2,3) + (1,0) = (3,3)

  |   |  0  |  1  |  2  |  3  |  4  |
  |---|-----|-----|-----|-----|-----|
  | 0 |     |     |     |     |     |
  | 1 |     |     |     |     |     |
  | 2 |     |     |     |     |     |
  | 3 |     |     |     |[ A ]|     |
  | 4 |     |     |     |     | END |


Takes an action to move ***right***. Action: (0,1)

next_state = current_state + action = (3,3) + (0,1) = (3,4)

  |   |  0  |  1  |  2  |  3  |  4  |
  |---|-----|-----|-----|-----|-----|
  | 0 |     |     |     |     |     |
  | 1 |     |     |     |     |     |
  | 2 |     |     |     |     |     |
  | 3 |     |     |     |     |[ A ]|
  | 4 |     |     |     |     | END |


Takes an action to move ***down***. Action: (1,0)

next_state = current_state + action = (3,4) + (1,0) = (4,4)

  |   |  0  |  1  |  2  |  3  |  4  |
  |---|-----|-----|-----|-----|-----|
  | 0 |     |     |     |     |     |
  | 1 |     |     |     |     |     |
  | 2 |     |     |     |     |     |
  | 3 |     |     |     |     |     |
  | 4 |     |     |     |     |[ A ]|

In [34]:
class GridWorldEnv:
  """Grid world is one of the most classical reinforcement learning problems.
  Here the environment is made of a rectangular 2-d box with N rows and M columns,
  consisting of N x M states in total. A state is a (row, column) tuple.

  The agent at any given point can take one of the 4 possible actions - to move:
  - Left (0,-1)
  - Right (0,1)
  - Up (-1,0)
  - Down (1,0)

  An action that would take the agent outside the grid leaves it where it is.

  Different types of rewards and constraints can be formulated in this kind of
  setup

  Reward
  ------
  - Moving to the terminal state receives a reward of +10
  - Every other step receives -1 reward.

  Terminal states:
  (4,4) by default; pass `terminal_states` to use other cells.

  |   | 0   | 1   | 2   | 3   | 4   |
  |---|-----|-----|-----|-----|-----|
  | 0 |start|     |     |     |     |
  | 1 |     |     |     |     |     |
  | 2 |     |     |     |     |     |
  | 3 |     |     |     |     |     |
  | 4 |     |     |     |     | END |

  Parameters
  ----------
  N : int
    Number of rows.
  M : int
    Number of columns.
  terminal_states : iterable of (row, col), optional
    Absorbing states that end the episode. Defaults to [(4,4)] whatever the
    grid size is; a terminal state lying outside the grid can never be
    reached, so the episode would never terminate.
  """
  def __init__(self, N = 10, M = 10, terminal_states = None):
    self.N = N
    self.M = M
    self.total_states = N*M

    self.observation_space = list(itertools.product(range(N), range(M)))
    self.action_space = [(0,1), (1,0), (0, -1), (-1, 0)]

    # once the agent reaches terminal state, it stays there. Assume start state cannot be terminal state.
    # also known as absorbing states
    if terminal_states is None:
      terminal_states = [(4,4)]
    self.terminal_states = [tuple(s) for s in terminal_states]

    # reset() initialises state, terminated and total_reward
    self.reset()

  def reset(self):
    """Start a fresh episode: agent back at (0,0), flags and reward cleared."""
    self.state = (0,0)
    # Without clearing these, a reset environment would still look finished
    # and would keep accumulating on top of the previous episode's reward.
    self.terminated = False
    self.total_reward = 0

  def _get_transition_probability(self, present_state, action, next_state):
    """Return P(next_state | present_state, action), which is either 0 or 1.

    The dynamics are deterministic: the agent lands on present_state + action,
    or stays on present_state when that cell is outside the grid. Terminal
    states have no outgoing transitions (probability 0 everywhere).
    """
    if present_state in self.terminal_states:
      return 0

    # If the expected state matches the action taken, return a probability 1
    expected_state = tuple(np.array(present_state) + np.array(action))
    if expected_state == next_state:
      return 1

    # Make sure the agent does not go out of the grid
    if (expected_state not in self.observation_space
        and present_state == next_state):
      return 1

    return 0

  def _get_reward(self,present_state, action, next_state):
    """Return +10 for arriving at a terminal state and -1 for any other move."""
    if next_state in self.terminal_states:
      return 10
    return -1

  def step(self, action):
    """Apply `action` and return (state, reward, terminated).

    `terminated` becomes True on the very step that enters a terminal state.
    Calling step() while already on a terminal state is a no-op that returns
    a NaN reward and leaves total_reward untouched.
    """
    # check if terminal state is reached
    if self.state in self.terminal_states:
      self.terminated = True
      reward = np.nan
      return self.state, reward, self.terminated

    # Pick the most probable successor; max() keeps the first maximum, the
    # same tie-breaking as a manual "strictly greater" scan.
    next_state = max(
      self.observation_space,
      key=lambda candidate: self._get_transition_probability(self.state, action, candidate),
    )

    reward = self._get_reward(self.state, action, next_state)

    self.state = next_state
    self.total_reward = self.total_reward + reward
    # Report termination immediately instead of requiring one more step() call.
    self.terminated = self.state in self.terminal_states

    return self.state, reward, self.terminated

In [77]:
class Agent:
  """Agent acting in an environment with a uniformly random right/down policy."""
  def __init__(self,env):
    """Keep a reference to `env` and reset it so the episode starts fresh."""
    self.env = env
    env.reset()

  def policy(self):
    """Return the next action: Right (0,1) or Down (1,0), chosen at random.

    The choice does not depend on the environment's current state.
    """
    return random.choice([(0,1), (1,0)])

In [78]:
# Build a 5 x 5 grid world and attach an agent to it (Agent.__init__ resets the env).
env = GridWorldEnv(5,5)
agent = Agent(env)

# Sample one action from the agent's policy; the value is shown as the cell output.
agent.policy()

(0, 1)

In [64]:
# Sample another action; the policy picks at random, so the output can differ between runs.
agent.policy()

(0, 1)

In [45]:
# Inspect the agent's current (row, column) position in the grid.
env.state

(0, 0)

In [47]:
# Take a single step: sample an action, apply it, and display
# (action, (state, reward, terminated)).
a = agent.policy()
a, env.step(a)

((0, 1), ((0, 1), -1, False))

In [87]:
# Random walk experiment: run `episodes` independent episodes with the agent's
# random policy and report the average total reward per episode.
# Try with different policies to compare.
# NOTE(review): no random seed is set, so the printed average varies between runs.
episodes = 100

reward_sum = 0
for i in range(episodes):
  # Fresh environment and agent for every episode.
  env = GridWorldEnv(5,5)
  agent = Agent(env)
  # Keep stepping until the environment reports termination.
  while not env.terminated:
    current_state = env.state
    action = agent.policy()
    next_state, reward, terminated = env.step(action)
  # env.total_reward is the reward accumulated over this episode.
  reward_sum += env.total_reward

  # Debugging aid (disabled): shows only the last transition of each episode.
  # print("current_state:",current_state, " action: ", action, "next_state: ", next_state, "reward: ", reward)

print("Average total reward: ", reward_sum/episodes)

Average total reward:  0.52


In [69]:
# Total reward accumulated by whichever `env` object is currently bound in the kernel.
env.total_reward

-100

### 📔 Tasks
- Obtain the average total reward received by taking random actions.
- Add dummy states as [(3,0), (2,3)]. Dummy states are states that the agent cannot move into. Hint: modify the transition probability so that the agent cannot move to these states, similar to how we prevent the agent from moving out of the grid.
- Does this change the average total reward received by the agent?

