<a href="https://colab.research.google.com/github/ShreyJais/RL/blob/main/2348558_RL_Lab6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook implements a grid-world reinforcement learning environment designed to support model-based algorithms for decision-making.

In [1]:
import numpy as np
import gym
from gym import spaces

In [2]:
class ModelBasedGridWorld(gym.Env):
    """Deterministic grid world that exposes its dynamics as lookup tables.

    The agent starts at (0, 0) and picks one of four moves per step
    (0=Up, 1=Right, 2=Down, 3=Left). A move that would leave the grid is
    clamped to the border; a move into an obstacle leaves the agent where it
    is. Rewards are -1 per ordinary step, -5 for bumping into an obstacle,
    +10 for entering a terminal state and 0 for acting while already in one.

    The transition and reward tables are built once, at construction time,
    from the same rule that `step` applies. They are not refreshed if
    `obstacles` or `terminal_states` are mutated afterwards.
    """

    # Row/column offset for each action id: 0=Up, 1=Right, 2=Down, 3=Left
    _MOVES = {0: (-1, 0), 1: (0, 1), 2: (1, 0), 3: (0, -1)}

    def __init__(self, grid_size=5, terminal_states=None, obstacles=None, random_seed=None):
        """
        grid_size: Size of the grid (grid_size x grid_size).
        terminal_states: List of terminal state positions (row, col).
            Defaults to the bottom-right corner when omitted or empty.
        obstacles: List of obstacle positions (row, col).
        random_seed: Random seed for reproducibility (0 is a valid seed).
        """
        super().__init__()
        self.grid_size = grid_size
        # Positions are stored as tuples so that membership tests against the
        # tuple-valued agent state also work when callers pass lists.
        if terminal_states:
            self.terminal_states = [tuple(pos) for pos in terminal_states]
        else:
            self.terminal_states = [(grid_size - 1, grid_size - 1)]
        self.obstacles = [tuple(pos) for pos in obstacles] if obstacles else []
        self.random_seed = random_seed
        self._setup_environment()

    def _setup_environment(self):
        """Create the spaces, place the agent, seed the RNGs and build the models."""
        self.action_space = spaces.Discrete(4)  # Actions: 0=Up, 1=Right, 2=Down, 3=Left
        self.observation_space = spaces.MultiDiscrete([self.grid_size, self.grid_size])
        self.state = (0, 0)  # Start at top-left corner
        if self.random_seed is not None:
            np.random.seed(self.random_seed)
            # action_space.sample() draws from the space's own generator, so
            # it has to be seeded separately for reproducible rollouts.
            self.action_space.seed(self.random_seed)
        self.reward_model = {}  # Reward function R(s, a): (state, action) -> reward
        self.transition_model = {}  # Dynamics P(s'|s, a): (state, action) -> {next_state: prob}
        self._build_models()

    def _transition(self, state, action):
        """Return (next_state, reward, done) for `action` taken in `state`.

        Pure function of its arguments and the grid layout; it does not touch
        `self.state`, so it serves both `step` and the model builder.
        """
        if state in self.terminal_states:
            return state, 0, True

        d_row, d_col = self._MOVES[action]
        last = self.grid_size - 1
        # Clamp so that walking into a wall leaves the agent on the border
        candidate = (
            max(0, min(last, state[0] + d_row)),
            max(0, min(last, state[1] + d_col)),
        )

        if candidate in self.obstacles:
            return state, -5, False  # Blocked: stay put and pay the penalty
        if candidate in self.terminal_states:
            return candidate, 10, True
        return candidate, -1, False

    def _build_models(self):
        """Fill the transition and reward tables for every cell and action."""
        for row in range(self.grid_size):
            for col in range(self.grid_size):
                state = (row, col)
                for action in self._MOVES:
                    next_state, reward, _ = self._transition(state, action)
                    # Dynamics are deterministic: all mass on one successor
                    self.transition_model[(state, action)] = {next_state: 1.0}
                    self.reward_model[(state, action)] = reward

    def step(self, action):
        """Take an action and return the next state, reward, done, and info.

        Raises:
            ValueError: if `action` is outside the action space.
        """
        if action < 0 or action >= self.action_space.n:
            raise ValueError("Invalid action.")

        next_state, reward, done = self._transition(self.state, action)
        self.state = next_state
        return self.state, reward, done, {}

    def reset(self):
        """Reset the environment to the initial state and return it."""
        self.state = (0, 0)
        return self.state

    def render(self, mode='human'):
        """Print the grid: '.' empty, 'T' terminal, 'O' obstacle, 'A' agent."""
        grid = np.full((self.grid_size, self.grid_size), '.', dtype='<U1')
        for r, c in self.terminal_states:
            grid[r, c] = 'T'  # Mark terminal states
        for r, c in self.obstacles:
            grid[r, c] = 'O'  # Mark obstacles
        agent_row, agent_col = self.state
        grid[agent_row, agent_col] = 'A'  # Agent is drawn last so it is always visible
        print("\n".join(" ".join(cells) for cells in grid))
        print()

    def get_transition_model(self):
        """Return the transition table: (state, action) -> {next_state: probability}."""
        return self.transition_model

    def get_reward_model(self):
        """Return the reward table: (state, action) -> reward."""
        return self.reward_model

  and should_run_async(code)


In [3]:
# Build a 5x5 world with the goal in the bottom-right corner and two blocked cells
env = ModelBasedGridWorld(
    grid_size=5,
    terminal_states=[(4, 4)],
    obstacles=[(2, 2), (3, 1)],
)

# Put the agent on the start cell and show the initial board
state = env.reset()
env.render()

# Random rollout: sample actions until the environment reports termination,
# recording every state returned by step()
path = [state]
while True:
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    path.append(next_state)
    env.render()
    print(f"Action: {action}, Reward: {reward}")
    if done:
        break

# Draw a summary board of the whole episode
grid = np.full((env.grid_size, env.grid_size), '.', dtype='<U1')
for r, c in env.terminal_states:
    grid[r, c] = 'T'
for r, c in env.obstacles:
    grid[r, c] = 'O'

# The first recorded state is the start and is always labelled
row, col = path[0]
grid[row, col] = 'S'

# Later states leave a trail, but never overwrite an existing marker
for state in path[1:]:
    row, col = state
    if grid[row, col] == '.':
        grid[row, col] = '*'

# The final state is labelled last so it wins over any earlier marker
row, col = path[-1]
grid[row, col] = 'E'

print(*(" ".join(cells) for cells in grid), sep="\n")
print()

A . . . .
. . . . .
. . O . .
. O . . .
. . . . T

A . . . .
. . . . .
. . O . .
. O . . .
. . . . T

Action: 3, Reward: -1
. A . . .
. . . . .
. . O . .
. O . . .
. . . . T

Action: 1, Reward: -1
. . . . .
. A . . .
. . O . .
. O . . .
. . . . T

Action: 2, Reward: -1
. . . . .
A . . . .
. . O . .
. O . . .
. . . . T

Action: 3, Reward: -1
. . . . .
. . . . .
A . O . .
. O . . .
. . . . T

Action: 2, Reward: -1
. . . . .
A . . . .
. . O . .
. O . . .
. . . . T

Action: 0, Reward: -1
. . . . .
. A . . .
. . O . .
. O . . .
. . . . T

Action: 1, Reward: -1
. . . . .
A . . . .
. . O . .
. O . . .
. . . . T

Action: 3, Reward: -1
. . . . .
A . . . .
. . O . .
. O . . .
. . . . T

Action: 3, Reward: -1
. . . . .
. A . . .
. . O . .
. O . . .
. . . . T

Action: 1, Reward: -1
. . . . .
. . A . .
. . O . .
. O . . .
. . . . T

Action: 1, Reward: -1
. . . . .
. . . A .
. . O . .
. O . . .
. . . . T

Action: 1, Reward: -1
. . . . .
. . A . .
. . O . .
. O . . .
. . . . T

Action: 3, Reward: -1
