In [2]:
!pip3 install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


# Importing the necessary libraries

In [3]:
import gymnasium
from gymnasium import spaces, utils
import numpy as np
import os
import math
import matplotlib.pyplot as plt
from operator import add
from math import sqrt

# Creating the Random Maze Environment using the Gymnasium.Env class

In [39]:
class RandomMazeEnv(gymnasium.Env):
    """Stochastic grid-world ("random maze") with 12 discrete states.

    States are the integers 0..11. The transition table is consistent with a
    3 x 4 grid numbered row by row from the top-left:

         0   1   2   3
         4   5   6   7
         8   9  10  11

    * State 3 is terminal and is entered with reward +1.0.
    * State 7 is terminal and is entered with reward -1.0.
    * State 5 is never the target of any transition from another state
      (presumably the wall cell of the maze).
    * Every other transition yields a reward of -0.04.

    An action moves in the intended direction with probability ``p = 0.8`` and
    in each of the two perpendicular directions with probability
    ``(1 - p) / 2``. A move that is blocked leaves the agent where it is, which
    is why some entries merge probabilities onto the current state.

    ``self.P[state][action]`` is a list of ``[probability, next_state, reward,
    done]`` entries. The order of the entries matters: ``step`` samples an
    index into that list, so reordering would change seeded trajectories.

    NOTE: ``step`` returns a 4-tuple and ``reset`` returns the bare state,
    i.e. the older Gym-style signatures rather than Gymnasium's
    ``(obs, reward, terminated, truncated, info)`` / ``(obs, info)``. They are
    kept as-is because callers in this notebook unpack those shapes.
    """

    metadata = {"render_modes" : ["human", "rgb_array"], "render_fps": 4}
    # Action ids, listed in numeric order; these are the keys used in self.P
    LEFT = 0
    UP = 1
    RIGHT = 2
    DOWN = 3

    def __init__(self, render = None, seed = None):
        """Build the transition table and seed the environment RNG.

        Args:
            render: accepted for interface compatibility; not used by this class.
            seed: seed for the environment RNG; ``None`` draws fresh entropy.
        """
        super().__init__()

        p = 0.8  # probability that the intended move is the one executed
        self.grid_size = 3 * 4
        # Number of discrete actions, we have four: left (0), up (1), right (2), down (3)
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Discrete(self.grid_size)
        self.P = {
            0: {
                0: [
                    [p + (1-p)/2, 0, -0.04, False],
                    [(1-p)/2, 4, -0.04, False]
                ],
                3: [
                    [(1-p)/2, 0, -0.04, False],
                    [p, 4, -0.04, False],
                    [(1-p)/2, 1, -0.04, False]
                ],
                2: [
                    [(1-p)/2, 4, -0.04, False],
                    [p, 1, -0.04, False],
                    [(1-p)/2, 0, -0.04, False]
                ],
                1: [
                    [(1-p)/2, 1, -0.04, False],
                    [p + (1-p)/2, 0, -0.04, False],
                ]
            },
            1: {
                0: [
                    [(1-p), 1, -0.04, False],
                    [p, 0, -0.04, False],
                ],
                3: [
                    [(1-p)/2, 0, -0.04, False],
                    [p, 1, -0.04, False],
                    [(1-p)/2, 2, -0.04, False]
                ],
                2: [
                    [(1-p), 1, -0.04, False],
                    [p, 2, -0.04, False],
                ],
                1: [
                    [(1-p)/2, 2, -0.04, False],
                    [p, 1, -0.04, False],
                    [(1-p)/2, 0, -0.04, False]
                ]
            },
            2: {
                0: [
                    [(1-p)/2, 2, -0.04, False],
                    [p, 1, -0.04, False],
                    [(1-p)/2, 6, -0.04, False]
                ],
                3: [
                    [(1-p)/2, 1, -0.04, False],
                    [p, 6, -0.04, False],
                    [(1-p)/2, 3, 1.0, True]
                ],
                2: [
                    [(1-p)/2, 6, -0.04, False],
                    [p, 3, 1.0, True],
                    [(1-p)/2, 2, -0.04, False]
                ],
                1: [
                    [(1-p)/2, 3, 1.0, True],
                    [p, 2, -0.04, False],
                    [(1-p)/2, 1, -0.04, False]
                ]
            },
            # State 3: terminal (+1.0 on entry); absorbing with zero reward
            3: {
                0: [
                    [1.0, 3, 0.0, True]
                ],
                3: [
                    [1.0, 3, 0.0, True]
                ],
                2: [
                    [1.0, 3, 0.0, True]
                ],
                1: [
                    [1.0, 3, 0.0, True]
                ]
            },
            4: {
                0: [
                    [(1-p)/2, 0, -0.04, False],
                    [p, 4, -0.04, False],
                    [(1-p)/2, 8, -0.04, False]
                ],
                3: [
                    [(1-p), 4, -0.04, False],
                    [p, 8, -0.04, False],
                ],
                2: [
                    [(1-p)/2, 8, -0.04, False],
                    [p, 4, -0.04, False],
                    [(1-p)/2, 0, -0.04, False]
                ],
                1: [
                    [(1-p), 4, -0.04, False],
                    [p, 0, -0.04, False],
                ]
            },
            # State 5: not reachable from any other state; absorbing
            5: {
                0: [
                    [1.0, 5, 0.0, True]
                ],
                3: [
                    [1.0, 5, 0.0, True]
                ],
                2: [
                    [1.0, 5, 0.0, True]
                ],
                1: [
                    [1.0, 5, 0.0, True]
                ]
            },
            6: {
                0: [
                    [(1-p)/2, 2, -0.04, False],
                    [p, 6, -0.04, False],
                    [(1-p)/2, 10, -0.04, False]
                ],
                3: [
                    [(1-p)/2, 6, -0.04, False],
                    [p, 10, -0.04, False],
                    [(1-p)/2, 7, -1.0, True]
                ],
                2: [
                    [(1-p)/2, 10, -0.04, False],
                    [p, 7, -1.0, True],
                    [(1-p)/2, 2, -0.04, False]
                ],
                1: [
                    [(1-p)/2, 7, -1.0, True],
                    [p, 2, -0.04, False],
                    [(1-p)/2, 6, -0.04, False]
                ]
            },
            # State 7: terminal (-1.0 on entry); absorbing with zero reward
            7: {
                0: [
                    [1.0, 7, 0.0, True]
                ],
                3: [
                    [1.0, 7, 0.0, True]
                ],
                2: [
                    [1.0, 7, 0.0, True]
                ],
                1: [
                    [1.0, 7, 0.0, True]
                ]
            },
            # State 8: the start state
            8: {
                0: [
                    [(1-p)/2, 4, -0.04, False],
                    [p + (1-p)/2, 8, -0.04, False]
                ],
                3: [
                    [p + (1-p)/2, 8, -0.04, False],
                    [(1-p)/2, 9, -0.04, False]
                ],
                2: [
                    [(1-p)/2, 8, -0.04, False],
                    [p, 9, -0.04, False],
                    [(1-p)/2, 4, -0.04, False]
                ],
                1: [
                    [(1-p)/2, 9, -0.04, False],
                    [p, 4, -0.04, False],
                    [(1-p)/2, 8, -0.04, False]
                ]
            },
            9: {
                0: [
                    [(1-p), 9, -0.04, False],
                    [p, 8, -0.04, False]
                ],
                3: [
                    [(1-p)/2, 8, -0.04, False],
                    [p, 9, -0.04, False],
                    [(1-p)/2, 10, -0.04, False]
                ],
                2: [
                    [(1-p), 9, -0.04, False],
                    [p, 10, -0.04, False]
                ],
                1: [
                    [(1-p)/2, 10, -0.04, False],
                    [p, 9, -0.04, False],
                    [(1-p)/2, 8, -0.04, False]
                ]
            },
            10: {
                0: [
                    [(1-p)/2, 6, -0.04, False],
                    [p, 9, -0.04, False],
                    [(1-p)/2, 10, -0.04, False]
                ],
                3: [
                    [(1-p)/2, 9, -0.04, False],
                    [p, 10, -0.04, False],
                    [(1-p)/2, 11, -0.04, False]
                ],
                2: [
                    [(1-p)/2, 10, -0.04, False],
                    [p, 11, -0.04, False],
                    [(1-p)/2, 6, -0.04, False]
                ],
                1: [
                    [(1-p)/2, 11, -0.04, False],
                    [p, 6, -0.04, False],
                    [(1-p)/2, 9, -0.04, False]
                ]
            },
            11: {
                0: [
                    [(1-p)/2, 7, -1.0, True],
                    [p, 10, -0.04, False],
                    [(1-p)/2, 11, -0.04, False]
                ],
                3: [
                    [(1-p)/2, 10, -0.04, False],
                    [p + (1-p)/2, 11, -0.04, False]
                ],
                2: [
                    [p + (1-p)/2, 11, -0.04, False],
                    [(1-p)/2, 7, -1.0, True]
                ],
                1: [
                    [(1-p)/2, 11, -0.04, False],
                    [p, 7, -1.0, True],
                    [(1-p)/2, 10, -0.04, False]
                ]
            }
        }
        self.seed(seed)
        self.start_state = 8
        self.state = self.start_state
        self.gamma = 0.99 # discount factor

    def seed(self, seed=None):
        """Replace the environment RNG with one created from ``seed``.

        Args:
            seed: value passed to ``gymnasium.utils.seeding.np_random``.

        Returns:
            A one-element list holding the seed reported by the seeding helper.
        """
        self.np_random, seed = gymnasium.utils.seeding.np_random(seed)
        return [seed]

    def step(self, action):
        """Sample one transition from the current state and move there.

        Args:
            action: one of the action ids used as keys in ``self.P``.

        Returns:
            ``(next_state, reward, done, info)`` where ``info`` is an empty dict.

        Raises:
            KeyError: if ``action`` is not a key of ``self.P[self.state]``.
        """
        transitions = self.P[self.state][action]
        # Pick an index rather than an entry so the probabilities can be passed as weights
        i = self.np_random.choice(len(transitions), p=[t[0] for t in transitions])
        prob, next_state, reward, done = transitions[i]
        # moving to the next state
        self.state = next_state
        return next_state, reward, done, {}

    def reset(self, seed = None):
        """Put the agent back on the start state.

        Args:
            seed: when given, the environment RNG is re-created from it. When
                ``None`` the current RNG stream is kept, so a seed passed to
                the constructor stays in effect across unseeded resets.

        Returns:
            The start state.
        """
        if seed is not None:
            self.seed(seed)
        # Reset the state of the environment to an initial state
        self.state = self.start_state
        return self.state

    def close(self):
        """Nothing to release; present to satisfy the environment interface."""
        pass

# Implementing `generateTrajectory()` to simulate the Random Maze Environment, with some default test cases

In [41]:
# Random policy
def random_policy(state):
    """Pick an action uniformly at random, ignoring ``state``.

    Samples from the action space of the module-level ``env``, so ``env`` must
    already exist when this policy is called.
    """
    sampled_action = env.action_space.sample()
    return sampled_action

# Implementing the generateTrajectory function with maxsteps
def generateTrajectory(env, policy, maxSteps, seed = None):
    """Roll out one episode of ``policy`` in ``env``, for at most ``maxSteps`` steps.

    Args:
        env: object with ``reset(seed=...)`` returning the initial state and
            ``step(action)`` returning ``(next_state, reward, done, info)``.
        policy: callable mapping a state to an action.
        maxSteps: maximum number of steps to take.
        seed: forwarded to ``env.reset``.

    Returns:
        A list of ``(state, action, reward, next_state, done)`` tuples if the
        episode reached a terminal transition within ``maxSteps`` steps,
        otherwise an empty list (partial trajectories are discarded).
    """
    state = env.reset(seed = seed)
    trajectory = []
    # Initialised up front so that maxSteps <= 0 yields [] instead of an
    # UnboundLocalError on the check after the loop.
    done = False
    for _ in range(maxSteps):
        # choosing an action from policy
        action = policy(state) # pi(a|s)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward, next_state, done))
        if done:
            break
        state = next_state

    # discard partial trajectories
    return trajectory if done else []

# Simulation

In [49]:
global_seed = 123
numEpisodes = 10
maxSteps = 50

# testing the RandomMazeEnv Environment
env = RandomMazeEnv(seed = global_seed)
# random_policy samples from env.action_space, which keeps its own RNG that the
# environment seed does not touch; seed it as well so the run is reproducible.
env.action_space.seed(global_seed)
trajectory = generateTrajectory(env, random_policy, maxSteps, seed = global_seed)
print("Printing Trjactory")
print(trajectory)
print("Total number of states encountered : {}".format(len(trajectory)))

# One (state, action, reward, next_state, done) tuple per line
for transition in trajectory:
    print(transition)

Printing Trjactory
[(8, 2, -0.04, 9, False), (9, 1, -0.04, 10, False), (10, 1, -0.04, 6, False), (6, 0, -0.04, 6, False), (6, 0, -0.04, 6, False), (6, 1, -0.04, 2, False), (2, 3, 1.0, 3, True)]
Total number of states encountered : 7
(8, 2, -0.04, 9, False)
(9, 1, -0.04, 10, False)
(10, 1, -0.04, 6, False)
(6, 0, -0.04, 6, False)
(6, 0, -0.04, 6, False)
(6, 1, -0.04, 2, False)
(2, 3, 1.0, 3, True)
