In [1]:
from codecs import mbcs_decode
from ctypes.wintypes import WORD
from pickle import TUPLE
from platform import python_branch
import gym
from gym import spaces
import pygame
import numpy as np
#import matplotlib.pyplot as plt




class GridWorldEnv(gym.Env):
    """Square grid world with one agent, one target and one fixed obstacle.

    The agent starts in the cell ``(size - 1, size - 1)``, the obstacle sits
    at ``(2, 2)`` and the target is sampled on every ``reset``.  The agent
    moves one cell per step; a move into the obstacle leaves it in place and
    a move off the grid is clipped to the border.  The episode terminates
    when the agent reaches the target.

    Observations are partial: the target location is only reported while it
    lies within ``_VIEW_RADIUS`` cells (Manhattan distance) of the agent.

    NOTE: ``reset`` and ``step`` return one more element than the standard
    gym API (the step counter); see their docstrings.
    """

    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 3}

    # Largest Manhattan distance at which the target shows up in observations.
    _VIEW_RADIUS = 2

    def __init__(self, render_mode=None, size=7):
        """Create the environment.

        Args:
            render_mode: ``None``, ``"human"`` (PyGame window) or
                ``"rgb_array"`` (frames returned by ``render``).
            size: Side length of the square grid, in cells.
        """
        self.size = size  # The size of the square grid
        self.window_size = 512  # The size of the PyGame window

        # Each location is a pair of integer coordinates in {0, ..., size - 1}.
        # Observations produced by `_get_obs` use a subset of these keys:
        # "agent" always, "target" only while in view, "obstacle" never.
        self.observation_space = spaces.Dict(
            {
                "agent": spaces.Box(0, size - 1, shape=(2,), dtype=int),
                "target": spaces.Box(0, size - 1, shape=(2,), dtype=int),
                "obstacle": spaces.Box(0, size - 1, shape=(2,), dtype=int),
            }
        )

        # Four actions; each one maps to a unit step along one axis.
        self.action_space = spaces.Discrete(4)

        # Maps an action from `self.action_space` to the (dx, dy) offset that
        # is added to the agent location when the action is taken.
        self._action_to_direction = {
            0: np.array([1, 0]),
            1: np.array([0, 1]),
            2: np.array([-1, 0]),
            3: np.array([0, -1]),
        }

        assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = render_mode

        # `window` and `clock` are created lazily by `_render_frame` the
        # first time a frame is drawn in human mode; until then (and after
        # `close`) they are None.
        self.window = None
        self.clock = None

    def _get_obs(self):
        """Return the current observation dictionary.

        ``"agent"`` is always present.  ``"target"`` is present only when the
        target is at most ``_VIEW_RADIUS`` cells away from the agent in
        Manhattan distance; this includes the agent's own cell, so the
        observation of the terminal state still contains the target.
        """
        offset = self._target_location - self._agent_location
        if np.abs(offset).sum() <= self._VIEW_RADIUS:
            return {"target": self._target_location, "agent": self._agent_location}
        return {"agent": self._agent_location}

    def _get_info(self):
        """Return the Manhattan distance between agent and target."""
        return {
            "distance": np.linalg.norm(
                self._agent_location - self._target_location, ord=1
            )
        }

    def _get_steps(self):
        """Return the number of steps taken since the last reset."""
        return {"steps": self.steps}

    def reset(self, seed=2, options=None):
        """Start a new episode.

        Args:
            seed: Forwarded to ``gym.Env.reset`` to seed ``self.np_random``.
                NOTE(review): with the default of 2 every call presumably
                reseeds the generator, so the sampled target would repeat
                between episodes; pass ``seed=None`` to avoid that -- confirm
                against the installed gym version.
            options: Accepted for API compatibility; unused.

        Returns:
            ``(observation, info, steps)`` -- note the extra third element
            compared to the standard ``(observation, info)`` pair.
        """
        # We need the following line to seed self.np_random
        super().reset(seed=seed)

        # low == high - 1, so both coordinates are always `size - 1`: the
        # agent starts in the far corner of the grid.
        self._agent_location = self.np_random.integers(
            self.size - 1, self.size, size=2, dtype=int
        )
        self.steps = 0
        self.reward = 0
        self.terminated = False
        self._obstacle_location = np.array([2, 2])

        # Sample the target until it coincides with neither the agent nor
        # the obstacle.
        while True:
            self._target_location = self.np_random.integers(
                0, self.size, size=2, dtype=int
            )
            on_agent = np.array_equal(self._target_location, self._agent_location)
            on_obstacle = np.array_equal(
                self._target_location, self._obstacle_location
            )
            if not (on_agent or on_obstacle):
                break

        self.observation = self._get_obs()
        self.info = self._get_info()
        self.observation_steps = self._get_steps()

        if self.render_mode == "human":
            self._render_frame()

        return self.observation, self.info, self.observation_steps

    def step(self, action):
        """Advance the environment by one action.

        Args:
            action: Key of ``self._action_to_direction`` (0, 1, 2 or 3).

        Returns:
            ``(observation, reward, terminated, truncated, info, steps)``.
            ``reward`` is the reward accumulated over the whole episode so
            far (not the per-step reward), ``truncated`` is always ``False``
            and ``steps`` is the step counter -- an extra sixth element
            compared to the standard gym 5-tuple.
        """
        self.steps += 1
        direction = self._action_to_direction[action]

        # A move into the obstacle is rejected and the agent stays where it
        # is; any other move is clipped so the agent cannot leave the grid.
        destination = self._agent_location + direction
        if not np.array_equal(destination, self._obstacle_location):
            self._agent_location = np.clip(destination, 0, self.size - 1)

        # An episode is done iff the agent has reached the target
        self.terminated = np.array_equal(self._agent_location, self._target_location)

        # Running total: +1 on reaching the target, -0.01 for every other step.
        self.reward += 1 if self.terminated else -0.01
        self.observation = self._get_obs()
        self.info = self._get_info()

        if self.render_mode == "human":
            self._render_frame()

        return (
            self.observation,
            self.reward,
            self.terminated,
            False,
            self.info,
            self.steps,
        )

    def render(self):
        """Return the current frame as an array in ``"rgb_array"`` mode.

        Returns ``None`` in every other mode.
        """
        if self.render_mode == "rgb_array":
            return self._render_frame()

    def _render_frame(self):
        """Draw the grid; show it (human mode) or return it as an array."""
        if self.window is None and self.render_mode == "human":
            pygame.init()
            pygame.display.init()
            self.window = pygame.display.set_mode((self.window_size, self.window_size))
        if self.clock is None and self.render_mode == "human":
            self.clock = pygame.time.Clock()

        canvas = pygame.Surface((self.window_size, self.window_size))
        canvas.fill((255, 255, 255))
        # The size of a single grid square in pixels
        pix_square_size = self.window_size / self.size
        cell_extent = (pix_square_size, pix_square_size)

        # Target: red square.
        pygame.draw.rect(
            canvas,
            (255, 0, 0),
            pygame.Rect(pix_square_size * self._target_location, cell_extent),
        )
        # Obstacle: black square.
        pygame.draw.rect(
            canvas,
            (0, 0, 0),
            pygame.Rect(pix_square_size * self._obstacle_location, cell_extent),
        )
        # Agent: blue circle centred in its cell.
        pygame.draw.circle(
            canvas,
            (0, 0, 255),
            (self._agent_location + 0.5) * pix_square_size,
            pix_square_size / 3,
        )

        # Finally, add some gridlines
        for x in range(self.size + 1):
            pygame.draw.line(
                canvas,
                0,
                (0, pix_square_size * x),
                (self.window_size, pix_square_size * x),
                width=2,
            )
            pygame.draw.line(
                canvas,
                0,
                (pix_square_size * x, 0),
                (pix_square_size * x, self.window_size),
                width=2,
            )

        if self.render_mode == "human":
            # The following line copies our drawings from `canvas` to the visible window
            self.window.blit(canvas, canvas.get_rect())
            pygame.event.pump()
            pygame.display.update()

            # We need to ensure that human-rendering occurs at the predefined framerate.
            # The following line will automatically add a delay to keep the framerate stable.
            self.clock.tick(self.metadata["render_fps"])
        else:  # rgb_array
            # Swap the first two axes of the pixel array before returning it.
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
            )

    def close(self):
        """Shut down the PyGame window, if one was opened.

        Safe to call more than once: the handles are cleared so a later
        render creates a fresh window instead of using a destroyed one.
        """
        if self.window is not None:
            pygame.display.quit()
            pygame.quit()
            self.window = None
            self.clock = None

    

In [2]:
# Roll out random-action episodes and print the final score and the number
# of steps each episode needed to reach the target.
env = GridWorldEnv(render_mode="rgb_array")
episodes = 20
try:
    for episode in range(1, episodes + 1):
        # reset() stores the observation on the environment itself, so its
        # return value is not needed here.
        env.reset()
        steps = 0
        score = 0
        done = False
        while not done:
            action = env.action_space.sample()
            # step() returns
            # (observation, reward, terminated, truncated, info, steps);
            # the returned reward is the episode's running total.
            _, reward, done, _, _, _ = env.step(action)
            env.render()  # the returned frame is discarded
            steps += 1
            if done:
                score += reward
        print('Episode:{} Score:{}'.format(episode, score), "steps:{} ".format(steps))
finally:
    # Release PyGame resources even if an episode raised.
    env.close()

Episode:1 Score:-0.7900000000000014 steps:180 
Episode:2 Score:-3.479999999999949 steps:449 
Episode:3 Score:0.47999999999999976 steps:53 
Episode:4 Score:-0.8100000000000014 steps:182 
Episode:5 Score:-1.4499999999999917 steps:246 
Episode:6 Score:0.72 steps:29 
Episode:7 Score:-0.9000000000000015 steps:191 
Episode:8 Score:0.4399999999999997 steps:57 
Episode:9 Score:-0.4900000000000011 steps:150 
Episode:10 Score:-1.7799999999999847 steps:279 
Episode:11 Score:-0.8100000000000014 steps:182 
Episode:12 Score:0.5299999999999998 steps:48 
Episode:13 Score:0.6199999999999999 steps:39 
Episode:14 Score:0.89 steps:12 
Episode:15 Score:-1.669999999999987 steps:268 
Episode:16 Score:0.7699999999999999 steps:24 
Episode:17 Score:-0.44000000000000106 steps:145 
Episode:18 Score:-2.1699999999999764 steps:318 
Episode:19 Score:0.01999999999999935 steps:99 
Episode:20 Score:-0.030000000000000693 steps:104 
