In [30]:
import gym
from gym import Env
import numpy as np
from gym import spaces
import random

class BoxSystem(Env):
    """Grid-world environment following the classic gym ``Env`` interface.

    The agent occupies integer coordinates on a grid spanning ``0..x``
    horizontally and ``0..y`` vertically (both inclusive) and moves one cell
    per step. The target sits in the far corner ``(x, y)``; an episode ends
    when the agent reaches it.

    Observations are ``float32`` arrays ``[agent_x, agent_y]`` to match
    ``observation_space``. The reward at every step is the negative Euclidean
    distance between agent and target, so it is ``0`` on the target and
    negative everywhere else.

    ``reset`` returns only the observation and ``step`` returns the 4-tuple
    ``(obs, reward, done, info)``.
    """
    metadata = {'render.modes' : ['human']}

    # Discrete action codes accepted by ``step``.
    LEFT = 0
    RIGHT = 1
    UP = 2
    DOWN = 3

    def __init__(self, x=5, y=5):
        """Create a grid whose largest valid coordinates are ``(x, y)``.

        Args:
            x: Maximum x coordinate (inclusive).
            y: Maximum y coordinate (inclusive).
        """
        self.size = np.array([x, y])  # per-axis upper bound of the grid
        self.window = 512  # size of the pygame window (unused while render is a stub)
        self.action_space = spaces.Discrete(4)  # left, right, up, down
        self.target_pos = np.array([x, y])  # the target lives in the far corner
        # Cast the bound explicitly so it agrees with the declared dtype.
        self.observation_space = spaces.Box(
            low=0, high=self.size.astype(np.float32), shape=(2,), dtype=np.float32
        )

    def _observation(self):
        """Return a fresh float32 copy of the agent position."""
        # ``astype`` copies, so callers never hold a reference to the internal
        # array that ``step`` mutates in place.
        return self.agent_pos.astype(np.float32)

    def reset(self, poserx=None, posery=None):
        """Place the agent on the grid and return the initial observation.

        Args:
            poserx: Starting x coordinate. When ``None`` a random value in
                ``[0, x]`` is drawn on every call.
            posery: Starting y coordinate. When ``None`` a random value in
                ``[0, y]`` is drawn on every call.

        Returns:
            ``float32`` array ``[agent_x, agent_y]``.
        """
        # Defaults are resolved here rather than in the signature: default
        # expressions run once when the function is defined, which would pin
        # every episode to the same starting cell.
        if poserx is None:
            poserx = random.randint(0, int(self.size[0]))
        if posery is None:
            posery = random.randint(0, int(self.size[1]))
        self.agent_pos = np.array([poserx, posery])
        return self._observation()

    def step(self, action):
        """Move the agent one cell and report the outcome.

        Args:
            action: One of ``LEFT``, ``RIGHT``, ``UP`` or ``DOWN``.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``observation``
            is the new ``float32`` position, ``reward`` is the negative
            Euclidean distance to the target, ``done`` is ``True`` once the
            agent is on the target, and ``info`` is an empty dict.

        Raises:
            ValueError: If ``action`` is not one of the four action codes.
        """
        # Plain ``==`` comparisons (rather than a dict lookup) so that
        # array-like scalar actions are accepted as well as plain ints.
        if action == self.UP:
            self.agent_pos[1] += 1
        elif action == self.DOWN:
            self.agent_pos[1] -= 1
        elif action == self.LEFT:
            self.agent_pos[0] -= 1
        elif action == self.RIGHT:
            self.agent_pos[0] += 1
        else:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))

        # Keep the agent on the grid. Each axis is clipped against its own
        # bound so that non-square grids work and the target stays reachable.
        self.agent_pos = np.clip(self.agent_pos, 0, self.size)

        done = bool(np.array_equal(self.agent_pos, self.target_pos))

        # Negative Euclidean distance: 0 on the target, negative elsewhere.
        reward = -float(np.linalg.norm(self.target_pos - self.agent_pos))

        info = {}  # no auxiliary diagnostics are reported

        return self._observation(), reward, done, info

    def render(self, mode = 'human'):
        """Rendering is not implemented; this is a no-op."""
        pass

    def close(self):
        """Nothing to release; this is a no-op."""
        pass


In [31]:
env = BoxSystem()

# Run a few episodes driven purely by random actions and print the
# accumulated reward of each one.
episodes = 5
for episode in range(1, episodes + 1):
    state, done, score = env.reset(), False, 0

    while not done:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-48.94426434535725
Episode:2 Score:-73.53299129018681
Episode:3 Score:-36.50293455432995
Episode:4 Score:-144.35192192055678
Episode:5 Score:-963.0034526608955


In [32]:
import os
import gym
from stable_baselines3 import A2C

# Directory handed to the model as its TensorBoard log location.
log_path = os.path.join('Training', 'Logs')

env = BoxSystem()

# Build an A2C agent with the MLP policy and train it on the grid environment.
model = A2C(policy='MlpPolicy', env=env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=100_000)

# Drive the environment with the trained policy for a fixed number of steps,
# starting a new episode whenever the current one finishes.
obs = env.reset()
for i in range(1_000):
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    obs = env.reset() if done else obs

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 61.8     |
|    ep_rew_mean        | -136     |
| time/                 |          |
|    fps                | 71       |
|    iterations         | 100      |
|    time_elapsed       | 6        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.32    |
|    explained_variance | -0.0463  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 12.6     |
|    value_loss         | 128      |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 27.7     |
|    ep_rew_mean        | -56.9    |
| time/                 |          |
|    fps                | 132      |
|    iterations         

In [33]:
# Run a few rendered episodes and print the accumulated reward of each one.
# NOTE(review): actions are sampled at random here rather than taken from
# `model`; confirm whether this cell was meant to evaluate the trained agent.
episodes = 5
for episode in range(1, episodes + 1):
    state, done, score = env.reset(), False, 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-395.72970356941784
Episode:2 Score:-1321.8534498479223
Episode:3 Score:-66.65688442926506
Episode:4 Score:-250.7696011326257
Episode:5 Score:-65.2646455469835


In [None]:
import os
# Relative path under which the trained model is saved.
# NOTE(review): the name and file stem say "PPO", but the model built in the
# training cell is an A2C instance; consider renaming for clarity.
PPO_path = os.path.join('Training', 'Saved Models', 'PPO_model')

In [None]:
# Save the trained model to PPO_path. Requires the training cell (which
# defines `model`) and the cell defining `PPO_path` to have been run first.
model.save(PPO_path)