In [1]:
import os
import gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import BaseCallback

from gym_env_3steps_in_1img import CustomEnv  # gym_environment3layers for 3-layer input image
from network import CustomCNN

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch.nn as nn
import torch.nn.functional as F
import torch
from stable_baselines3 import PPO


torch.backends.cudnn.benchmark = True

pygame 2.0.1 (SDL 2.0.14, Python 3.8.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


# Collab callback class

In [2]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    :param check_freq: (int)
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contains the file created by the ``Monitor`` wrapper.
    :param verbose: (int)
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model_PPO')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        pass
        # Create folder if needed
#         if self.save_path is not None:
#             os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        if self.n_calls % self.check_freq == 0:

          # Retrieve training reward
          x, y = ts2xy(load_results(self.log_dir), 'timesteps')
          if len(x) > 0:
              # Mean training reward over the last 100 episodes
              mean_reward = np.mean(y[-100:])
              if self.verbose > 0:
                print("Num timesteps: {}".format(self.num_timesteps))
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

              # New best model, you could save the agent here
              if mean_reward > self.best_mean_reward:
                  self.best_mean_reward = mean_reward
                  # Example for saving best model
                  if self.verbose > 0:
                    print("Saving new best model to {}".format(self.save_path))
                  self.model.save(self.save_path)

        return True


# Init logs, model, environment

In [3]:
log_dir = './saved_models_cont_mult/PPO/'
os.makedirs(log_dir, exist_ok=True)
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)


env = CustomEnv(obstacle_turn = False, 
                vizualaze     = False, 
                Total_war     = True,
                head_velocity = 0.005,
                num_obs       = 5, 
                num_enemy     = 1, 
                size_obs = [30, 40],
                steps_limit    = 2000)

env = Monitor(env, log_dir)

policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=518),
    activation_fn=torch.nn.ReLU,
    net_arch = [dict(pi=[1029, 128, 32, 8], vf=[1029, 128, 32, 8])])

model = PPO(policy          = 'MlpPolicy',
            env             = env,
            learning_rate   = 0.0001,
            n_steps         = 2048,
            batch_size      = 24,
            gamma           = 0.99,
            gae_lambda      = 0.95,
            tensorboard_log = "./tensorboard_logs_cont_mult/",
            policy_kwargs   = policy_kwargs,
            verbose         = 0,
            device          = 'cuda')

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2048 and n_envs=1)


# Run learning

In [None]:
model.learn(total_timesteps=1e6,callback=callback)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
  return F.conv2d(input, weight, bias, self.stride,


Num timesteps: 1000
Best mean reward: -inf - Last mean reward per episode: -460.33
Saving new best model to ./saved_models_cont_mult/PPO/best_model_PPO
Num timesteps: 2000
Best mean reward: -460.33 - Last mean reward per episode: -480.48
Num timesteps: 3000
Best mean reward: -460.33 - Last mean reward per episode: -571.63
Num timesteps: 4000
Best mean reward: -460.33 - Last mean reward per episode: -513.32
Num timesteps: 5000
Best mean reward: -460.33 - Last mean reward per episode: -473.72
Num timesteps: 6000
Best mean reward: -460.33 - Last mean reward per episode: -518.00
Num timesteps: 7000
Best mean reward: -460.33 - Last mean reward per episode: -509.50
Num timesteps: 8000
Best mean reward: -460.33 - Last mean reward per episode: -529.14
Num timesteps: 9000
Best mean reward: -460.33 - Last mean reward per episode: -527.90
Num timesteps: 10000
Best mean reward: -460.33 - Last mean reward per episode: -541.77
Num timesteps: 11000
Best mean reward: -460.33 - Last mean reward per epi

# Check tensorboard

In [None]:
!tensorboard --logdir ./tensorboard_logs_cont_mult/

# Load and check the model

In [None]:
log_dir = './saved_models_cont_mult/PPO/'
env = CustomEnv(obstacle_turn = True, 
                vizualaze     = True, 
                Total_war     = True,
                head_velocity = 0.01,
                num_obs       = 5, 
                num_enemy     = 2, 
                size_obs = [30, 40],
                steps_limit    = 2000)

In [None]:
import matplotlib.pyplot as plt
import cv2

In [None]:
state = env.reset()
state = cv2.cvtColor(state['img'], cv2.COLOR_BGR2RGB)
cv2.imwrite('final_img.png', state)
# cv2.imwrite

In [None]:
state_,  reward, done, _ = env.step([0.5, 0.8])
state_ = cv2.cvtColor(state_['img'], cv2.COLOR_BGR2RGB)
cv2.imwrite('final_img.png', state_)

In [None]:
model = PPO.load(path=log_dir + 'best_model_PPOobs_and_2enemy', env=env)

In [None]:
env.render(model)

In [None]:
state = env.reset()
action = model.predict(state)
print(action[0][0])

In [None]:
velosities = []
state = env.reset()
action = model.predict(state)
for i in range(0, 100):
    state, reward, done, numstep = env.step(action[0])
    action = model.predict(state)
    velosities.append(action[0][0])
print(max(velosities), min(velosities))