In [1]:
import os
import gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import BaseCallback

from gym_environment import CustomEnv
from network import CustomCNN

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch.nn as nn
import torch.nn.functional as F
import torch
from stable_baselines3 import A2C

# Let cuDNN benchmark the available convolution algorithms and pick the fastest one.
torch.backends.cudnn.benchmark = True

pygame 2.0.1 (SDL 2.0.14, Python 3.8.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


# Colab callback class

In [2]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback that saves the model whenever the mean training reward improves
    (in practice, we recommend using ``EvalCallback``).

    Every ``check_freq`` calls, the results logged in ``log_dir`` are loaded, the
    mean reward over the last 100 episodes is computed, and the model is saved
    when that mean is strictly greater than the best mean seen so far.

    :param check_freq: (int) Number of callback calls between two checks.
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: (int) Progress messages are printed when greater than 0.
    """

    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        # Path handed to ``self.model.save``: a checkpoint inside ``log_dir``.
        self.save_path = os.path.join(log_dir, 'best_model_A2C')
        # Start at -inf so the first measured mean reward always counts as an improvement.
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        """Make sure the folder that will receive the checkpoint exists."""
        # ``save_path`` names the checkpoint itself, so only its parent folder is created.
        parent_dir = os.path.dirname(self.save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    def _on_step(self) -> bool:
        """
        Check the training reward every ``check_freq`` calls and save on improvement.

        :return: (bool) Always ``True``.
        """
        # Only do the (file-reading) check every ``check_freq`` calls.
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) == 0:
            # Nothing has been logged yet.
            return True

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(y[-100:])
        if self.verbose > 0:
            print("Num timesteps: {}".format(self.num_timesteps))
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

        # New best model: remember the score and overwrite the saved checkpoint.
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if self.verbose > 0:
                print("Saving new best model to {}".format(self.save_path))
            self.model.save(self.save_path)

        return True


# Init logs, model, environment

In [3]:
# Folder shared by the Monitor log and the best-model checkpoint written by the callback.
log_dir = './saved_models_cont_mult/A2C/'
os.makedirs(log_dir, exist_ok=True)
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

# Training environment, wrapped in Monitor so results are logged under log_dir
# (the callback reads them back from the same folder).
env = Monitor(
    CustomEnv(obstacle_turn=False, vizualaze=False, Total_war=True),
    log_dir,
)

# Policy configuration: custom CNN feature extractor followed by separate
# policy (pi) and value (vf) heads with identical layer sizes.
policy_kwargs = {
    'features_extractor_class': CustomCNN,
    'features_extractor_kwargs': {'features_dim': 518},
    'activation_fn': torch.nn.ReLU,
    'net_arch': [{'pi': [1029, 128, 32, 8], 'vf': [1029, 128, 32, 8]}],
}

model = A2C(
    policy='MlpPolicy',
    env=env,
    learning_rate=0.0001,
    n_steps=2,
    gamma=0.99,
    gae_lambda=0.95,
    tensorboard_log="./tensorboard_logs_cont_mult/",
    policy_kwargs=policy_kwargs,
    verbose=0,
    device='cuda',
    use_sde=True,
)



# Run learning

In [4]:
# total_timesteps is a step count, so pass an int rather than the float 1e6.
model.learn(total_timesteps=1_000_000, callback=callback)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
  return F.conv2d(input, weight, bias, self.stride,


Num timesteps: 1000
Best mean reward: -inf - Last mean reward per episode: -74.44
Saving new best model to ./saved_models_cont_mult/A2C/best_model_A2C
Num timesteps: 2000
Best mean reward: -74.44 - Last mean reward per episode: -85.95
Num timesteps: 3000
Best mean reward: -74.44 - Last mean reward per episode: -55.86
Saving new best model to ./saved_models_cont_mult/A2C/best_model_A2C
Num timesteps: 4000
Best mean reward: -55.86 - Last mean reward per episode: -118.29
Num timesteps: 5000
Best mean reward: -55.86 - Last mean reward per episode: -118.73
Num timesteps: 6000
Best mean reward: -55.86 - Last mean reward per episode: -73.97
Num timesteps: 7000
Best mean reward: -55.86 - Last mean reward per episode: -52.08
Saving new best model to ./saved_models_cont_mult/A2C/best_model_A2C
Num timesteps: 8000
Best mean reward: -52.08 - Last mean reward per episode: -36.14
Saving new best model to ./saved_models_cont_mult/A2C/best_model_A2C
Num timesteps: 9000
Best mean reward: -36.14 - Last 

<stable_baselines3.a2c.a2c.A2C at 0x7f2758bf2e80>

# Record the video

In [5]:
# env.render(model,1)

# Check tensorboard

In [None]:
!tensorboard --logdir ./tensorboard_logs_cont_mult/

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.1.1 at http://localhost:6009/ (Press CTRL+C to quit)


# Load and check the model

In [None]:
# Folder holding the saved checkpoints.
log_dir = './saved_models_cont_mult/A2C/'

# Fresh environment for checking the model; unlike the training cell it is
# created with Total_war=False and is not wrapped in Monitor.
env = CustomEnv(
    obstacle_turn=False,
    Total_war=False,
)

In [None]:
# NOTE(review): the training callback saves to 'best_model_A2C', while this loads
# 'best_model_with_sde_2steps' — presumably a manually renamed checkpoint; confirm it exists.
checkpoint_path = os.path.join(log_dir, 'best_model_with_sde_2steps')
model = A2C.load(path=checkpoint_path, env=env)

In [None]:
# Sanity check: predict once from a freshly reset environment.
state = env.reset()
action = model.predict(state)
# The prediction is indexed as a tuple: element 0 holds the action values,
# and only their first component is shown here.
first_component = action[0][0]
print(first_component)

In [None]:
# Roll the loaded policy for 100 steps and report the range of the first
# action component (collected in `velosities`).
velosities = []
state = env.reset()
for _step in range(100):
    # predict() returns a tuple; element 0 is the action passed to env.step.
    action, _states = model.predict(state)
    # Record the action that is actually applied on this step.
    velosities.append(action[0])
    state, reward, done, numstep = env.step(action)
    # Start a new episode instead of stepping an environment that has finished.
    if done:
        state = env.reset()
print(max(velosities), min(velosities))