In [1]:
from Enviroment import Enviroment
import pygame
import time
import matplotlib.pyplot as plt
import gym
from gym import spaces
import cv2 
import os
import numpy as np
import torch
from tqdm import tqdm
import warnings
# warnings.filterwarnings("ignore", category=UserWarning)


pygame 2.0.1 (SDL 2.0.14, Python 3.8.3)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
class CustomEnv(gym.Env):
    '''
    Gym wrapper around the project's ``Enviroment`` simulator.

    Actions are 2-vectors in [-1, 1]: ``action[0]`` is mapped to a velocity in
    ``[0, 2 * velocity_coef]`` and ``action[1]`` to an angle in
    ``[-ang_Norm_coef, ang_Norm_coef]`` before being handed to the simulator.
    Observations are dicts with the keys ``'img'``, ``'posRobot'`` and ``'target'``.
    '''
    metadata = {'render.modes': ['human']}

    # Lower bound used to keep divisions and logarithms in the reward finite.
    _EPS = 1e-9

    def __init__(self, obstacle_turn: bool, Total_war: bool, num_obs: int, num_enemy: int, 
                 size_obs, steps_limit, vizualaze=False, head_velocity=0.01):
        '''
        Build the wrapped simulator and declare the action/observation spaces.

        :param obstacle_turn: (bool) obstacle generation flag
        :param Total_war: (bool) game mode flag (with or without an opponent)
        :param num_obs: (int) forwarded to ``Enviroment`` (presumably the number of obstacles - TODO confirm)
        :param num_enemy: (int) number of targets; sets the length of the ``'target'`` observation (3 values each)
        :param size_obs: forwarded to ``Enviroment`` unchanged
        :param steps_limit: (int) maximum number of actions per game
        :param vizualaze: (bool) forwarded to ``Enviroment`` (presumably toggles visualisation - TODO confirm)
        :param head_velocity: forwarded to ``Enviroment`` unchanged
        '''
        self.log_koef = 50

        self.velocity_coef = 35       #  1/2 max speed !!!
        self.ang_Norm_coef = np.pi
        self.coords_Norm_coef = 500

        self.enviroment = Enviroment(obstacle_turn, vizualaze, Total_war,
                                     head_velocity, num_obs, num_enemy, size_obs, steps_limit)

        self.enviroment.reset()

        self.action_space = spaces.Box(low=np.array([-1, -1]), high=np.array([1, 1]), dtype=np.float16)
        # NOTE(review): the bounds below are in raw units, while step()/reset()
        # return 'posRobot' and 'target' already divided by the normalisation
        # coefficients - confirm which convention is intended.
        self.observation_space = gym.spaces.Dict({
                    'img': spaces.Box(low=0, high=255, shape=(500, 500, 3), dtype=np.uint8),
                    'posRobot': spaces.Box(low=np.array([0, 0,-3.14]), high=np.array([500, 500, 3.14])),
                    'target': spaces.Box(low  = np.array([[0, 0,-3.14] for _ in range(num_enemy)]).reshape(-1), 
                                         high = np.array([[500, 500, 3.14] for _ in range(num_enemy)]).reshape(-1)
                                        )
                                                })
        if num_enemy > 1:
            self.normTarget = self._NormSomeCoords
        else:
            self.normTarget = self._NormOneCoords

        self.poseRobot = self._NormOneCoords

        # Frames of the three most recent steps (oldest first) and their blend.
        self.img1 = None
        self.img2 = None
        self.img3 = None
        self.Img = None

    def make_layers(self):
        """
        Blend the three most recent frames into ``self.Img``.

        Older frames get smaller weights, so the blend shows the three
        consecutive states with different intensity. Nothing is returned.
        """
        new_img = cv2.addWeighted(self.img2, 0.4, self.img1, 0.2, 0)
        self.Img = cv2.addWeighted(self.img3, 0.7, new_img, 0.5, 0)

    def step(self, action):
        """
        Advance the simulator by one step.

        :param action: 2-element sequence in [-1, 1]: (velocity, angle); it is not modified
        :return: dict_state, reward, done, {}: observation, shaped reward, terminal flag, empty info dict
        """
        # Work on a copy: the rescaling must not leak into the caller's array/list.
        action = np.array(action, dtype=np.float32)
        action[0] = action[0] * self.velocity_coef + self.velocity_coef
        action[1] = action[1] * self.ang_Norm_coef

        state, reward, done, _ = self.enviroment.step(action)

        self.img1 = self.img2
        self.img2 = self.img3
        self.img3 = state.img

        self.make_layers()

        # One row (x, y, angle) per target, also when there is a single target.
        target = np.atleast_2d(state.target)
        robot = state.posRobot

        # Unit heading vector of every target and the vector target -> robot.
        Ax = np.cos(target[:, 2])
        Ay = -np.sin(target[:, 2])
        Bx = robot[0] - target[:, 0]
        By = robot[1] - target[:, 1]

        # The clamped distance is reused below so that no division by zero occurs.
        dist = np.clip(np.sqrt(Bx**2 + By**2), self._EPS, None)

        # Rounding may push the cosine slightly outside [-1, 1]; arccos would give NaN.
        cos_phy = (Ax*Bx + Ay*By) / (np.sqrt(Ax**2 + Ay**2) * dist)
        phy = np.arccos(np.clip(cos_phy, -1.0, 1.0))

        # Shaping is added on non-terminal steps only. Multiplying by
        # int(not done) would turn an inf/NaN shaping term into NaN and hide the
        # terminal reward that get_statistic() inspects.
        if not done:
            shaping = np.log2(np.clip(phy / dist * self.log_koef, self._EPS, None))
            reward += np.sum(shaping)

        # NOTE(review): the blended frame self.Img is computed above but the raw
        # frame is what gets returned - confirm that this is intended.
        dict_state = {'img':     state.img,  
                      'posRobot':self.poseRobot(state.posRobot),  
                      'target':  self.normTarget(state.target).reshape(-1)}

        return dict_state, reward, done, {}

    def _normalize(self, coords):
        '''
        Return a float32 copy of ``coords`` with the last axis (x, y, angle) scaled.

        Works for a single triple as well as for an array of triples.
        '''
        coords = np.array(coords, dtype=np.float32)
        coords[..., 2] = coords[..., 2] / self.ang_Norm_coef       # angle
        coords[..., :2] = coords[..., :2] / self.coords_Norm_coef  # coordinates
        return coords

    def _NormSomeCoords(self, coords):
        '''
        Normalise the coordinates of several objects.
        :return: coords: normalised float32 copy
        '''
        return self._normalize(coords)

    def _NormOneCoords(self, coords):
        '''
        Normalise the coordinates of one object.
        :return: coords: normalised float32 copy
        '''
        return self._normalize(coords)

    def reset(self):
        '''
        Restart the game.
        :return: dict_state: initial observation
        '''
        state = self.enviroment.reset()

        # Fill the whole frame history with the first frame of the new game.
        self.img1 = state.img
        self.img2 = state.img
        self.img3 = state.img

        dict_state = {'img':     state.img,  
                      'posRobot':self.poseRobot(state.posRobot),  
                      'target':  self.normTarget(state.target).reshape(-1)}

        return dict_state

    def render(self, model, num_gifs=1):
        '''
        Play ``num_gifs`` games with ``model`` and record each one to ``video{i}.avi``.

        :param model: object whose ``predict(obs)`` returns ``(action, state)``
        :param num_gifs: (int) number of games (and video files) to record
        :return: None
        '''
        for i in range(num_gifs):
            obs = self.reset()
            img = obs['img']
            done = False

            height, width = img.shape[:2]
            size = (width, height)
            out = cv2.VideoWriter(f"video{i}.avi",cv2.VideoWriter_fourcc(*'DIVX'), 25, size)
            try:
                out.write(cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
                while not done:
                    action, _ = model.predict(obs)
                    print(action)
                    obs, _, done, _ = self.step(action)
                    out.write(cv2.cvtColor(obs['img'], cv2.COLOR_RGB2BGR))
            finally:
                # Always finalise the file, even if the game loop raises.
                out.release()

    def get_statistic(self, model, num_games):
        '''
        Play ``num_games`` games with ``model`` and print the share of each outcome.

        The outcome is read from the reward of the last step of every game.

        :param model: object whose ``predict(obs)`` returns ``(action, state)``
        :param num_games: (int) number of games to play, must be positive
        :return: None
        '''
        if num_games <= 0:
            raise ValueError("num_games must be a positive integer")

        collision = 0
        win = 0
        destroyed = 0
        loss = 0

        for _ in tqdm(range(num_games)):
            obs = self.reset()
            done = False
            while not done:
                action, _ = model.predict(obs)
                obs, reward, done, _ = self.step(action)

            if reward == -30:       # collision
                collision += 1
            elif reward == 100:     # win
                win += 1
            elif reward == -100:    # destroyed
                destroyed += 1
            else:                   # any other final reward is counted as a loss
                loss += 1

        print("Win: ",win/num_games)
        print("destroyed: ", destroyed/num_games)
        print("loss: ",loss/num_games)
        print("collision: ",collision/num_games)
        

In [3]:
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomCNN(BaseFeaturesExtractor):
    """
    Features extractor for dict observations with the keys "img", "posRobot" and "target".

    Every known key gets its own sub-network; the outputs are concatenated.

    :param observation_space: (gym.spaces.Dict)
    :param features_dim: (int) Number of features extracted, forwarded to
        ``BaseFeaturesExtractor``. It should equal the width of the concatenated
        output (presumably 512 + 3 + 3 for a 3x500x500 image - TODO confirm).
    """

    def __init__(self, observation_space: gym.spaces.Dict, features_dim: int = 518):
        super(CustomCNN, self).__init__(observation_space, features_dim)

        branches = {}

        for name, space in observation_space.spaces.items():
            if name == "img":
                # First axis of the image space is used as the channel count.
                in_channels = space.shape[0]
                branches[name] = nn.Sequential(
                    nn.Conv2d(in_channels, 32, 2),
                    nn.MaxPool2d(2, 2),
                    nn.Conv2d(32, 64, 2),
                    nn.MaxPool2d(2, 2),

                    ResBlock(n_filters=64, kernel_size=2),
                    nn.MaxPool2d(4, 4),
                    ResBlock(n_filters=64, kernel_size=2),
                    nn.MaxPool2d(2, 2),
                    ResBlock(n_filters=64, kernel_size=2),
                    nn.MaxPool2d(2, 2),
                    ResBlock(n_filters=64, kernel_size=2),
                    nn.MaxPool2d(2, 2),

                    nn.Conv2d(64, 128, 2),
                    nn.Flatten(),
                )
            elif name in ("posRobot", "target"):
                # Both coordinate vectors go through the same small MLP layout
                # (separate weights), each producing 3 features.
                in_features = space.shape[0]
                branches[name] = nn.Sequential(
                    nn.Linear(in_features, 9),
                    nn.ReLU(),
                    nn.Linear(9, 9),
                    nn.ReLU(),
                    nn.Linear(9, 3),
                )

        self.extractors = nn.ModuleDict(branches)

    def forward(self, observations: th.Tensor) -> th.Tensor:
        '''
        Forward propagation
        :param observations: mapping indexed by the extractor keys (image; coordinates
            and orientation angles of the agents), despite the tensor annotation
        :return: features tensor, the per-key outputs concatenated along dim 1
        '''
        encoded = [net(observations[name]) for name, net in self.extractors.items()]
        return th.cat(encoded, dim=1)

    
class ResBlock(nn.Module):
    def __init__(self, n_filters, kernel_size):
        """
        Custom ResNet-style block: two conv + batch-norm stages with a skip connection.

        The convolutions use 'same' padding and keep the channel count, so the
        output has the same shape as the input.

        :param n_filters: (int) number of filters of the convolutional layers
        :param kernel_size: (int) size of the convolution kernel
        """
        super().__init__()
        self.n_filters = n_filters
        self.kernel_size = kernel_size

        # PyTorch's batch-norm momentum is the weight of the NEW batch statistic
        # (running = (1 - momentum) * running + momentum * batch), the opposite
        # of the Keras convention. Keras' default 0.99 therefore corresponds to
        # 0.01 here; 0.99 would make the running statistics follow only the last
        # mini-batch.
        bn_momentum = 0.01

        self.b1 = nn.Conv2d(self.n_filters, self.n_filters, self.kernel_size, padding='same')
        self.b2 = nn.BatchNorm2d(self.n_filters, eps=0.001, momentum=bn_momentum)
        self.b3 = nn.Conv2d(self.n_filters, self.n_filters, self.kernel_size, padding='same')
        self.b4 = nn.BatchNorm2d(self.n_filters, eps=0.001, momentum=bn_momentum)

    def forward(self, x):
        '''
        Forward propagation
        :param x: input tensor with ``n_filters`` channels
        :return: ReLU of (block output + input), same shape as ``x``
        '''
        y = self.b2(F.relu(self.b1(x)))
        y = self.b4(F.relu(self.b3(y)))
        # Out-of-place add keeps the batch-norm output untouched for autograd.
        return F.relu(y + x)



In [4]:
from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import BaseCallback

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback that saves the model whenever the mean training reward improves
    (checked every ``check_freq`` calls; in practice ``EvalCallback`` is recommended).

    :param check_freq: (int) number of callback calls between two checks
    :param log_dir: (str) Folder read with ``load_results`` (it must contain the
      file created by the ``Monitor`` wrapper); the model is saved inside it
      under the name ``best_model_``.
    :param agent_name: (str) accepted but currently not used
    :param verbose: (int) values above 0 enable progress messages
    """
    def __init__(self, check_freq: int, log_dir: str, agent_name: str, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model_')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # No setup is performed here.
        return None

    def _on_step(self) -> bool:
        # Only every ``check_freq``-th call does any work.
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve the training reward logged so far.
        timesteps, episode_rewards = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(timesteps) == 0:
            return True

        # Mean training reward over the last 100 episodes.
        mean_reward = np.mean(episode_rewards[-100:])
        chatty = self.verbose > 0
        if chatty:
            print("Num timesteps: {}".format(self.num_timesteps))
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

        # New best model: remember the score and store the agent.
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if chatty:
                print("Saving new best model to {}".format(self.save_path))
            self.model.save(self.save_path)

        # Always let training continue.
        return True

In [5]:
# Keyword arguments forwarded to the policy constructor.
policy_kwargs = {
    "features_extractor_class": CustomCNN,
    "features_extractor_kwargs": {"features_dim": 518},
    "activation_fn": torch.nn.ReLU,
    # Methods with two neural networks: separate policy (pi) and value (vf) layers.
    "net_arch": [{"pi": [1029, 128, 32, 8], "vf": [1029, 128, 32, 8]}],
    # Methods with one neural network would use a flat list instead, e.g. [64, 64].
}

In [6]:
# Training environment; keywords follow the order of CustomEnv.__init__.
env = CustomEnv(
    obstacle_turn=True,    # obstacle generation flag
    Total_war=True,        # game mode flag (with or without an opponent)
    num_obs=4,
    num_enemy=2,
    size_obs=[30, 40],
    steps_limit=2000,      # maximum number of actions per game
    vizualaze=False,
    head_velocity=0.005,
)



In [7]:
from stable_baselines3 import PPO, A2C, TD3, DDPG, SAC

# Folder for the Monitor log and the best-model checkpoint.
log_dir = './saved_models_cont/PPO/'
os.makedirs(log_dir, exist_ok=True)

callback = SaveOnBestTrainingRewardCallback(check_freq  = 5000, 
                                            log_dir     = log_dir,
                                            agent_name  = 'PPO')

# Wrap only once, so that re-running this cell does not stack Monitor wrappers.
if not isinstance(env, Monitor):
    env = Monitor(env, log_dir)

# The observation space is a Dict, which stable-baselines3 handles through
# 'MultiInputPolicy'. The custom features extractor given in policy_kwargs
# replaces the default one of that policy.
# NOTE(review): batch_size=24 does not divide n_steps=2048, so the last
# mini-batch of every epoch is smaller than the others - consider 32 or 64.
model = PPO(policy          = 'MultiInputPolicy',  # method with two neural networks (actor and critic)
            env             = env,
            learning_rate   = 0.0001,
            n_steps         = 2048, 
            batch_size      = 24,
            tensorboard_log = "./tensorboard_logs_cont_mult/",
            policy_kwargs   = policy_kwargs,
            verbose         = 0,
            device          = 'auto')  # CUDA when available, CPU otherwise

# model = A2C(policy          = 'MlpPolicy',  # 2 neural networks metod
#             env             = env,
#             learning_rate   = 0.0001,
#             n_steps         = 10,
#             gamma           = 0.99,
#             gae_lambda      = 0.95,
#             tensorboard_log = "./tensorboard_logs_disc_mult/",
#             policy_kwargs   = policy_kwargs,
#             verbose         = 0,
#             device          = 'cuda')

# model = TD3(policy          = 'MlpPolicy',  # 1 neural network metod
#             env             = env,
#             learning_rate   = 0.0001,
#             buffer_size     = 100,
#             batch_size      = 2,
#             gamma           = 0.99,
#             tensorboard_log = "./tensorboard_logs_cont_mult/",
#             policy_kwargs   = policy_kwargs,
#             verbose         = 0,
#             device          = 'cuda')

# model = DDPG(policy         = 'MlpPolicy',  # 1 neural network metod
#             env             = env,
#             learning_rate   = 0.0001,
#             buffer_size     = 100,
#             batch_size      = 2,
#             gamma           = 0.99,
#             tensorboard_log = "./tensorboard_logs_cont_mult/",
#             policy_kwargs   = policy_kwargs,
#             verbose         = 0,
#             device          = 'cuda')

# model = SAC(policy          = 'MlpPolicy',  # 1 neural network metod
#             env             = env,
#             learning_rate   = 0.0001,
#             buffer_size     = 100,
#             batch_size      = 2,
#             gamma           = 0.99,
#             tensorboard_log = "./tensorboard_logs_cont_mult/",
#             policy_kwargs   = policy_kwargs,
#             verbose         = 0,
#             device          = 'cuda')

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2048 and n_envs=1)


In [None]:
# total_timesteps is a step count, so pass an integer rather than the float 1e6.
model.learn(total_timesteps=1_000_000, callback=callback)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
  return F.conv2d(input, weight, bias, self.stride,


Num timesteps: 5000
Best mean reward: -inf - Last mean reward per episode: -422.76
Saving new best model to ./saved_models_cont/PPO/best_model_
Num timesteps: 10000
Best mean reward: -422.76 - Last mean reward per episode: -405.51
Saving new best model to ./saved_models_cont/PPO/best_model_
Num timesteps: 15000
Best mean reward: -405.51 - Last mean reward per episode: -437.30
Num timesteps: 20000
Best mean reward: -405.51 - Last mean reward per episode: -466.39
Num timesteps: 25000
Best mean reward: -405.51 - Last mean reward per episode: -376.67
Saving new best model to ./saved_models_cont/PPO/best_model_
Num timesteps: 30000
Best mean reward: -376.67 - Last mean reward per episode: -297.15
Saving new best model to ./saved_models_cont/PPO/best_model_
Num timesteps: 35000
Best mean reward: -297.15 - Last mean reward per episode: -354.21


## Video recording

In [None]:
# env.render(model,1)

## Check gameplay statistics

In [None]:
# Play 10 evaluation games and print the share of each outcome.
env.get_statistic(model, num_games=10)

## Tensorboard check

In [None]:
# !tensorboard --logdir ./tensorboard_logs_cont_mult/

## Model load

In [None]:
# model = PPO.load("./saved_models_cont/PPO/best_model_", env=env)

## Manual play mode for checking the environment

In [None]:
# state = env.reset()
# done = False

In [None]:
# last_action = [0.4, -np.pi/2]
# while not done:
#     state, reward, done, numstep = env.step(last_action)
#     print(reward)#, done, state['target'])
#     action = input('theta')
#     if action:
#         last_action[1] = float(action)
#         action = input('velocity')
#         if action:
#             last_action[0] = float(action)