In [None]:
import os
import gym
import numpy as np

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.callbacks import BaseCallback

from Enviroment import Enviroment
from gym import spaces
import cv2
from tqdm import tqdm

from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch.nn as nn
import torch.nn.functional as F
import torch
from stable_baselines3 import PPO

## Callback class

In [None]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback that saves the model whenever the mean training reward improves
    (in practice, we recommend using ``EvalCallback``).

    Every time an episode finishes, the mean of the last 100 episode rewards
    is compared with the best mean seen so far; on improvement the model is
    written to ``<log_dir>/best_model_PPO``.

    :param check_freq: (int) Stored on the instance; the check itself runs at
        the end of every episode, not every ``check_freq`` steps.
    :param log_dir: (str) Path to the folder where the model will be saved.
    :param verbose: (int) Verbosity level forwarded to ``BaseCallback``.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model_PPO')
        self.best_mean_reward = -np.inf
        self.rew_len = []   # reward of every finished episode, in order
        self.n_games = 0    # number of finished episodes seen so far

    def _on_step(self) -> bool:
        """
        Record episodes that finished on this step and save on a new best mean.

        Finished episodes are read from the ``infos`` of the current step (the
        ``Monitor`` wrapper adds an ``"episode"`` entry when an episode ends).
        This avoids relying on a notebook-global ``model`` and on the length of
        ``ep_info_buffer``, which is a bounded deque and stops growing once full.

        :return: (bool) Always ``True`` so that training continues.
        """
        for info in self.locals.get("infos", ()):
            episode = info.get("episode")
            if episode is None:
                continue  # no episode ended in this environment on this step

            self.n_games += 1
            self.rew_len.append(episode['r'])

            mean_reward = np.mean(self.rew_len[-100:])
            if self.best_mean_reward < mean_reward:
                self.best_mean_reward = mean_reward
                self.model.save(self.save_path)

        return True

## Neural network class

In [None]:
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomCNN(BaseFeaturesExtractor):
    """
    Convolutional features extractor: two conv/pool stages, four residual
    blocks (each followed by max-pooling), a final convolution, and a linear
    projection to ``features_dim``.

    :param observation_space: (gym.spaces.Box) Observation space whose first
        dimension is used as the number of input channels.
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of unit for the last layer.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 518):
        super(CustomCNN, self).__init__(observation_space, features_dim)

        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(

            nn.Conv2d(n_input_channels, 32, 2),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, 2),
            nn.MaxPool2d(2, 2),

            ResBlock(n_filters=64, kernel_size=2),
            nn.MaxPool2d(2, 2),
            ResBlock(n_filters=64, kernel_size=2),
            nn.MaxPool2d(2, 2),
            ResBlock(n_filters=64, kernel_size=2),
            nn.MaxPool2d(2, 2),
            ResBlock(n_filters=64, kernel_size=2),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(64, 128, 2),
            nn.Flatten()
        )

        # Infer the flattened size with one dummy forward pass. The pass runs
        # in eval mode so that the BatchNorm layers inside the residual blocks
        # do not update their running statistics from a random sample.
        was_training = self.cnn.training
        self.cnn.eval()
        with th.no_grad():
            n_flatten = self.cnn(
                th.as_tensor(observation_space.sample()[None]).float()
            ).shape[1]
        self.cnn.train(was_training)

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        '''
        Forward propagation
        :param observations: (th.Tensor) batch of observations fed to the CNN
        :return: features tensor produced by the final linear + ReLU layer
        '''

        return self.linear(self.cnn(observations))

    
class ResBlock(nn.Module):
    def __init__(self, n_filters, kernel_size, bn_eps=0.001, bn_momentum=0.01):
        """
        Custom ResNet-style block: two (conv -> ReLU -> batch-norm) stages
        followed by a skip connection and a final ReLU.

        :param n_filters: (int) number of filters of the convolutional layers
            (input and output channel count are both ``n_filters``)
        :param kernel_size: (int) convolution kernel size
        :param bn_eps: (float) ``eps`` of both batch-norm layers
        :param bn_momentum: (float) ``momentum`` of both batch-norm layers.
            PyTorch defines it as the weight of the NEW batch statistic
            (running = (1 - momentum) * running + momentum * batch), i.e. the
            complement of the Keras convention; 0.01 here matches a Keras
            momentum of 0.99.
        """
        super().__init__()
        self.n_filters = n_filters
        self.kernel_size = kernel_size

        self.b1 = nn.Conv2d(self.n_filters, self.n_filters, self.kernel_size, padding='same')
        self.b2 = nn.BatchNorm2d(self.n_filters, eps=bn_eps, momentum=bn_momentum)
        self.b3 = nn.Conv2d(self.n_filters, self.n_filters, self.kernel_size, padding='same')
        self.b4 = nn.BatchNorm2d(self.n_filters, eps=bn_eps, momentum=bn_momentum)

    def forward(self, x):
        '''
        Forward propagation
        :param x: input tensor with ``n_filters`` channels
        :return: output tensor with the same shape as ``x``
        '''
        residual = x
        y = F.relu(self.b1(x))
        y = self.b2(y)
        y = F.relu(self.b3(y))
        y = self.b4(y)
        # Skip connection: 'same' padding keeps the spatial size, so shapes match.
        y = y + residual
        return F.relu(y)

## Environment gym class

In [None]:
import collections
import cv2
import numpy as np
import gym

In [None]:
class RepeatAction(gym.Wrapper):
    """
    Wrapper that applies every action ``repeat`` times in the wrapped
    environment and returns the summed reward.

    :param env: environment to wrap
    :param repeat: (int) how many times each action is applied
    :param fire_first: (bool) if set, ``reset`` performs action ``1`` (which
        must be named 'FIRE') right after resetting
    """

    def __init__(self, env=None, repeat=4, fire_first=False):
        super(RepeatAction, self).__init__(env)
        self.fire_first = fire_first
        self.repeat = repeat
        self.shape = env.observation_space.low.shape

    def step(self, action):
        """
        Apply ``action`` up to ``repeat`` times, stopping early when an
        episode ends.

        :return: last observation, accumulated reward, last done flag, last info
        """
        total_reward = 0.0
        done = False
        for _ in range(self.repeat):
            obs, step_reward, done, info = self.env.step(action)
            total_reward += step_reward
            if done:
                break
        return obs, total_reward, done, info

    def reset(self):
        """
        Reset the wrapped environment; with ``fire_first`` also perform the
        'FIRE' action once and return the observation that follows it.
        """
        first_obs = self.env.reset()
        if not self.fire_first:
            return first_obs

        assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE'
        first_obs, _, _, _ = self.env.step(1)
        return first_obs


In [None]:
class PreprocessFrame(gym.ObservationWrapper):
    """
    Observation wrapper that converts an RGB frame to a grayscale,
    resized, channel-first float32 image scaled to ``[0, 1]``.

    :param shape: target shape given as ``(height, width, channels)``; it is
        stored channel-first as ``(channels, height, width)``
    :param env: environment to wrap
    """

    def __init__(self, shape, env=None):
        super(PreprocessFrame, self).__init__(env)
        self.shape = (shape[2], shape[0], shape[1])
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0,
                                                shape=self.shape,
                                                dtype=np.float32)

    def observation(self, obs):
        """
        :param obs: RGB image as returned by the wrapped environment
        :return: float32 array of shape ``self.shape`` with values in [0, 1]
        """
        gray_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)

        # cv2.resize expects dsize as (width, height), while self.shape is
        # (channels, height, width); passing shape[1:] directly would swap the
        # axes and scramble non-square frames on reshape.
        _, height, width = self.shape
        resized_screen = cv2.resize(gray_frame, (width, height),
                                    interpolation=cv2.INTER_AREA)
        new_obs = np.asarray(resized_screen, dtype=np.uint8).reshape(self.shape)

        # Cast before scaling so the result matches the float32 dtype declared
        # in observation_space (uint8 / 255.0 would give float64).
        return new_obs.astype(np.float32) / 255.0


In [None]:
class StackFrames(gym.ObservationWrapper):
    """
    Observation wrapper that keeps the last ``repeat`` observations and
    returns them as a single array stacked along the first axis.

    :param env: environment to wrap
    :param repeat: (int) number of observations kept in the stack
    """

    def __init__(self, env, repeat):
        super(StackFrames, self).__init__(env)
        inner_space = env.observation_space
        self.observation_space = gym.spaces.Box(
            inner_space.low.repeat(repeat, axis=0),
            inner_space.high.repeat(repeat, axis=0),
            dtype=np.float32)
        self.stack = collections.deque(maxlen=repeat)

    def _stacked(self):
        """Return the current stack reshaped to the observation-space shape."""
        target_shape = self.observation_space.low.shape
        return np.array(self.stack).reshape(target_shape)

    def reset(self):
        """Reset the environment and fill the whole stack with the first observation."""
        self.stack.clear()
        first_observation = self.env.reset()
        for _ in range(self.stack.maxlen):
            self.stack.append(first_observation)

        return self._stacked()

    def observation(self, observation):
        """Push the newest observation (dropping the oldest) and return the stack."""
        self.stack.append(observation)

        return self._stacked()


In [None]:
class CustomEnv(gym.Env):
    '''
    Gym wrapper around the project ``Enviroment`` class.

    Observations are the images produced by the wrapped environment
    (``state.img``); actions are a 2-vector ``[speed, angle]`` in ``[-1, 1]``.
    '''

    def __init__(self, obstacle_turn: bool, Total_war: bool, num_obs: int, num_enemy: int, inp_dim: int,
                 size_obs, steps_limit, vizualaze=False, head_velocity=0.01,
                 rew_col=-100, rew_win=100, rew_defeat=-100):
        '''
        Initialise the environment.
        :param obstacle_turn: (bool) obstacle generation flag
        :param Total_war: (bool) game mode flag (with or without an opponent)
        :param num_obs: (int) forwarded to ``Enviroment`` (presumably the number of obstacles)
        :param num_enemy: (int) forwarded to ``Enviroment`` (presumably the number of enemies)
        :param inp_dim: (int) side of the square image declared in ``observation_space``
        :param size_obs: forwarded to ``Enviroment`` (presumably the obstacle size range)
        :param steps_limit: (int) maximum number of actions in the environment per game
        :param vizualaze: (bool) forwarded to ``Enviroment`` (presumably a visualisation flag)
        :param head_velocity: forwarded to ``Enviroment``
        :param rew_col: reward that marks a collision
        :param rew_win: reward that marks a win
        :param rew_defeat: reward that marks a defeat
        '''
        self.log_koef = 50

        self.velocity_coef = 35       #  1/2 max speed !!!
        self.ang_Norm_coef = 1
        self.coords_Norm_coef = 500
        self.proportional_coef = 0.01
        self.inp_dim = inp_dim

        self.rew_col = rew_col
        self.rew_win = rew_win
        self.rew_defeat = rew_defeat

        self.enviroment = Enviroment(obstacle_turn, vizualaze, Total_war,
                                     head_velocity, num_obs, num_enemy, size_obs, steps_limit,
                                     rew_col, rew_win, rew_defeat, epsilon=100, sigma=30)

        self.action_space = spaces.Box(low=np.array([-1, -1]), high=np.array([1, 1]), dtype=np.float16) # [speed, angle]
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(self.inp_dim, self.inp_dim, 3), dtype=np.uint8)

        self.enviroment.reset()

    def step(self, action):
        """
        Perform one step in the environment.
        :param action: 2-element ``[speed, angle]`` action; the speed is clipped
            to ``[0.001, 1]`` and scaled by ``velocity_coef``, the angle is
            scaled by ``ang_Norm_coef``
        :return: image observation, shaped reward, done flag, empty info dict
        """
        # Work on a private copy: the caller (e.g. a repeat-action wrapper or
        # the vectorised env) may pass the same array again, and scaling it in
        # place would compound the velocity scaling on every call.
        action = np.array(action, dtype=float)
        action[1] *= self.ang_Norm_coef
        action[0] = self.velocity_coef * np.clip(action[0], a_min=0.001, a_max=1)

        state, reward, done, numstep = self.enviroment.step(action)

        # Robot position.
        x2 = state.posRobot[0]
        y2 = state.posRobot[1]

        # First target: position and a third component used as an angle
        # (presumably the target's heading — confirm against Enviroment).
        x4 = state.target[0, 0]
        y4 = state.target[0, 1]
        f2 = state.target[0, 2]

        Ax4, Ay4 = -np.cos(f2), np.sin(f2)
        Bx24, By24 = x2 - x4, y2 - y4

        # Negative Euclidean distance between robot and target.
        dist = - np.sqrt(np.abs((x2 - x4)**2 + (y2 - y4)**2))
        # Cosine of the angle between (Ax4, Ay4) and the target->robot vector.
        # NOTE(review): undefined (division by zero) if the robot position
        # coincides exactly with the target position.
        phy = (Ax4*Bx24 + Ay4*By24)/(np.sqrt(Ax4**2 + Ay4**2) * np.sqrt(Bx24**2 + By24**2))
        # The shaping term is dropped on the terminal step, so the final reward
        # is just the rounded environment reward.
        reward_l = phy*(dist+500) * self.proportional_coef * (not done) + np.round(reward, 2).sum()

        return state.img, reward_l, done, {}

    def reset(self):
        '''
        Restart the game.
        :return: image observation of the initial state
        '''
        state = self.enviroment.reset()

        return state.img

    def render(self, model, num_gifs=1):
        '''
        Play ``num_gifs`` episodes with ``model`` and write each one to
        ``video{i}.avi`` (25 fps, DIVX codec).

        NOTE(review): ``model.predict`` receives the raw image returned by
        ``reset``/``step``; confirm the model was trained on this observation
        format rather than on wrapped observations.

        :param model: object exposing ``predict(obs) -> (action, state)``
        :param num_gifs: (int) number of episodes / video files to produce
        :return: None
        '''
        for i in range(num_gifs):
            # reset()/step() return the image array itself, not a dict.
            obs = self.reset()
            done = False

            height, width = obs.shape[:2]
            size = (width, height)
            out = cv2.VideoWriter(f"video{i}.avi", cv2.VideoWriter_fourcc(*'DIVX'), 25, size)
            try:
                out.write(cv2.cvtColor(obs, cv2.COLOR_RGB2BGR))
                while not done:
                    action, _ = model.predict(obs)
                    obs, _, done, _ = self.step(action)
                    out.write(cv2.cvtColor(obs, cv2.COLOR_RGB2BGR))
            finally:
                # Always finalise the file, even if the rollout fails midway.
                out.release()

    def get_statistic(self, model, num_games):
        '''
        Play ``num_games`` episodes with ``model`` and print the fraction of
        each outcome, classified by the reward of the final step.

        The checks are made in order (collision, win, defeat), so when
        ``rew_col`` equals ``rew_defeat`` such episodes count as collisions.

        :param model: object exposing ``predict(obs) -> (action, state)``
        :param num_games: (int) number of episodes to play
        :return: None
        '''
        collision = 0
        win = 0
        destroyed = 0
        loss = 0

        pbar = tqdm(range(num_games))
        for i in pbar:
            obs = self.reset()
            done = False
            while not done:
                action, _ = model.predict(obs)
                obs, reward, done, _ = self.step(action)

            if reward == self.rew_col:      # collision
                collision += 1
            elif reward == self.rew_win:    # win
                win += 1
            elif reward == self.rew_defeat: # loss
                destroyed += 1
            else:                           # not_achieved
                loss += 1

        print("Win: ", win/num_games)
        print("destroyed: ", destroyed/num_games)
        print("loss: ", loss/num_games)
        print("collision: ", collision/num_games)

In [None]:
# Training environment. Arguments are grouped as: scenario flags, scene
# layout, observation size / episode length, and terminal rewards.
env = CustomEnv(
    obstacle_turn=True,
    Total_war=True,
    vizualaze=True,
    head_velocity=0.005,
    num_obs=5,
    num_enemy=1,
    size_obs=[50, 60],
    inp_dim=250,
    steps_limit=1000,
    rew_col=-70,
    rew_win=100,
    rew_defeat=-100,
)

In [None]:
# Wrapper chain, innermost first:
#   RepeatAction    - apply every action 4 times and sum the rewards
#   PreprocessFrame - grayscale, resize to 250x250, scale by 1/255
#   StackFrames     - keep the last 4 processed frames as one observation
env = StackFrames(
    PreprocessFrame(shape=(250, 250, 1), env=RepeatAction(env, repeat=4)),
    repeat=4,
)

In [None]:
# e = env.reset()

In [None]:
# env.step([1,1])

In [None]:
# env.step(np.array([4, 1]))

## Initialize the agent

In [None]:
from stable_baselines3 import PPO, A2C, TD3, DDPG, SAC

# Policy configuration: the custom CNN produces a 134-dimensional feature
# vector that feeds separate policy (pi) and value (vf) MLP heads.
policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=134),
    activation_fn=torch.nn.ReLU,
    net_arch = [dict(pi=[134, 32, 8], vf=[134, 32, 8])])

# device='auto' uses CUDA when it is available and falls back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
model = PPO(policy          = 'MlpPolicy',
            env             = env,
            learning_rate   = 0.001,
            n_steps         = 2048,
            batch_size      = 72,
            gamma           = 0.99,
            gae_lambda      = 0.95,
            tensorboard_log = "./tensorboard_logs/",
            policy_kwargs   = policy_kwargs,
            verbose         = 1,
            device          = 'auto')

# model = A2C(policy          = 'MlpPolicy',
#             env             = env,
#             learning_rate   = 0.0001,
#             n_steps         = 24,
#             gamma           = 0.99,
#             gae_lambda      = 0.95,
#             tensorboard_log = "./tensorboard_logs/",
#             policy_kwargs   = policy_kwargs,
#             verbose         = 1,
#             device          = 'cuda')
# model = TD3(policy          = 'MlpPolicy',  # 1 neural network metod
#             env             = env,
#             learning_rate   = 0.0001,
#             buffer_size     = 100,
#             batch_size      = 2,
#             gamma           = 0.99,
#             tensorboard_log = "./tensorboard_logs_cont_mult/",
#             policy_kwargs   = policy_kwargs,
#             verbose         = 0,
#             device          = 'cuda')

# model = DDPG(policy         = 'MlpPolicy',  # 1 neural network metod
#             env             = env,
#             learning_rate   = 0.0001,
#             buffer_size     = 100,
#             batch_size      = 2,
#             gamma           = 0.99,
#             tensorboard_log = "./tensorboard_logs_cont_mult/",
#             policy_kwargs   = policy_kwargs,
#             verbose         = 0,
#             device          = 'cuda')

# model = SAC(policy          = 'MlpPolicy',  # 1 neural network metod
#             env             = env,
#             learning_rate   = 0.0001,
#             buffer_size     = 100,
#             batch_size      = 2,
#             gamma           = 0.99,
#             tensorboard_log = "./tensorboard_logs_cont_mult/",
#             policy_kwargs   = policy_kwargs,
#             verbose         = 0,
#             device          = 'cuda')

## Make callback

In [None]:
log_dir = './saved_models/PPO'  # For A2C agent: './saved_models/A2C'
os.makedirs(log_dir, exist_ok=True)

# The model was built before this Monitor wrapper existed, so hand the
# monitored environment to the model explicitly; otherwise training never
# passes through this wrapper and the monitor file in log_dir stays empty.
env = Monitor(env, log_dir)
model.set_env(env)

callback = SaveOnBestTrainingRewardCallback(check_freq=500,
                                            log_dir=log_dir)

## Learn model

In [None]:
# total_timesteps is an integer step budget; pass an int rather than the float 1e6.
model.learn(total_timesteps=1_000_000, callback=callback)

## Make environment to test trained model and get statistics

In [None]:
# Evaluation environment: no obstacle generation, longer episodes.
# inp_dim is a required argument of CustomEnv; 250 mirrors the training
# environment (NOTE(review): confirm this is the intended frame size).
env_test = CustomEnv(obstacle_turn = False,
                    vizualaze      = False,
                    Total_war      = True,
                    head_velocity  = 0.005,
                    num_obs        = 5,
                    num_enemy      = 1,
                    size_obs       = [30, 40],
                    inp_dim        = 250,
                    rew_col        = -100,
                    rew_win        = 100,
                    rew_defeat     = -100,
                    steps_limit    = 2000)

## Load the best model and get statistics

In [None]:
# Checkpoint written by SaveOnBestTrainingRewardCallback.
# For A2C agent: './saved_models/A2C/callback0/best_model_A2C/' and A2C.load(path, env=env_test)
# NOTE(review): env_test is a bare CustomEnv (uint8 HxWx3 observations), while
# training used the RepeatAction/PreprocessFrame/StackFrames chain (float32,
# 4 stacked frames) — confirm the loaded model accepts this environment.
path = './saved_models/PPO/best_model_PPO'
model = PPO.load(path=path, env=env_test)

In [None]:
# env_test.get_statistic(model, 10000)

In [None]:
# Play one episode with the loaded model and write it to video0.avi.
env_test.render(model, num_gifs=1)

## Check tensorboard

In [None]:
# !tensorboard --logdir ./tensorboard_logs/