In [None]:
from Enviroment import Enviroment
import pygame
import time
import random
import numpy as np
import os 

import matplotlib.pyplot as plt

import gym
from gym import spaces
import cv2

In [None]:
class CustomEnv(gym.Env):
    """Gym-style wrapper around the project ``Enviroment`` simulator.

    Observations are dicts with three entries:

    * ``'img'``      -- ``state.img``; declared as a (500, 500, 3) uint8 image,
    * ``'posRobot'`` -- ``state.posRobot``; declared as 3 values,
    * ``'target'``   -- ``state.target`` flattened to 1-D (3 values per enemy).

    The action space is ``Discrete(8)``; actions are forwarded unchanged to
    ``Enviroment.step``.

    On every step the reward returned by the simulator is increased, for each
    row of ``state.target``, by ``log2(phi / dist * log_koef)`` where ``dist``
    is the planar distance between robot and target and ``phi`` is the angle
    between the vector ``(cos(t), -sin(t))`` (``t`` is the third target
    component -- presumably its heading, confirm against ``Enviroment``) and
    the vector pointing from the target to the robot.

    All constructor arguments are passed positionally to ``Enviroment``;
    ``num_enemy`` additionally sizes the ``'target'`` observation space.
    """
    metadata = {'render.modes': ['human']}

    # Lower bound applied to the distance and the angle inside the shaping term
    # so that log2 / the division never yield NaN or +/-inf rewards.
    _EPS = 1e-6

    def __init__(self,obstacle_turn = False,vizualaze = True,Total_war = True,
                 head_velocity = 0.01, num_obs = 0, num_enemy = 1, size_obs = [0, 0], steps_limit = 2000):
        super(CustomEnv, self).__init__()

        # Scale factor inside the logarithm of the shaping term.
        self.log_koef = 50
        self.enviroment = Enviroment(obstacle_turn, vizualaze, Total_war, head_velocity,
                                     num_obs, num_enemy, size_obs, steps_limit)

        # Kept from the original implementation: the simulator is reset once at
        # construction time (the returned state is not needed here).
        self.enviroment.reset()

        self.action_space = spaces.Discrete(8)
        self.observation_space = gym.spaces.Dict({
            'img': spaces.Box(low=0, high=255, shape=(500, 500, 3), dtype=np.uint8),
            'posRobot': spaces.Box(low=np.array([0, 0, -3.14]),
                                   high=np.array([500, 500, 3.14])),
            # One (x, y, angle) triple per enemy, flattened to a 1-D vector.
            'target': spaces.Box(low=np.tile(np.array([0, 0, -3.14]), num_enemy),
                                 high=np.tile(np.array([500, 500, 3.14]), num_enemy)),
        })

    @staticmethod
    def _obs_from_state(state):
        """Convert a simulator state into the dict observation exposed to agents."""
        return {'img':      state.img,
                'posRobot': state.posRobot,
                'target':   state.target.reshape(-1)}

    def _shaping_bonus(self, pos_robot, target):
        """Return ``log2(phi / dist * log_koef)`` for one target row.

        The cosine is clipped to [-1, 1] before ``arccos`` (rounding can push
        it slightly outside the domain) and both ``dist`` and ``phi`` are
        bounded below by ``_EPS``, so the bonus is always finite.
        """
        # Vector from the target to the robot.
        to_robot_x = pos_robot[0] - target[0]
        to_robot_y = pos_robot[1] - target[1]
        dist = np.maximum(np.hypot(to_robot_x, to_robot_y), self._EPS)

        # (cos t, -sin t) has unit length, so only ``dist`` normalises the dot product.
        dir_x = np.cos(target[2])
        dir_y = -np.sin(target[2])
        cos_phi = np.clip((dir_x * to_robot_x + dir_y * to_robot_y) / dist, -1.0, 1.0)
        phi = np.maximum(np.arccos(cos_phi), self._EPS)

        return np.log2(phi / dist * self.log_koef)

    def step(self, action):
        """Advance the simulator by one action.

        :param action: value forwarded to ``Enviroment.step``.
        :return: ``(observation dict, shaped reward, done, {})``.
        """
        state, reward, done, _numstep = self.enviroment.step(action)

        for target in state.target:
            reward = reward + self._shaping_bonus(state.posRobot, target)

        return self._obs_from_state(state), reward, done, {}

    def reset(self):
        """Reset the simulator and return the initial observation dict."""
        state = self.enviroment.reset()
        return self._obs_from_state(state)  # reward, done, info can't be included

    def render(self, model, num_gifs=1):
        """Roll out ``model`` and record each episode to ``video{i}.avi``.

        NOTE: the signature intentionally stays as in the original code and
        therefore differs from the usual ``render(mode=...)`` of gym.

        :param model: object whose ``predict(obs)`` returns ``(action, _)``.
        :param num_gifs: number of episodes (and video files) to record.
        """
        for i in range(num_gifs):
            obs = self.reset()
            frame = obs['img']

            height, width, _ = frame.shape
            out = cv2.VideoWriter(f"video{i}.avi", cv2.VideoWriter_fourcc(*'DIVX'), 25, (width, height))
            try:
                # Frames are converted RGB -> BGR before being handed to OpenCV.
                out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                done = False
                while not done:
                    action, _ = model.predict(obs)
                    obs, _, done, _ = self.step(int(action))
                    out.write(cv2.cvtColor(obs['img'], cv2.COLOR_RGB2BGR))
            finally:
                # Release every writer (not only the last one), even if the rollout fails.
                out.release()

In [None]:
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import torch
import gym
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class CustomCNN(BaseFeaturesExtractor):
    """
    Features extractor for the dict observation (``img``, ``posRobot``, ``target``).

    The image goes through a small convolutional stack built from ``ResBlock``s,
    each vector entry through a 3-layer MLP with 3 outputs; the results are
    concatenated along the feature axis.

    :param observation_space: (gym.spaces.Dict) the image sub-space is expected
        channel-first here (its ``shape[0]`` is used as the channel count).
    :param features_dim: (int) Number of features extracted. It is only passed
        to the base class and is not checked against the real output width.
        The default 518 equals 512 (CNN output for a 3x500x500 image) + 3 + 3.
    """

    def __init__(self, observation_space: gym.spaces.Dict, features_dim: int = 518):
        super(CustomCNN, self).__init__(observation_space, features_dim)

        extractors = {}

        for key, subspace in observation_space.spaces.items():
            if key == "img":
                n_input_channels = subspace.shape[0]

                extractors[key] = nn.Sequential(
                    nn.Conv2d(n_input_channels, 32, 2),
                    nn.MaxPool2d(2, 2),
                    nn.Conv2d(32, 64, 2),
                    nn.MaxPool2d(2, 2),

                    ResBlock(n_filters=64, kernel_size=2),
                    nn.MaxPool2d(4, 4),
                    ResBlock(n_filters=64, kernel_size=2),
                    nn.MaxPool2d(2, 2),
                    ResBlock(n_filters=64, kernel_size=2),
                    nn.MaxPool2d(2, 2),
                    ResBlock(n_filters=64, kernel_size=2),
                    nn.MaxPool2d(2, 2),

                    nn.Conv2d(64, 128, 2),
                    nn.Flatten(),
                )

            elif key in ("posRobot", "target"):
                # Input width follows the sub-space instead of a hard-coded 3,
                # so a flattened multi-enemy ``target`` vector is accepted too.
                extractors[key] = self._vector_mlp(subspace.shape[0])

        self.extractors = nn.ModuleDict(extractors)

    @staticmethod
    def _vector_mlp(in_features: int) -> nn.Sequential:
        """Build the ``in_features -> 9 -> 9 -> 3`` MLP used for vector observations."""
        return nn.Sequential(nn.Linear(in_features, 9),
                             nn.ReLU(),
                             nn.Linear(9, 9),
                             nn.ReLU(),
                             nn.Linear(9, 3))

    def forward(self, observations: dict) -> th.Tensor:
        """Encode every observation entry and concatenate the encodings.

        :param observations: mapping from observation key to a batched tensor.
        :return: tensor of shape (B, sum of per-key feature widths).
        """
        # self.extractors contain nn.Modules that do all the processing.
        encoded_tensor_list = [extractor(observations[key])
                               for key, extractor in self.extractors.items()]
        # Return a (B, self._features_dim) PyTorch tensor, where B is batch dimension.
        return th.cat(encoded_tensor_list, dim=1)


    
class ResBlock(nn.Module):
    """Residual block: two (conv -> ReLU -> batch-norm) stages plus a skip connection.

    Both convolutions use ``padding='same'`` and keep the channel count, so the
    output has exactly the shape of the input.

    :param n_filters: number of input AND output channels of both convolutions.
    :param kernel_size: kernel size of both convolutions.
    """

    def __init__(self, n_filters, kernel_size):
        super().__init__()
        self.n_filters = n_filters # the number of filters represents the number of output channels AND the number of input channels
        self.kernel_size = kernel_size

        self.b1 = nn.Conv2d(self.n_filters, self.n_filters, self.kernel_size, padding='same')

        # The batch-norm settings mirror the Keras defaults (epsilon = 0.001,
        # momentum = 0.99). PyTorch defines momentum the other way round:
        #   running = (1 - momentum) * running + momentum * batch_stat
        # so the Keras value 0.99 corresponds to momentum = 0.01 here.
        self.b2 = nn.BatchNorm2d(self.n_filters, eps=0.001, momentum=0.01)
        self.b3 = nn.Conv2d(self.n_filters, self.n_filters, self.kernel_size, padding='same')
        self.b4 = nn.BatchNorm2d(self.n_filters, eps=0.001, momentum=0.01)

    def forward(self, x):
        """Apply the block to a (N, C, H, W) tensor and return a tensor of the same shape."""
        residual = x
        y = self.b2(F.relu(self.b1(x)))
        y = self.b4(F.relu(self.b3(y)))
        # Skip connection followed by the final non-linearity.
        return F.relu(y + residual)

In [None]:
from stable_baselines3.common import results_plotter
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.callbacks import BaseCallback

class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Saves the model whenever the mean training reward read from the Monitor
    logs reaches a new maximum. The check runs every ``check_freq`` calls.

    :param check_freq: (int) the check runs when ``n_calls`` is a multiple of it.
    :param log_dir: (str) Folder passed to ``load_results``; the save path is
      built inside it.
    :param agent_name: (str) Suffix appended to ``best_model_`` in the save path.
    :param verbose: (int) Values above 0 enable the progress printouts.
    """
    def __init__(self, check_freq: int, log_dir: str, agent_name: str, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.best_mean_reward = -np.inf
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model_' + agent_name)

    def _init_callback(self) -> None:
        # Make sure the target folder exists before training starts.
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        # Only act on every ``check_freq``-th call.
        if self.n_calls % self.check_freq != 0:
            return True

        # Training rewards as logged so far, indexed by timesteps.
        timesteps, rewards = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(timesteps) == 0:
            return True

        # Average over (at most) the 100 most recent entries.
        mean_reward = np.mean(rewards[-100:])
        chatty = self.verbose > 0
        if chatty:
            print("Num timesteps: {}".format(self.num_timesteps))
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

        # Nothing to save unless this is a strict improvement.
        if not mean_reward > self.best_mean_reward:
            return True

        self.best_mean_reward = mean_reward
        if chatty:
            print("Saving new best model to {}".format(self.save_path))
        self.model.save(self.save_path)
        return True

In [None]:
# Policy configuration: CustomCNN as features extractor (declared output width
# 518), ReLU activations and the layer sizes of the network that follows it.
policy_kwargs = {
    "features_extractor_class": CustomCNN,
    "features_extractor_kwargs": {"features_dim": 518},
    "activation_fn": torch.nn.ReLU,
    "net_arch": (518, 128, 32, 8),
}

In [None]:
# Training environment; every argument is forwarded by CustomEnv to Enviroment.
env = CustomEnv(
    obstacle_turn=True,
    vizualaze=True,
    Total_war=True,
    head_velocity=0.01,
    num_obs=6,
    num_enemy=1,
    size_obs=[10, 20],
    steps_limit=2000,
)

In [None]:
from stable_baselines3 import DQN

# Monitor logs, checkpoints of the best model and TensorBoard logs live here.
log_dir = './saved_models_disc_mult/DQN_with/'
os.makedirs(log_dir, exist_ok=True)

callback = SaveOnBestTrainingRewardCallback(check_freq  = 1000,
                                            log_dir     = log_dir,
                                            agent_name  = 'DQN')

# The callback reads the episode statistics that Monitor writes to log_dir.
# NOTE: this rebinds ``env``; re-running the cell wraps the env again.
env = Monitor(env, log_dir)

# The environment exposes a Dict observation space, which Stable-Baselines3
# only supports through 'MultiInputPolicy' ('MlpPolicy' is for flat spaces).
# The custom features extractor is still supplied via policy_kwargs.
# NOTE(review): the replay buffer keeps buffer_size 500x500x3 uint8 frames
# (about 7.5 GB per stored copy) -- confirm this fits into memory.
model = DQN(policy                  = 'MultiInputPolicy',
            env                     = env,
            learning_rate           = 0.0001,
            buffer_size             = 10000,
            batch_size              = 20,
            gamma                   = 0.99,
            tensorboard_log         = "./tensorboard_logs_disc_mult/",
            policy_kwargs           = policy_kwargs,
            verbose                 = 0,
            device                  = 'cuda')

In [None]:
# total_timesteps is documented as an int, so pass an integer instead of the float 1e6.
model.learn(total_timesteps=1_000_000, callback=callback)

## Record a video

In [None]:
# env.render(model,1)

## TensorBoard

In [None]:
# !tensorboard --logdir ./tensorboard_logs_disc_mult/

## load model

In [None]:
# model = PPO.load("./saved_models_disc_mult/TD3/best_model_PPo1", env=env)

## Play

In [None]:
# state = env.reset()
# done = False

In [None]:
# last_action = 1
# while not done:
#     state, reward, done, numstep = env.step(last_action)
# #     print(reward)#, done, state['target'])
#     action = input('action')
#     if action:
#         last_action = int(action)