In [36]:
import numpy as np
import cv2
from PIL import Image
import time
import pickle
import matplotlib.pyplot as plt
from gym import spaces
from matplotlib import style
import gym
import numpy as np
from stable_baselines3 import DQN,A2C
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
import torch
# Apply matplotlib's 'ggplot' style sheet to figures drawn after this point.
style.use('ggplot')

In [37]:
class Cube:
    """A one-cell entity (player, food or enemy) living on a square grid.

    The position is stored in ``x`` and ``y`` and is always kept inside
    ``0 .. size - 1``. Defining ``__eq__`` without ``__hash__`` makes
    instances unhashable, which is fine for the mutable positions used here.
    """

    # Displacement (dx, dy) applied for each discrete action id.
    _ACTION_DELTAS = (
        (0, (1, 1)),
        (1, (-1, 1)),
        (2, (1, -1)),
        (3, (-1, -1)),
        (4, (0, 1)),
        (5, (0, -1)),
        (6, (1, 0)),
        (7, (-1, 0)),
        (8, (0, 0)),   # stay in place
    )

    def __init__(self, size):
        """Place the cube at a uniformly random cell of a ``size`` x ``size`` grid."""
        self.size = size
        # randint's upper bound is exclusive, so passing `size` covers every
        # cell 0 .. size-1 (passing size-1 would never use the last row/column).
        self.x = np.random.randint(0, self.size)
        self.y = np.random.randint(0, self.size)

    def __str__(self):
        """Return the position formatted as ``"x,y"``."""
        return f'{self.x},{self.y}'

    def __sub__(self, other):
        """Return the offset ``(dx, dy)`` from ``other`` to this cube as a tuple."""
        return (self.x - other.x, self.y - other.y)

    def __eq__(self, other):
        """Two cubes are equal when they occupy the same cell."""
        if not isinstance(other, Cube):
            return NotImplemented
        return self.x == other.x and self.y == other.y

    def action(self, choise):
        """Apply the move associated with action id ``choise`` (0-8).

        Ids 0-3 are diagonal moves, 4-7 are axis-aligned moves and 8 keeps
        the cube where it is. Unknown ids are ignored.
        """
        # Compare with == rather than using a dict lookup so that unhashable
        # NumPy values (e.g. a 0-d array) are accepted as action ids.
        for action_id, (dx, dy) in self._ACTION_DELTAS:
            if choise == action_id:
                self.move(x=dx, y=dy)
                return

    def move(self, x=None, y=None):
        """Shift the cube by ``(x, y)`` and clamp it to the grid.

        An axis left as ``None`` (or the legacy ``False`` default) takes a
        random step drawn from ``{-1, 0, 1}``. An explicit ``0`` means
        "do not move on this axis".
        """
        # Identity checks instead of truthiness: `not 0` is True, which used to
        # turn every explicit zero displacement into a random step.
        if x is None or x is False:
            x = np.random.randint(-1, 2)
        if y is None or y is False:
            y = np.random.randint(-1, 2)

        # Keep the cube inside the grid.
        self.x = min(max(self.x + x, 0), self.size - 1)
        self.y = min(max(self.y + y, 0), self.size - 1)

class envCube(gym.Env):
    """Grid-world environment with a player, a piece of food and an enemy.

    The player is driven by one of ``ACTION_SPACE_VALUES`` discrete actions,
    while food and enemy take a random step every turn. An episode ends when
    the player lands on the food, lands on the enemy, or after
    ``MAX_EPISODE_STEPS`` steps.

    Observations are either the 4-vector
    ``(player - food) + (player - enemy)`` or, when ``RETURN_IMAGE`` is true,
    a ``SIZE x SIZE x 3`` uint8 image of the board.
    """

    SIZE = 20
    # Shape of the vector observation; the image shape is derived from SIZE.
    OBSERVATION_SPACE_VALUES = (4,)
    ACTION_SPACE_VALUES = 9
    RETURN_IMAGE = False     # whether observations are images instead of offsets

    FOOD_REWARD = 25  # reward when the player reaches the food
    ENEMY_PENALITY = -300  # penalty when the player runs into the enemy
    MOVE_PENALITY = -1  # penalty for every other step
    MAX_EPISODE_STEPS = 200  # step count at which an episode is cut off

    # Colours of the three entities, in the channel order cv2.imshow displays
    # (blue, green, red).
    d = {1: (255, 0, 0),  # blue
         2: (0, 255, 0),  # green
         3: (0, 0, 255)}  # red
    PLAYER_N = 1
    FOOD_N = 2
    ENEMY_N = 3

    def __init__(self):
        """Declare the action space and the observation space."""
        super().__init__()
        self.action_space = spaces.Discrete(self.ACTION_SPACE_VALUES)
        if self.RETURN_IMAGE:
            # Must match what get_image() produces, otherwise the declared
            # space disagrees with the observations reset()/step() return.
            self.observation_space = spaces.Box(low=0, high=255,
                                                shape=(self.SIZE, self.SIZE, 3),
                                                dtype=np.uint8)
        else:
            # Each offset component lies in -(SIZE-1) .. SIZE-1.
            self.observation_space = spaces.Box(low=-self.SIZE+1, high=self.SIZE-1,
                                                shape=self.OBSERVATION_SPACE_VALUES, dtype=int)

    def _get_observation(self):
        """Build the current observation (image or offset vector) as an array."""
        if self.RETURN_IMAGE:
            return np.array(self.get_image())
        # Tuple concatenation: (dx_food, dy_food, dx_enemy, dy_enemy).
        return np.array((self.player - self.food) + (self.player - self.enemy))

    # Reset the environment
    def reset(self):
        """Start a new episode and return the initial observation."""
        self.player = Cube(self.SIZE)
        self.food = Cube(self.SIZE)
        self.enemy = Cube(self.SIZE)
        # Re-draw the food until it does not share a cell with the player.
        while self.player == self.food:
            self.food = Cube(self.SIZE)
        # Re-draw the enemy until it shares a cell with neither player nor food.
        while self.player == self.enemy or self.food == self.enemy:
            self.enemy = Cube(self.SIZE)
        self.episode_step = 0

        return self._get_observation()

    def step(self, action):
        """Advance one step.

        Returns:
            ``(observation, reward, done, info)`` where ``info`` is an empty dict.
        """
        self.episode_step += 1
        self.player.action(action)
        self.enemy.move()
        self.food.move()
        new_observation = self._get_observation()

        # Reward: food takes precedence if food and enemy share the player's cell.
        got_food = self.player == self.food
        hit_enemy = self.player == self.enemy
        if got_food:
            reward = self.FOOD_REWARD
        elif hit_enemy:
            reward = self.ENEMY_PENALITY
        else:
            reward = self.MOVE_PENALITY

        done = bool(got_food or hit_enemy
                    or self.episode_step >= self.MAX_EPISODE_STEPS)

        return new_observation, reward, done, {}

    def render(self, mode='human'):
        """Show the board, scaled to 800x800, in an OpenCV window."""
        img = self.get_image()
        img = img.resize((800, 800))
        cv2.imshow('Predator', np.array(img))
        cv2.waitKey(1)

    def get_image(self):
        """Return the board as a PIL image with one coloured pixel per entity."""
        env = np.zeros((self.SIZE, self.SIZE, 3), dtype=np.uint8)
        # Drawn in this order, so the enemy colour wins on overlapping cells.
        env[self.food.x][self.food.y] = self.d[self.FOOD_N]
        env[self.player.x][self.player.y] = self.d[self.PLAYER_N]
        env[self.enemy.x][self.enemy.y] = self.d[self.ENEMY_N]
        img = Image.fromarray(env, 'RGB')
        return img

    def get_qtable(self, q_table_name=None):
        """Return a Q-table keyed by the 4-tuple offset observation.

        Args:
            q_table_name: path of a pickled table to load. When ``None`` a
                new table is created with random integer values in
                ``[-5, -1]`` for each of the ``ACTION_SPACE_VALUES`` actions.
        """
        if q_table_name is None:  # no table supplied: initialise one randomly
            q_table = {}
            offsets = range(-self.SIZE + 1, self.SIZE)
            for x1 in offsets:
                for y1 in offsets:
                    for x2 in offsets:
                        for y2 in offsets:
                            q_table[(x1, y1, x2, y2)] = [np.random.randint(-5, 0)
                                                         for _ in range(self.ACTION_SPACE_VALUES)]
        else:  # a table was supplied: load it
            # SECURITY: pickle.load can execute arbitrary code; only load
            # Q-tables from files you created or otherwise trust.
            with open(q_table_name, 'rb') as f:
                q_table = pickle.load(f)
        return q_table





In [38]:
# Build the environment instance that the following cells validate, train on and roll out.
env = envCube()

In [39]:
# Draw one random action and one random observation to eyeball the declared
# spaces, then run stable-baselines3's environment checker on the env.
sampled_action = env.action_space.sample()
print(sampled_action)
sampled_observation = env.observation_space.sample()
print(sampled_observation)
check_env(env)


8
[  0   7 -19 -13]


In [40]:
# Instantiate the agent
# Network layout: one shared 32-unit layer, then a 16-unit policy (pi) branch
# and a 32-unit value (vf) branch, all using ReLU activations.
policy_kwargs = {
    'net_arch': [32, dict(vf=[32], pi=[16])],
    'activation_fn': torch.nn.ReLU,
}
model = A2C(
    'MlpPolicy',
    env,
    learning_rate=5e-4,
    policy_kwargs=policy_kwargs,
    tensorboard_log='./logs',
    verbose=1,
)
print(model.policy)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential(
      (0): Linear(in_features=4, out_features=32, bias=True)
      (1): ReLU()
    )
    (policy_net): Sequential(
      (0): Linear(in_features=32, out_features=16, bias=True)
      (1): ReLU()
    )
    (value_net): Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
      (1): ReLU()
    )
  )
  (action_net): Linear(in_features=16, out_features=9, bias=True)
  (value_net): Linear(in_features=32, out_features=1, bias=True)
)


In [41]:
# Train the agent
TOTAL_TIMESTEPS = 500_000
model.learn(total_timesteps=TOTAL_TIMESTEPS, tb_log_name='A2C')
# Save the agent
# NOTE(review): the file name says "size30" while envCube.SIZE is 20 - confirm
# which one is intended before renaming (the load cell uses the same name).
model.save("a2c_net32_size30_predator")
# Drop the in-memory model so the next cell demonstrates loading from disk.
del model

Logging to ./logs\A2C_13
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 109      |
|    ep_rew_mean        | -333     |
| time/                 |          |
|    fps                | 205      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -2.2     |
|    explained_variance | 0.768    |
|    learning_rate      | 0.0005   |
|    n_updates          | 99       |
|    policy_loss        | -4.24    |
|    value_loss         | 4.07     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 140      |
|    ep_rew_mean        | -311     |
| time/                 |          |
|    fps                | 262      |
|    iterations         | 200      |
|    time_elapsed       | 3        |
|    total_timesteps    | 1000     |
| train/     

In [42]:
# Restore the saved A2C model from disk and attach the existing environment to it.
model = A2C.load("a2c_net32_size30_predator", env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [45]:
# Evaluate the agent
# NOTE: If you use wrappers with your environment that modify rewards,
#       this will be reflected here. To evaluate with original rewards,
#       wrap environment in a "Monitor" wrapper before other wrappers.
eval_env = model.get_env()
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
    render=True,
)
print(mean_reward, std_reward)

-25.8 93.0782466530177


In [46]:
episode = 10
# Enjoy trained agent
for _ in range(episode):
    # Reset at the start of EVERY episode. `env` is the raw envCube, so nothing
    # resets it automatically: without this, later episodes would start from
    # the previous terminal state and episode_step would keep counting toward
    # the step cap across episodes.
    obs = env.reset()
    done = False
    episode_reward = 0
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        episode_reward += reward
    # Total (undiscounted) reward collected in this episode.
    print(episode_reward)

-302
-300
11
25
20
23
23
15
17
21
