In [31]:
import os
import random
from collections import deque

import numpy as np
import cv2
from PIL import Image
import time
import pickle
import matplotlib.pyplot as plt
from keras.callbacks import TensorBoard
from matplotlib import style
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Conv2D
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.callbacks import ModelIntervalCheckpoint
from rl.policy import BoltzmannQPolicy, LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from tqdm import tqdm
import tensorflow as tf
style.use('ggplot')
from keras import mixed_precision
# mixed_precision.set_global_policy('mixed_float16')

In [32]:
class Cube:
    """A one-cell entity (player, food or enemy) living on a square grid.

    Both coordinates are integers kept inside ``[0, size - 1]``.
    """

    # Displacement (dx, dy) applied for each discrete action id.
    _ACTION_DELTAS = {
        0: (1, 1),
        1: (-1, 1),
        2: (1, -1),
        3: (-1, -1),
        4: (0, 1),
        5: (0, -1),
        6: (1, 0),
        7: (-1, 0),
        8: (0, 0),
    }

    def __init__(self, size):
        """Place the cube on a uniformly random cell of a ``size`` x ``size`` grid."""
        self.size = size
        # np.random.randint excludes its upper bound, so ``size`` (not
        # ``size - 1``) is required for the last row/column to be reachable.
        self.x = np.random.randint(0, self.size)
        self.y = np.random.randint(0, self.size)

    def __str__(self):
        return f'{self.x},{self.y}'

    def __sub__(self, other):
        """Return the offset ``(dx, dy)`` from ``other`` to this cube as a tuple."""
        return (self.x - other.x, self.y - other.y)

    def __eq__(self, other):
        """Two cubes are equal when they occupy the same cell."""
        return self.x == other.x and self.y == other.y

    def action(self, choise):
        """Apply discrete action ``choise`` (0-8); unknown ids are ignored.

        Ids 0-3 are diagonal moves, 4-7 are axis-aligned moves and 8 stays put.
        """
        delta = self._ACTION_DELTAS.get(choise)
        if delta is not None:
            self.move(x=delta[0], y=delta[1])

    @staticmethod
    def _resolve_step(step):
        """Return ``step``, or a random step in {-1, 0, 1} when it was not given."""
        # Identity checks on purpose: an explicit 0 means "do not move on this
        # axis" and must not be mistaken for the "unset" default.
        if step is None or step is False:
            return np.random.randint(-1, 2)
        return step

    def move(self, x=False, y=False):
        """Shift the cube by ``(x, y)`` and clamp it to the grid.

        An axis left at its default (``False``/``None``) receives a random step
        in {-1, 0, 1}; any explicit number, including 0, is applied as given.
        """
        dx = self._resolve_step(x)
        dy = self._resolve_step(y)

        # Clamp so the cube never leaves the board.
        self.x = min(max(self.x + dx, 0), self.size - 1)
        self.y = min(max(self.y + dy, 0), self.size - 1)


class envCube:
    """Grid world in which a player chases food while avoiding an enemy.

    ``reset`` and ``step`` expose a gym-like interface. Observations are
    either the rendered board scaled by 1/255 (``RETURN_IMAGE = True``) or a
    4-tuple of relative offsets (player-food, player-enemy).
    """

    SIZE = 10
    # Built from SIZE when the class body runs; overriding SIZE in a subclass
    # does not update this tuple.
    OBSERVATION_SPACE_VALUES = (SIZE, SIZE, 3)
    # OBSERVATION_SPACE_VALUES = (4,)
    ACTION_SPACE_VALUES = 9
    RETURN_IMAGE = True  # whether observations are images or offset tuples

    FOOD_REWARD = 25  # reward when the player reaches the food
    ENEMY_PENALITY = -300  # penalty when the player runs into the enemy
    MOVE_PENALITY = -1  # penalty for every other step
    MAX_EPISODE_STEPS = 200  # an episode is cut off after this many steps

    # Colour of each entity. The colour names presumably assume the BGR
    # channel order used by cv2.imshow in render().
    d = {1: (255, 0, 0),  # blue
         2: (0, 255, 0),  # green
         3: (0, 0, 255)}  # red
    PLAYER_N = 1
    FOOD_N = 2
    ENEMY_N = 3

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.player = Cube(self.SIZE)
        self.food = Cube(self.SIZE)
        self.enemy = Cube(self.SIZE)
        # Re-draw the food until it does not sit on the player.
        while self.player == self.food:
            self.food = Cube(self.SIZE)
        # Re-draw the enemy until it sits on neither the player nor the food.
        while self.player == self.enemy or self.food == self.enemy:
            self.enemy = Cube(self.SIZE)

        observation = self._get_observation()
        self.episode_step = 0
        return observation

    def step(self, action):
        """Advance the world by one tick.

        The player applies ``action``; enemy and food then call ``move()``
        without arguments.

        Returns:
            ``(observation, reward, done, info)`` where ``info`` is an empty
            dict and ``done`` is True when the player reaches the food, hits
            the enemy, or ``MAX_EPISODE_STEPS`` steps have elapsed.
        """
        self.episode_step += 1
        self.player.action(action)
        self.enemy.move()
        self.food.move()

        new_observation = self._get_observation()

        reached_food = self.player == self.food
        hit_enemy = self.player == self.enemy

        # Food takes precedence over the enemy when all three share a cell.
        if reached_food:
            reward = self.FOOD_REWARD
        elif hit_enemy:
            reward = self.ENEMY_PENALITY
        else:
            reward = self.MOVE_PENALITY

        done = bool(reached_food or hit_enemy
                    or self.episode_step >= self.MAX_EPISODE_STEPS)

        return new_observation, reward, done, {}

    def _get_observation(self):
        """Build the observation for the current positions."""
        if self.RETURN_IMAGE:
            # Scale the uint8 image by 1/255.
            return np.array(self.get_image()) / 255
        # Tuple concatenation: (dx_food, dy_food, dx_enemy, dy_enemy).
        return (self.player - self.food) + (self.player - self.enemy)

    def render(self, mode='human'):
        """Show the board, upscaled to 800x800, in an OpenCV window."""
        img = self.get_image()
        img = img.resize((800, 800))
        cv2.imshow('Predator', np.array(img))
        cv2.waitKey(1)

    def get_image(self):
        """Return the board as a ``SIZE`` x ``SIZE`` PIL image."""
        env = np.zeros((self.SIZE, self.SIZE, 3), dtype=np.uint8)
        # Drawing order matters when entities overlap: later writes win.
        env[self.food.x][self.food.y] = self.d[self.FOOD_N]
        env[self.player.x][self.player.y] = self.d[self.PLAYER_N]
        env[self.enemy.x][self.enemy.y] = self.d[self.ENEMY_N]
        img = Image.fromarray(env, 'RGB')
        return img

    def get_qtable(self, q_table_name=None):
        """Return a Q-table keyed by ``(dx_food, dy_food, dx_enemy, dy_enemy)``.

        Args:
            q_table_name: path of a pickled table to load. When ``None`` a new
                table is created with random integer values in ``[-5, -1]``
                for each of the ``ACTION_SPACE_VALUES`` actions.
        """
        if q_table_name is None:
            offsets = range(-self.SIZE + 1, self.SIZE)
            q_table = {}
            for x1 in offsets:
                for y1 in offsets:
                    for x2 in offsets:
                        for y2 in offsets:
                            q_table[(x1, y1, x2, y2)] = [
                                np.random.randint(-5, 0)
                                for _ in range(self.ACTION_SPACE_VALUES)
                            ]
        else:
            # SECURITY: pickle.load can execute arbitrary code; only load
            # Q-table files that come from a trusted source.
            with open(q_table_name, 'rb') as f:
                q_table = pickle.load(f)
        return q_table



In [33]:
class ModifiedTensorBoard(TensorBoard):
    """TensorBoard callback that reuses one writer and its own step counter.

    It is meant for training code that calls ``fit()`` many times: scalars are
    written under ``self.step`` rather than the epoch index of each call.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``TensorBoard`` and open a single file writer."""
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.create_file_writer(self.log_dir)
        self._log_write_dir = self.log_dir

    def set_model(self, model):
        """Attach ``model`` without calling the parent implementation.

        The step counter is advanced on every call (presumably once per
        ``fit()``; confirm against the Keras version in use).
        """
        self.model = model
        self._train_dir = os.path.join(self._log_write_dir, 'train')
        self.step += 1
        self._should_write_train_graph = False

    def on_epoch_end(self, epoch, logs=None):
        """Write the epoch's metrics under the custom step number."""
        # ``logs`` may be None; unpacking None with ** would raise TypeError.
        self.update_stats(**(logs or {}))

    def on_batch_end(self, batch, logs=None):
        """Nothing is written per batch."""
        pass

    def on_train_end(self, logs=None):
        """Intentionally empty so the shared writer stays open."""
        pass

    def on_train_batch_end(self, batch, logs=None):
        """Nothing is written per training batch."""
        pass

    def update_stats(self, **stats):
        """Write arbitrary named scalars at the current ``self.step``."""
        self._write_logs(stats, self.step)

    def _write_logs(self, logs, index):
        """Write every ``name -> value`` pair in ``logs`` as a scalar at step ``index``."""
        with self.writer.as_default():
            for name, value in logs.items():
                tf.summary.scalar(name, value, step=index)
            # One flush after all scalars instead of one per scalar.
            self.writer.flush()

In [34]:
# Shared environment instance; the training loop in the next cell calls
# env.reset() and env.step() on it.
env = envCube()

class DQNAgent():
    """Minimal DQN agent with an online network, a target network and replay memory.

    Note: this class shadows ``rl.agents.dqn.DQNAgent`` imported at the top of
    the notebook.
    """

    REPLAY_MEMORY_SIZE = 100  # replay capacity; training starts once it is full
    MINI_REPLAY_MEMORY_SIZE = 32  # batch_size passed to model.fit
    DISCOUNT = 0.95  # discount factor applied to the bootstrapped future value
    UPDATE_TARGET_MODEL_SIZE_EVERY = 5  # threshold on terminal-state train() calls

    def __init__(self):
        """Build both networks with identical weights and an empty replay memory."""
        self.model = self.create_model()
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())
        # Capacity follows the class constant so that it cannot drift from the
        # "memory is full" check in train().
        self.replay_memory = deque(maxlen=self.REPLAY_MEMORY_SIZE)
        self.update_target_model_count = 0
        # self.tensorboard = ModifiedTensorBoard(log_dir=f'dqn_model_{int(time.time())}')
        self.tensorboard = TensorBoard()

    def create_model(self):
        """Return a compiled conv net mapping an observation to one Q-value per action."""
        model = Sequential()
        model.add(Conv2D(32,(3,3),activation='relu',input_shape=envCube.OBSERVATION_SPACE_VALUES))
        # Redundant with activation='relu' above (ReLU of ReLU is a no-op);
        # kept so the layer stack stays unchanged.
        model.add(Activation('relu'))
        model.add(Conv2D(32, (3, 3)))
        model.add(Activation('relu'))
        model.add(Flatten())
        model.add(Dense(32))
        model.add(Activation('relu'))
        model.add(Dense(32))
        model.add(Activation('relu'))
        model.add(Dense(envCube.ACTION_SPACE_VALUES, activation='linear'))
        model.compile(loss='mse',optimizer='Adam',metrics=['accuracy'])
        return model

    def train(self, terminal_state):
        """Run one fitting pass over transitions sampled from replay memory.

        Does nothing until the replay memory holds ``REPLAY_MEMORY_SIZE``
        transitions.

        Args:
            terminal_state: True when the most recent step ended an episode;
                enables the TensorBoard callback and advances the
                target-network sync counter.
        """
        if len(self.replay_memory) < self.REPLAY_MEMORY_SIZE:
            return
        # NOTE(review): this draws REPLAY_MEMORY_SIZE samples, i.e. the whole
        # buffer in random order; MINI_REPLAY_MEMORY_SIZE is only the fit
        # batch size. Confirm that this is intended.
        minibatch = random.sample(self.replay_memory, self.REPLAY_MEMORY_SIZE)

        # envCube.reset/step already divide image observations by 255.
        # Dividing again here would train on a different input scale than
        # action_q_values_predict() uses.
        obs_batch = np.array([transition[0] for transition in minibatch])
        new_obs_batch = np.array([transition[3] for transition in minibatch])

        q_values_current = self.model.predict(obs_batch)
        q_values_future = self.target_model.predict(new_obs_batch)

        # Only the Q-value of the action actually taken is replaced by its
        # target; the other outputs keep the network's own prediction.
        for index, (_, action, reward, _, done) in enumerate(minibatch):
            if done:
                target = reward
            else:
                target = reward + self.DISCOUNT * np.max(q_values_future[index])
            q_values_current[index][action] = target

        self.model.fit(
            obs_batch,
            q_values_current,
            batch_size=self.MINI_REPLAY_MEMORY_SIZE,
            shuffle=False,
            verbose=0,
            callbacks=[self.tensorboard] if terminal_state else None,
        )

        if terminal_state:
            self.update_target_model_count += 1
        # NOTE(review): with ">" the sync happens once the counter exceeds the
        # threshold (on the 6th terminal call for a threshold of 5).
        if self.update_target_model_count > self.UPDATE_TARGET_MODEL_SIZE_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.update_target_model_count = 0

    def update_replay_memory(self, transition):
        """Append ``(obs, action, reward, new_obs, done)``; the oldest entry is dropped when full."""
        return self.replay_memory.append(transition)

    def action_q_values_predict(self, obs):
        """Return the online network's Q-values for a single observation."""
        # Convert first so non-ndarray observations work too, then add the
        # batch axis that predict() expects.
        obs = np.asarray(obs)
        return self.model.predict(obs[np.newaxis, ...])[0]

In [35]:
# --- Training run: epsilon-greedy DQN on the shared `env` ---
agent = DQNAgent()
EPISODES = 3000    # number of episodes to play
epsilon = 1        # exploration rate; multiplied by EPS_DECAY after each episode
EPS_DECAY = 0.995
SHOW_EVERY = 30    # reporting interval in episodes (only used by the disabled logging below)
episode_rewards = []

for episode in tqdm(range(EPISODES)):
    obs = env.reset()
    episode_reward = 0
    done = False
    while not done:
        if np.random.random() <= epsilon:
            # Explore: pick a uniformly random action.
            action = np.random.randint(0, env.ACTION_SPACE_VALUES)
        else:
            # Exploit: pick the action with the highest predicted Q-value.
            action = np.argmax(agent.action_q_values_predict(obs))

        new_obs, reward, done, _ = env.step(action)
        transition = (obs, action, reward, new_obs, done)
        agent.update_replay_memory(transition)

        # One training call per environment step; `done` marks terminal steps.
        agent.train(done)
        episode_reward += reward
        obs = new_obs

    # print('episode ',episode,'episode_reward:',episode_reward)

    episode_rewards.append(episode_reward)
    # Decay exploration, but never below 0.001.
    epsilon = max(epsilon * EPS_DECAY, 0.001)
    # Disabled stats logging (update_stats is defined on ModifiedTensorBoard only):
    # if episode % SHOW_EVERY == 0:
    #     agent.tensorboard.update_stats(avg_reward=np.mean(episode_rewards[-SHOW_EVERY:]),max_reward=np.max(episode_rewards[-SHOW_EVERY:]),min_reward=np.min(episode_rewards[-SHOW_EVERY:]),epsilon=epsilon)

  updates=self.state_updates,
  6%|▌         | 166/3000 [06:07<1:44:33,  2.21s/it]

KeyboardInterrupt



In [None]:
# Plot the moving average of the per-episode reward.
SHOW_EVERY = 300
# Clamp the window to the number of recorded episodes. With mode='valid' and a
# kernel longer than the data, np.convolve swaps its operands and the result is
# no longer a moving average (e.g. after an interrupted training run).
window = min(SHOW_EVERY, len(episode_rewards))
if window == 0:
    # np.convolve raises on empty input, so there is nothing to plot.
    moving_avg = np.array([])
    print('No episode rewards recorded yet; nothing to plot.')
else:
    moving_avg = np.convolve(episode_rewards, np.ones(window) / window, mode='valid')
    print(len(moving_avg))
    plt.plot(np.arange(len(moving_avg)), moving_avg)
    plt.xlabel('episode #')
    plt.ylabel(f'mean{window} reward')
    plt.show()


