Задача 1

Установить gym (python3 - https://github.com/openai/gym, для тех, кто делает на java - https://github.com/deeplearning4j/gym-java-client), реализовать среду из предыдущего семинара в gym, агенты - тигр и кролик. Реализовать задачу из семинара 2 в openai gym. 

Задача 2

Реализовать поведение "поиска" добычи тигром - тигр исследует карту и выслеживает добычу (оказывается в 3 клетках от нее - добыча выслежена). Далее он следует к добыче и пытается ее поймать (как на 1 семинаре). Если кролик уворачивается от тигра, то он отбегает на 5 клеток в любом направлении. Каждый раз, после неудачной ловли, тигр усовершенствует свой уровень охотника на 10 %. Для обеспечения поиска добычи использовать DQN.

In [1]:
import random
from collections import deque
import numpy as np
import gym
import my_gym
from my_gym.wrappers import FlattenGridObservation
from gym.utils.play import play
import pygame

import tensorflow as tf
from keras import layers, Sequential, losses, optimizers, models


class DQNetwork:
    """Deep Q-learning agent: online network, target network and replay memory.

    The caller appends transitions to ``replay_memory`` as
    ``(observation, action, reward, new_observation, done)`` sequences;
    ``train`` samples mini-batches from it and fits the online model, while
    ``update`` copies the online weights into the target model.
    """

    def __init__(self, observation_shape, action_size, name='DQNetwork'):
        """Build and compile the online model and clone it into the target model.

        Args:
            observation_shape: Shape of a single observation (no batch axis).
            action_size: Number of discrete actions (width of the Q-value output).
            name: Name given to the Keras model.
        """
        # Epsilon-greedy schedule: start fully random, decay after each fit.
        self.exploration_rate = 1.0
        self.exploration_rate_min = 0.01
        self.exploration_rate_decay = 0.995
        self.learning_rate = 0.001
        # Discount applied to the bootstrapped next-state value.
        self.gamma = 0.7
        self.batch_size = 32

        # Bounded buffer: the oldest transitions are dropped once it is full.
        self.replay_memory = deque(maxlen=50_000)

        # `name` is passed by keyword so it cannot be mistaken for another
        # positional parameter of `Sequential`.
        self.model = Sequential([
            layers.Input(shape=observation_shape),
            layers.Dense(24, activation='relu'),
            layers.Dense(24, activation='relu'),
            layers.Dense(action_size, activation='linear'),
        ], name=name)

        # No 'accuracy' metric: the network regresses Q-values, so a
        # classification metric carries no information here.
        self.model.compile(
            optimizer=optimizers.Adam(learning_rate=self.learning_rate),
            loss=losses.MeanSquaredError(),
        )
        # summary() prints on its own and returns None, so it is not wrapped
        # in print() (that would emit a stray "None").
        self.model.summary()

        self.target_model = models.clone_model(self.model)
        self.update()

    def update(self):
        """Copy the online model's weights into the target model."""
        self.target_model.set_weights(self.model.get_weights())

    def train(self):
        """Fit the online model on one mini-batch sampled from the replay memory.

        Does nothing until the memory holds at least ``batch_size``
        transitions. After a fit the exploration rate is decayed.
        """
        if len(self.replay_memory) < self.batch_size:
            return

        mini_batch = random.sample(self.replay_memory, self.batch_size)

        states = np.array([transition[0] for transition in mini_batch])
        actions = np.array([transition[1] for transition in mini_batch], dtype=int)
        rewards = np.array([transition[2] for transition in mini_batch], dtype=float)
        next_states = np.array([transition[3] for transition in mini_batch])
        dones = np.array([transition[4] for transition in mini_batch], dtype=bool)

        current_qs = self.model.predict(states, verbose=0)
        # Next-state values come from the target model, not the online one.
        future_qs = self.target_model.predict(next_states, verbose=0)

        target_qs = self._build_targets(current_qs, future_qs, actions, rewards, dones)

        self.model.fit(states, target_qs, batch_size=self.batch_size, verbose=0, shuffle=True)

        self._decay_exploration()

    def _build_targets(self, current_qs, future_qs, actions, rewards, dones):
        """Return the predictions with each taken action's value replaced by its Q-target."""
        target_qs = np.array(current_qs)  # copy; leave the prediction buffer untouched
        bootstrap = self.gamma * np.max(future_qs, axis=1)
        # Finished transitions get the bare reward, others add the discounted
        # best next-state value.
        target_qs[np.arange(len(actions)), actions] = rewards + np.where(dones, 0.0, bootstrap)
        return target_qs

    def _decay_exploration(self):
        """Multiply the exploration rate by the decay factor, never going below the minimum."""
        if self.exploration_rate > self.exploration_rate_min:
            self.exploration_rate = max(
                self.exploration_rate_min,
                self.exploration_rate * self.exploration_rate_decay,
            )

    def get_qs(self, observation):
        """Return the online model's Q-values for a single observation."""
        # Add a leading batch axis of size 1 whatever the observation's rank.
        batch = np.asarray(observation)[np.newaxis, ...]
        return self.model.predict(batch, verbose=0)[0]

    def egreedy_policy(self, observation, action_size):
        """Pick an action epsilon-greedily.

        With probability ``exploration_rate`` a uniformly random action in
        ``range(action_size)`` is returned, otherwise the action with the
        highest predicted Q-value.
        """
        if np.random.random() < self.exploration_rate:
            return int(np.random.choice(action_size))
        return int(np.argmax(self.get_qs(observation)))


# Create env (observations are passed through FlattenGridObservation before
# they reach the network).
# NOTE(review): 'GridWorld-v0' is presumably registered by `my_gym` - confirm.
base_env = gym.make('GridWorld-v0', size=5)
env = FlattenGridObservation(base_env)
env.action_space.seed(42)

# Or play env
# mapping = {(pygame.K_RIGHT,): 0, (pygame.K_DOWN,): 1, (pygame.K_LEFT,): 2, (pygame.K_UP,): 3}
# play(gym.make('GridWorld-v0', render_mode="rgb_array", size=5), keys_to_action=mapping, noop=4)

# Parameters
episodes = 1000
# Optional cap on steps per episode. None keeps episodes unbounded: they end
# only when the environment reports `terminated` or `truncated`.
max_steps_per_episode = None

# Instantiate the DQNetwork
dqn = DQNetwork(env.observation_space.shape, env.action_space.n)

# try/finally so the environment is closed even if training is interrupted
# (e.g. by KeyboardInterrupt).
try:
    for episode in range(episodes):
        observation, info = env.reset()
        done = False
        reward_sum = 0
        steps = 0

        while not done and (max_steps_per_episode is None or steps < max_steps_per_episode):
            steps += 1

            # Choose action
            action = dqn.egreedy_policy(observation, env.action_space.n)

            # Do the action
            new_observation, reward, terminated, truncated, info = env.step(action)
            reward_sum += reward
            done = terminated or truncated

            # Save in replay memory
            dqn.replay_memory.append((observation, action, reward, new_observation, done))

            # Update q_values
            dqn.train()

            # Update state
            observation = new_observation

        # Sync the target network with the online network once per episode.
        dqn.update()

        print(f"Episode {episode + 1}/{episodes}, Reward {reward_sum}")
finally:
    env.close()

  logger.warn("Matplotlib is not installed, run `pip install gym[other]`")


Metal device set to: Apple M1 Pro
Model: "DQNetwork"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 24)                624       
                                                                 
 dense_1 (Dense)             (None, 24)                600       
                                                                 
 dense_2 (Dense)             (None, 4)                 100       
                                                                 
Total params: 1,324
Trainable params: 1,324
Non-trainable params: 0
_________________________________________________________________


  logger.warn(
2023-01-06 03:19:29.246093: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-06 03:19:29.246186: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  if not isinstance(terminated, (bool, np.bool8)):
2023-01-06 03:19:29.351262: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-01-06 03:19:29.381029: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-01-06 03:19:29.427687: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is

None
Episode 1/1000, Reward 1983


2023-01-06 03:19:29.612501: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Episode 2/1000, Reward -2036
Episode 3/1000, Reward -2001


2023-01-06 03:19:31.979465: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Episode 4/1000, Reward -1530
Episode 5/1000, Reward -2028
Episode 6/1000, Reward -2001
Episode 7/1000, Reward 1986
Episode 8/1000, Reward -1967
Episode 9/1000, Reward -2175
Episode 10/1000, Reward -2995
Episode 11/1000, Reward -2579
Episode 12/1000, Reward -3523
Episode 13/1000, Reward -7168
Episode 14/1000, Reward -5767
Episode 15/1000, Reward 1819
Episode 16/1000, Reward -1667
Episode 17/1000, Reward -22029


KeyboardInterrupt: 

In [2]:
# Replay the trained agent greedily (no exploration) with on-screen rendering.
base_env = gym.make('GridWorld-v0', size=5, render_mode="human")
env = FlattenGridObservation(base_env)
env.action_space.seed(42)

# try/finally so the environment is closed even when the run is interrupted
# (a purely greedy policy may never reach a terminal state).
try:
    for episode in range(3):
        observation, info = env.reset()
        done = False

        while not done:
            # Always take the action with the highest predicted Q-value.
            action = np.argmax(dqn.get_qs(observation))
            new_observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            observation = new_observation
finally:
    env.close()

KeyboardInterrupt: 