# Laboratorium 4 (4 pkt.)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem wcześniej przygotowanych środowisk: *FrozenLake* i *Pacman* oraz środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [1]:
from collections import deque
import gym
import numpy as np
from tqdm import tqdm

Dołączenie bibliotek ze środowiskami:

In [2]:
from env.FrozenLakeMDP import frozenLake
from env.FrozenLakeMDPExtended import frozenLake as frozenLakeExtended

Dołączenie bibliotek do obsługi sieci neuronowych

In [3]:
import tensorflow as tf

## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenia jest zaimplementowanie algorytmu Deep Q-Network. Wartością oczekiwaną sieci jest:
\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \max_a Q(s_{t + 1}, a)
\end{equation}
</p>

In [4]:
class DQNAgent:
    """Epsilon-greedy Deep Q-Network agent with an experience-replay buffer.

    The agent wraps a compiled Keras model that maps a batch of states to one
    Q-value per action. Transitions are stored with ``remember`` and the model
    is fitted on random mini-batches of them in ``replay``.

    Args:
        action_size: Number of discrete actions; random actions are drawn
            from ``range(action_size)``.
        learning_rate: Stored on the instance only; this class never reads it.
        model: Callable Keras model exposing ``train_on_batch``.
    """

    def __init__(self, action_size, learning_rate, model):
        self.action_size = action_size
        # Replay buffer; once 2000 transitions are stored the oldest are dropped.
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        # NOTE: epsilon_min is stored but update_epsilon_value does not use it.
        self.epsilon_min = 0.01
        self.epsilon_decay_diff = 0.04  # amount subtracted from epsilon per update
        self.learning_rate = learning_rate
        self.model = model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        With probability ``self.epsilon`` a uniformly random action index is
        returned; otherwise the greedy action from ``get_best_action``.
        """
        if np.random.random() < self.epsilon:
            # Use the agent's own action count rather than a module-level
            # global, so the agent works regardless of notebook cell state.
            return np.random.choice(self.action_size)

        return self.get_best_action(state)

    def get_best_action(self, state):
        """Return the index of the highest Q-value the model predicts.

        Only the first row of the prediction is inspected, so ``state`` is
        expected to carry a leading batch axis holding a single sample.
        """
        prediction = self.model(state, training=False)
        best_action = tf.argmax(prediction[0]).numpy()
        return best_action

    def replay(self, batch_size):
        """Fit the model on a random mini-batch of stored transitions.

        For every sampled transition the target for the taken action is
        ``r + gamma * max_a Q(s', a)``, or just ``r`` when the transition
        ended the episode. The Q-values of the other actions keep the model's
        current predictions, so only the taken action contributes to the loss.
        One gradient step is performed on the whole batch.

        Does nothing when fewer than ``batch_size`` transitions are stored.
        """
        if len(self.memory) < batch_size:
            # Sampling without replacement would raise on a too-small buffer.
            return

        sample_idx = np.random.choice(len(self.memory), size=batch_size, replace=False)
        states, actions, rewards, next_states, dones = zip(
            *(self.memory[idx] for idx in sample_idx)
        )

        # Each stored state becomes one flat row; the row width is inferred
        # instead of being hard-coded to the 8x8 map's 64 states.
        states = np.stack([np.asarray(s) for s in states]).reshape((batch_size, -1))
        next_states = np.stack([np.asarray(s) for s in next_states]).reshape((batch_size, -1))
        actions = np.asarray(actions, dtype=np.int64)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        # Maximum over the action axis gives one bootstrap value per sample
        # (a global maximum would leak values between unrelated transitions).
        next_q = self.model(next_states, training=False).numpy()
        best_next_q = next_q.max(axis=1)
        targets = np.where(dones, rewards, rewards + self.gamma * best_next_q)

        predictions = self.model(states, training=False).numpy()
        predictions[np.arange(batch_size), actions] = targets

        # Calling the model only runs a forward pass; train_on_batch performs
        # the actual gradient update towards the targets.
        self.model.train_on_batch(states, predictions)

    def update_epsilon_value(self):
        """Linearly decay epsilon by ``epsilon_decay_diff``.

        Once epsilon is no longer greater than the decay step it is set to the
        fixed value 0.001.
        """
        if self.epsilon > self.epsilon_decay_diff:
            self.epsilon -= self.epsilon_decay_diff
        else:
            self.epsilon = 0.001

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLake*. Warstwa wejściowa powinna mieć tyle neuronów, ile jest możliwych stanów, a warstwa wyjściowa tyle neuronów, ile jest możliwych akcji do wykonania:

In [5]:
# Create the 8x8 FrozenLake environment and derive the network dimensions
# from it: one input per state, one output per action.
env = frozenLake("8x8")
state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.00025

# Fully connected Q-network: three ELU hidden layers followed by a linear
# layer producing one Q-value per action.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, input_shape=(state_size,), activation='elu'),
    tf.keras.layers.Dense(128, activation='elu'),
    tf.keras.layers.Dense(64, activation='elu'),
    tf.keras.layers.Dense(action_size, activation='linear'),
])
model.compile(
    loss=tf.keras.losses.mse,
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               16640     
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 4)                 260       
                                                                 
Total params: 58,052
Trainable params: 58,052
Non-trainable params: 0
_________________________________________________________________


 Czas nauczyć agenta poruszania się po środowisku *FrozenLake*, jako stan przyjmij wektor o liczbie elementów równej liczbie możliwych stanów, z wartością 1 ustawioną w komórce o indeksie równym aktualnemu stanowi, pozostałe elementy mają być wypełnione zerami:
* 1 pkt < 35 epok,
* 0.5 pkt < 60 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [6]:
# Epochs below this index train the network; later epochs only evaluate it.
TRAIN_EPOCHS = 35
TRAIN_EPISODES_PER_EPOCH = 1000
EVAL_EPISODES_PER_EPOCH = 100


def _one_hot_state(index, size):
    """Return a (1, size) float32 tensor with a single 1 at column ``index``."""
    encoded = np.zeros((1, size))
    encoded[0, index] = 1
    return tf.convert_to_tensor(encoded, dtype=tf.float32)


def _run_frozen_lake_episode(env, agent):
    """Play one episode, storing every transition, and return its total reward."""
    env.reset()
    n_states = env.get_number_of_states()
    # The state index is read from env._current_state after reset/step.
    state = _one_hot_state(env._current_state, n_states)
    total_reward = 0

    while True:
        action = agent.get_action(state)
        _, reward, done, _ = env.step(action)
        total_reward += reward
        next_state = _one_hot_state(env._current_state, n_states)

        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            return total_reward


agent = DQNAgent(action_size, learning_rate, model)

agent.epsilon = 0.75
agent.epsilon_decay_diff = 0.016
agent.gamma = 0.99
batch_size = 16
EPISODES = 100
for e in range(EPISODES):
    training = e < TRAIN_EPOCHS
    episodes = TRAIN_EPISODES_PER_EPOCH if training else EVAL_EPISODES_PER_EPOCH

    summary = []
    for _ in range(episodes):
        total_reward = _run_frozen_lake_episode(env, agent)

        # One replay step per finished episode, only while training and only
        # once the buffer holds more than a full batch.
        if training and len(agent.memory) > batch_size:
            agent.replay(batch_size)

        summary.append(total_reward)

    if training:
        agent.update_epsilon_value()

    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))

    if training:
        agent.model.save_weights('test')

    if mean_reward > 0.9:
        print ("You Win!")
        break

epoch #0	mean reward = 0.000	epsilon = 0.734
epoch #1	mean reward = 0.000	epsilon = 0.718
epoch #2	mean reward = 0.000	epsilon = 0.702
epoch #3	mean reward = 0.000	epsilon = 0.686
epoch #4	mean reward = 0.000	epsilon = 0.670
epoch #5	mean reward = 0.000	epsilon = 0.654
epoch #6	mean reward = 0.000	epsilon = 0.638
epoch #7	mean reward = 0.000	epsilon = 0.622
epoch #8	mean reward = 0.001	epsilon = 0.606
epoch #9	mean reward = 0.000	epsilon = 0.590
epoch #10	mean reward = 0.000	epsilon = 0.574
epoch #11	mean reward = 0.000	epsilon = 0.558
epoch #12	mean reward = 0.000	epsilon = 0.542
epoch #13	mean reward = 0.000	epsilon = 0.526
epoch #14	mean reward = 0.000	epsilon = 0.510
epoch #15	mean reward = 0.000	epsilon = 0.494
epoch #16	mean reward = 0.000	epsilon = 0.478
epoch #17	mean reward = 0.001	epsilon = 0.462
epoch #18	mean reward = 0.000	epsilon = 0.446
epoch #19	mean reward = 0.000	epsilon = 0.430
epoch #20	mean reward = 0.000	epsilon = 0.414


KeyboardInterrupt: 

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLakeExtended*, tym razem stan nie jest określany poprzez pojedynczą liczbę, a przez 3 tablice:
* pierwsza zawierająca informacje o celu,
* druga zawierająca informacje o dziurach,
* trzecia zawierająca informację o położeniu gracza.

In [None]:
# env = frozenLakeExtended("4x4")
#
# state_size = env.get_number_of_states()
# action_size = len(env.get_possible_actions(None))
# learning_rate = 0.00025
#
# model = tf.keras.Sequential()
# model.add(tf.keras.layers.Dense(32, input_shape=(3, state_size), activation='elu'))
# model.add(tf.keras.layers.Flatten())
# model.add(tf.keras.layers.Dense(64, activation='elu'))
# model.add(tf.keras.layers.Dense(32, activation='elu'))
# model.add(tf.keras.layers.Dense(action_size, activation='linear'))
# model.compile(loss=tf.keras.losses.mse, optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_rate), metrics=['accuracy'])#, run_eagerly=True)
# model.summary()

 Czas nauczyć agenta poruszania się po środowisku *FrozenLakeExtended*, jako stan przyjmij wektor składający się ze wszystkich trzech tablic (2 pkt.):

In [None]:
# class DQNAgent:
#     def __init__(self, action_size, learning_rate, model):
#         self.action_size = action_size
#         self.memory = deque(maxlen=2000)
#         self.gamma = 0.95    # discount rate
#         self.epsilon = 1.0  # exploration rate
#         self.epsilon_min = 0.01
#         self.epsilon_decay_diff = 0.03
#         # self.epsilon_decay_dot = 0.999
#         self.learning_rate = learning_rate
#         self.model = model
#
#     def remember(self, state, action, reward, next_state, done):
#         #Function adds information to the memory about last action and its results
#         self.memory.append((state, action, reward, next_state, done))
#
#     def get_action(self, state):
#         """
#         Compute the action to take in the current state, including exploration.
#         With probability self.epsilon, we should take a random action.
#             otherwise - the best policy action (self.get_best_action).
#
#         Note: To pick randomly from a list, use random.choice(list).
#               To pick True or False with a given probablity, generate uniform number in [0, 1]
#               and compare it with your probability
#         """
#
#         if np.random.random() < self.epsilon:
#             return np.random.choice(action_size)
#
#         return self.get_best_action(state)
#
#     def get_best_action(self, state):
#         """
#         Compute the best action to take in a state.
#         """
#
#         prediction = self.model(state, training=False)
#         best_action = tf.argmax(prediction[0]).numpy()
#         return best_action
#
#     def replay(self, batch_size):
#         """
#         Function learn network using randomly selected actions from the memory.
#         First calculates Q value for the next state and choose action with the biggest value.
#         Target value is calculated according to:
#                 Q(s,a) := (r + gamma * max_a(Q(s', a)))
#         except the situation when the next action is the last action, in such case Q(s, a) := r.
#         In order to change only those weights responsible for chosing given action, the rest values should be those
#         returned by the network for state state.
#         The network should be trained on batch_size samples.
#         """
#
#         sample_idx = np.random.choice(len(self.memory), size=batch_size, replace=False)
#         batch_list = [self.memory[idx] for idx in sample_idx]
#
#         for sample in batch_list:
#             state, action, reward, next_state, done = sample
#
#             if done:
#                 target = reward
#             else:
#                 target = reward + self.gamma * np.amax(self.model(next_state, training=False).numpy())
#
#             prediction = self.model(state, training=False).numpy()
#             prediction[0][action] = target
#             self.model(state, prediction)
#
#     def update_epsilon_value(self):
#         if self.epsilon > self.epsilon_decay_diff:
#             self.epsilon -= self.epsilon_decay_diff
#         else:
#             self.epsilon = 0.001

In [None]:
# agent = DQNAgent(action_size, learning_rate, model)
#
# agent.epsilon = 0.75
#
# batch_size = 16
# EPISODES = 2000
# for e in range(EPISODES):
#     summary = []
#     pbar = tqdm(range(100))
#     for _ in pbar:
#         total_reward = 0
#         env_state = np.array(env.reset())
#
#         state = tf.convert_to_tensor(env_state[np.newaxis, :], dtype=tf.float32)
#
#         for time in range(1000):
#             action = agent.get_action(state)
#             next_state_env, reward, done, _ = env.step(action)
#             total_reward += reward
#             next_state = tf.convert_to_tensor(np.array(next_state_env)[np.newaxis, :], dtype=tf.float32)
#             agent.remember(state, action, reward, next_state, done)
#             state = next_state
#             if done:
#                 break
#
#         if len(agent.memory) > batch_size:
#             agent.replay(batch_size)
#
#         summary.append(total_reward)
#         pbar.set_description(f'training epoch')
#
#     agent.update_epsilon_value()
#     print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
#     agent.model.save_weights('test')
#     if np.mean(summary) > 0.9:
#         print ("You Win!")
#         break

Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPole*](https://gym.openai.com/envs/CartPole-v0/):

In [None]:
class DQNAgent:
    """Epsilon-greedy Deep Q-Network agent with an experience-replay buffer.

    The agent wraps a compiled Keras model that maps a batch of states to one
    Q-value per action. Transitions are stored with ``remember`` and the model
    is fitted on random mini-batches of them in ``replay``.

    Args:
        action_size: Number of discrete actions; random actions are drawn
            from ``range(action_size)``.
        learning_rate: Stored on the instance only; this class never reads it.
        model: Callable Keras model exposing ``train_on_batch``.
    """

    def __init__(self, action_size, learning_rate, model):
        self.action_size = action_size
        # Replay buffer; once 2000 transitions are stored the oldest are dropped.
        self.memory = deque(maxlen=2000)
        self.gamma = 0.99    # discount rate
        self.epsilon = 1.0  # exploration rate
        # NOTE: epsilon_min is stored but update_epsilon_value does not use it.
        self.epsilon_min = 0.01
        self.epsilon_decay_diff = 0.03  # amount subtracted from epsilon per update
        self.learning_rate = learning_rate
        self.model = model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        With probability ``self.epsilon`` a uniformly random action index is
        returned; otherwise the greedy action from ``get_best_action``.
        """
        if np.random.random() < self.epsilon:
            # Use the agent's own action count rather than a module-level
            # global, so the agent works regardless of notebook cell state.
            return np.random.choice(self.action_size)

        return self.get_best_action(state)

    def get_best_action(self, state):
        """Return the index of the highest Q-value the model predicts.

        Only the first row of the prediction is inspected, so ``state`` is
        expected to carry a leading batch axis holding a single sample.
        """
        prediction = self.model(state, training=False)
        best_action = tf.argmax(prediction[0]).numpy()
        return best_action

    def replay(self, batch_size):
        """Fit the model on a random mini-batch of stored transitions.

        For every sampled transition the target for the taken action is
        ``r + gamma * max_a Q(s', a)``, or just ``r`` when the transition
        ended the episode. The Q-values of the other actions keep the model's
        current predictions, so only the taken action contributes to the loss.
        One gradient step is performed on the whole batch.

        Stored states are joined along their first axis, so each one is
        assumed to already have a leading batch axis of length 1 (the same
        shape ``get_best_action`` expects).

        Does nothing when fewer than ``batch_size`` transitions are stored.
        """
        if len(self.memory) < batch_size:
            # Sampling without replacement would raise on a too-small buffer.
            return

        sample_idx = np.random.choice(len(self.memory), size=batch_size, replace=False)
        states, actions, rewards, next_states, dones = zip(
            *(self.memory[idx] for idx in sample_idx)
        )

        states = np.concatenate([np.asarray(s) for s in states], axis=0)
        next_states = np.concatenate([np.asarray(s) for s in next_states], axis=0)
        actions = np.asarray(actions, dtype=np.int64)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        # One bootstrap value per sample: maximum over the action axis.
        next_q = self.model(next_states, training=False).numpy()
        best_next_q = next_q.max(axis=1)
        targets = np.where(dones, rewards, rewards + self.gamma * best_next_q)

        predictions = self.model(states, training=False).numpy()
        predictions[np.arange(batch_size), actions] = targets

        # Calling the model only runs a forward pass; train_on_batch performs
        # the actual gradient update towards the targets.
        self.model.train_on_batch(states, predictions)

    def update_epsilon_value(self):
        """Linearly decay epsilon by ``epsilon_decay_diff``.

        Once epsilon is no longer greater than the decay step it is set to the
        fixed value 0.001.
        """
        if self.epsilon > self.epsilon_decay_diff:
            self.epsilon -= self.epsilon_decay_diff
        else:
            self.epsilon = 0.001

In [None]:
# CartPole-v1 with the outermost wrapper removed via `.env`; the network
# dimensions are read from the environment's observation and action spaces.
env = gym.make("CartPole-v1").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

# Small fully connected Q-network: two ReLU hidden layers and a linear
# output layer with one Q-value per action.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, input_shape=(state_size,), activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(action_size, activation='linear'),
])
model.compile(
    loss=tf.keras.losses.mse,
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    metrics=['accuracy'],
)
model.summary()

Czas nauczyć agenta gry w środowisku *CartPole*:
* 1 pkt < 10 epok,
* 0.5 pkt < 20 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [None]:
# Fresh agent for CartPole with a lower starting exploration rate and a
# larger per-epoch epsilon decrement than the class defaults.
agent = DQNAgent(action_size, learning_rate, model)
agent.epsilon = 0.5
agent.epsilon_decay_diff = 0.055

batch_size = 64
EPISODES = 1000
for e in range(EPISODES):
    summary = []
    pbar = tqdm(range(100))
    for _ in pbar:
        # The first item returned by env.reset() is used as the observation.
        observation = env.reset()[0]
        state = tf.convert_to_tensor(observation[np.newaxis, :], dtype=tf.float32)
        total_reward = 0

        # Every episode is capped at 1000 steps.
        steps_left = 1000
        while steps_left > 0:
            steps_left -= 1
            action = agent.get_action(state)
            observation, reward, done, _, _ = env.step(action)
            total_reward += reward
            next_state = tf.convert_to_tensor(observation[np.newaxis, :], dtype=tf.float32)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        # One replay call per episode, once the buffer exceeds a full batch.
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

        summary.append(total_reward)
        pbar.set_description('training epoch')

    agent.update_epsilon_value()
    mean_reward = np.mean(summary)
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, mean_reward, agent.epsilon))
    agent.model.save_weights('test')
    if mean_reward > 195:
        print ("You Win!")
        break