# Laboratorium 4 (4 pkt.)

Celem czwartego laboratorium jest zapoznanie się oraz zaimplementowanie algorytmów głębokiego uczenia aktywnego. Zaimplementowane algorytmy będą testowane z wykorzystaniem wcześniej przygotowanych środowisk: *FrozenLake* i *Pacman* oraz środowiska z OpenAI - *CartPole*.


Dołączenie standardowych bibliotek

In [12]:
from collections import deque
import gym
import numpy as np
from tqdm import tqdm

Dołączenie bibliotek ze środowiskami:

In [13]:
from env.FrozenLakeMDP import frozenLake
from env.FrozenLakeMDPExtended import frozenLake as frozenLakeExtended

Dołączenie bibliotek do obsługi sieci neuronowych

In [14]:
import tensorflow as tf

## Zadanie 1 - Deep Q-Network

<p style='text-align: justify;'>
Celem ćwiczenie jest zaimplementowanie algorytmu Deep Q-Network. Wartoscią oczekiwaną sieci jest:
\begin{equation}
        Q(s_t, a_t) = r_{t+1} + \gamma \text{max}_a Q(s_{t + 1}, a)
\end{equation}
</p>

In [15]:
class DQNAgent:
    def __init__(self, action_size, learning_rate, model, environment):
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.99    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.9995
        self.learning_rate = learning_rate
        self.model = model
        self.env = environment

    def remember(self, state, action, reward, next_state, done):
        #Function adds information to the memory about last action and its results
        self.memory.append((state, action, reward, next_state, done)) 

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon, we should take a random action.
            otherwise - the best policy action (self.get_best_action).

        Note: To pick randomly from a list, use random.choice(list).
              To pick True or False with a given probablity, generate uniform number in [0, 1]
              and compare it with your probability
        """

        if np.random.random() < self.epsilon:
            return np.random.choice(4)

        return self.get_best_action(state)


  
    def get_best_action(self, state):
        """
        Compute the best action to take in a state.
        """

        prediction = self.model(state, training=False)
        best_action = tf.argmax(prediction[0]).numpy()

        return best_action

    def replay(self, batch_size):
        """
        Function learn network using randomly selected actions from the memory. 
        First calculates Q value for the next state and choose action with the biggest value.
        Target value is calculated according to:
                Q(s,a) := (r + gamma * max_a(Q(s', a)))
        except the situation when the next action is the last action, in such case Q(s, a) := r.
        In order to change only those weights responsible for chosing given action, the rest values should be those
        returned by the network for state state.
        The network should be trained on batch_size samples.
        """

        sample_idx = np.random.choice(len(self.memory), size=batch_size, replace=False)
        batch_list = [self.memory[idx] for idx in sample_idx]

        for sample in batch_list:
            state, action, reward, next_state, done = sample

            if done:
                target = reward
            else:
                target = reward + self.gamma * np.amax(self.model(next_state, training=False).numpy())

            prediction = self.model(state, training=False).numpy()
            prediction[0][action] = target
            self.model(state, prediction)

    def update_epsilon_value(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLake*, warstwa wejściowa powinna mieć tyle neuronów ile jest możlliwych stanów, warstwa wyjściowa tyle neuronów ile jest możliwych akcji do wykonania:

In [16]:
env = frozenLake("8x8")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.00025

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(64, input_shape=(state_size,), activation='relu'))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(action_size, activation='linear'))
model.compile(loss=tf.keras.losses.mae, optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_rate), metrics=['accuracy'])#, run_eagerly=True)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 64)                4160      
                                                                 
 dense_7 (Dense)             (None, 32)                2080      
                                                                 
 dense_8 (Dense)             (None, 4)                 132       
                                                                 
Total params: 6,372
Trainable params: 6,372
Non-trainable params: 0
_________________________________________________________________


 Czas nauczyć agenta poruszania się po środowisku *FrozenLake*, jako stan przyjmij wektor o liczbie elementów równej liczbie możliwych stanów, z wartością 1 ustawioną w komórce o indeksie równym aktualnemu stanowi, pozostałe elementy mają być wypełnione zerami:
* 1 pkt < 35 epok,
* 0.5 pkt < 60 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [17]:
agent = DQNAgent(action_size, learning_rate, model, frozenLake("8x8"))

agent.epsilon = 1
batch_size = 50
EPISODES = 100
for e in range(EPISODES):
    summary = []
    pbar = tqdm(range(100))
    for _ in pbar:
        total_reward = 0
        env_state = env.reset()

        state = np.zeros((1, env.get_number_of_states()))
        state[0, env._current_state] = 1
        state = tf.convert_to_tensor(state, dtype=tf.float32)

        while True:
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward
            next_state = np.zeros((1, env.get_number_of_states()))
            next_state[0, env._current_state] = 1
            next_state = tf.convert_to_tensor(next_state, dtype=tf.float32)

            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
            agent.update_epsilon_value()

        summary.append(total_reward)
        pbar.set_description(f'training epoch')
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    agent.model.save_weights('test')
    if np.mean(summary) > 0.9:
        print ("You Win!")
        break

training epoch: 100%|██████████| 100/100 [00:19<00:00,  5.18it/s]


epoch #0	mean reward = 0.000	epsilon = 0.952


training epoch: 100%|██████████| 100/100 [00:19<00:00,  5.23it/s]


epoch #1	mean reward = 0.010	epsilon = 0.905


training epoch: 100%|██████████| 100/100 [00:19<00:00,  5.14it/s]


epoch #2	mean reward = 0.010	epsilon = 0.861


training epoch: 100%|██████████| 100/100 [00:19<00:00,  5.13it/s]


epoch #3	mean reward = 0.010	epsilon = 0.819


training epoch: 100%|██████████| 100/100 [00:20<00:00,  4.88it/s]


epoch #4	mean reward = 0.010	epsilon = 0.779


training epoch: 100%|██████████| 100/100 [00:19<00:00,  5.01it/s]


epoch #5	mean reward = 0.030	epsilon = 0.741


training epoch: 100%|██████████| 100/100 [00:20<00:00,  4.81it/s]


epoch #6	mean reward = 0.000	epsilon = 0.705


training epoch: 100%|██████████| 100/100 [00:20<00:00,  4.77it/s]


epoch #7	mean reward = 0.010	epsilon = 0.671


training epoch: 100%|██████████| 100/100 [00:22<00:00,  4.43it/s]


epoch #8	mean reward = 0.030	epsilon = 0.638


training epoch: 100%|██████████| 100/100 [00:21<00:00,  4.58it/s]


epoch #9	mean reward = 0.010	epsilon = 0.607


training epoch: 100%|██████████| 100/100 [00:22<00:00,  4.35it/s]


epoch #10	mean reward = 0.050	epsilon = 0.577


training epoch: 100%|██████████| 100/100 [00:24<00:00,  4.04it/s]


epoch #11	mean reward = 0.020	epsilon = 0.549


training epoch: 100%|██████████| 100/100 [00:25<00:00,  3.91it/s]


epoch #12	mean reward = 0.050	epsilon = 0.522


training epoch: 100%|██████████| 100/100 [00:25<00:00,  3.95it/s]


epoch #13	mean reward = 0.040	epsilon = 0.497


training epoch: 100%|██████████| 100/100 [00:27<00:00,  3.68it/s]


epoch #14	mean reward = 0.050	epsilon = 0.473


training epoch: 100%|██████████| 100/100 [00:26<00:00,  3.78it/s]


epoch #15	mean reward = 0.050	epsilon = 0.449


training epoch: 100%|██████████| 100/100 [00:29<00:00,  3.37it/s]


epoch #16	mean reward = 0.080	epsilon = 0.428


training epoch: 100%|██████████| 100/100 [00:32<00:00,  3.08it/s]


epoch #17	mean reward = 0.140	epsilon = 0.407


training epoch: 100%|██████████| 100/100 [00:34<00:00,  2.88it/s]


epoch #18	mean reward = 0.110	epsilon = 0.387


training epoch: 100%|██████████| 100/100 [00:34<00:00,  2.91it/s]


epoch #19	mean reward = 0.070	epsilon = 0.368


training epoch: 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


epoch #20	mean reward = 0.070	epsilon = 0.350


training epoch: 100%|██████████| 100/100 [00:38<00:00,  2.62it/s]


epoch #21	mean reward = 0.060	epsilon = 0.333


training epoch: 100%|██████████| 100/100 [00:37<00:00,  2.67it/s]


epoch #22	mean reward = 0.090	epsilon = 0.317


training epoch: 100%|██████████| 100/100 [00:41<00:00,  2.41it/s]


epoch #23	mean reward = 0.070	epsilon = 0.301


training epoch: 100%|██████████| 100/100 [00:41<00:00,  2.42it/s]


epoch #24	mean reward = 0.150	epsilon = 0.287


training epoch: 100%|██████████| 100/100 [00:43<00:00,  2.28it/s]


epoch #25	mean reward = 0.110	epsilon = 0.273


training epoch: 100%|██████████| 100/100 [00:50<00:00,  1.97it/s]


epoch #26	mean reward = 0.150	epsilon = 0.259


training epoch: 100%|██████████| 100/100 [00:48<00:00,  2.05it/s]


epoch #27	mean reward = 0.110	epsilon = 0.247


training epoch: 100%|██████████| 100/100 [00:51<00:00,  1.94it/s]


epoch #28	mean reward = 0.110	epsilon = 0.235


training epoch: 100%|██████████| 100/100 [00:51<00:00,  1.95it/s]


epoch #29	mean reward = 0.060	epsilon = 0.223


training epoch: 100%|██████████| 100/100 [00:54<00:00,  1.84it/s]


epoch #30	mean reward = 0.110	epsilon = 0.212


training epoch: 100%|██████████| 100/100 [00:52<00:00,  1.90it/s]


epoch #31	mean reward = 0.160	epsilon = 0.202


training epoch: 100%|██████████| 100/100 [00:58<00:00,  1.70it/s]


epoch #32	mean reward = 0.130	epsilon = 0.192


training epoch: 100%|██████████| 100/100 [01:07<00:00,  1.48it/s]


epoch #33	mean reward = 0.160	epsilon = 0.183


training epoch: 100%|██████████| 100/100 [01:22<00:00,  1.22it/s]


epoch #34	mean reward = 0.100	epsilon = 0.174


training epoch: 100%|██████████| 100/100 [01:11<00:00,  1.40it/s]


epoch #35	mean reward = 0.100	epsilon = 0.165


training epoch: 100%|██████████| 100/100 [01:04<00:00,  1.54it/s]


epoch #36	mean reward = 0.180	epsilon = 0.157


training epoch: 100%|██████████| 100/100 [01:26<00:00,  1.16it/s]


epoch #37	mean reward = 0.130	epsilon = 0.150


training epoch: 100%|██████████| 100/100 [01:13<00:00,  1.36it/s]


epoch #38	mean reward = 0.140	epsilon = 0.142


training epoch: 100%|██████████| 100/100 [01:31<00:00,  1.10it/s]


epoch #39	mean reward = 0.180	epsilon = 0.135


training epoch: 100%|██████████| 100/100 [01:29<00:00,  1.11it/s]


epoch #40	mean reward = 0.120	epsilon = 0.129


training epoch: 100%|██████████| 100/100 [01:38<00:00,  1.01it/s]


epoch #41	mean reward = 0.150	epsilon = 0.122


training epoch: 100%|██████████| 100/100 [01:43<00:00,  1.03s/it]


epoch #42	mean reward = 0.170	epsilon = 0.116


training epoch: 100%|██████████| 100/100 [01:48<00:00,  1.09s/it]


epoch #43	mean reward = 0.200	epsilon = 0.111


training epoch:  28%|██▊       | 28/100 [00:22<00:56,  1.27it/s]


KeyboardInterrupt: 

In [19]:
agent.epsilon = 0.01
agent.model.load_weights('test')
# batch_size = 50
EPISODES = 100
for e in range(EPISODES):
    summary = []
    pbar = tqdm(range(100))
    for _ in pbar:
        total_reward = 0
        env_state = env.reset()

        state = np.zeros((1, env.get_number_of_states()))
        state[0, env._current_state] = 1
        state = tf.convert_to_tensor(state, dtype=tf.float32)

        while True:
            action = agent.get_best_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward
            next_state = np.zeros((1, env.get_number_of_states()))
            next_state[0, env._current_state] = 1
            next_state = tf.convert_to_tensor(next_state, dtype=tf.float32)

            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        summary.append(total_reward)
        pbar.set_description(f'training epoch')
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))
    if np.mean(summary) > 0.9:
        print ("You Win!")
        break

  0%|          | 0/100 [00:38<?, ?it/s]


KeyboardInterrupt: 

Czas przygotować model sieci, która będzie się uczyła poruszania po środowisku *FrozenLakeExtended*, tym razem stan nie jest określany poprzez pojedynczą liczbę, a przez 3 tablice:
* pierwsza zawierająca informacje o celu,
* druga zawierająca informacje o dziurach,
* trzecia zawierająca informację o położeniu gracza.

In [None]:
env = frozenLakeExtended("4x4")

state_size = env.get_number_of_states()
action_size = len(env.get_possible_actions(None))
learning_rate = 0.001

model =         
        #
        # INSERT CODE HERE to build network
        #

 Czas nauczyć agenta poruszania się po środowisku *FrozenLakeExtended*, jako stan przyjmij wektor składający się ze wszystkich trzech tablic (2 pkt.):

In [None]:
agent = DQNAgent(action_size, learning_rate, model)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 2000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        
        for time in range(1000):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #

            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        
        summary.append(total_reward)
    if np.mean(summary) > 0.9:
        print ("You Win!")
        break
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))

Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPool*](https://gym.openai.com/envs/CartPole-v0/):

In [None]:
env = gym.make("CartPole-v0").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

model =         
        #
        # INSERT CODE HERE to build network
        #

Czas nauczyć agenta gry w środowisku *CartPool*:
* 1 pkt < 10 epok,
* 0.5 pkt < 20 epok,
* 0.25 pkt - w pozostałych przypadkach.

In [None]:
agent = DQNAgent(action_size, learning_rate, model)

agent.epsilon = 0.75

done = False
batch_size = 64
EPISODES = 1000
counter = 0
for e in range(EPISODES):
    summary = []
    for _ in range(100):
        total_reward = 0
        env_state = env.reset()
    
        #
        # INSERT CODE HERE to prepare appropriate format of the state for network
        #
        
        for time in range(300):
            action = agent.get_action(state)
            next_state_env, reward, done, _ = env.step(action)
            total_reward += reward

            #
            # INSERT CODE HERE to prepare appropriate format of the next state for network
            #

            #add to experience memory
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        #
        # INSERT CODE HERE to train network if in the memory is more samples then size of the batch
        #
        
        summary.append(total_reward)
    if np.mean(summary) > 195:
        print ("You Win!")
        break
    print("epoch #{}\tmean reward = {:.3f}\tepsilon = {:.3f}".format(e, np.mean(summary), agent.epsilon))