# Laboratorium 6

Celem szóstego laboratorium jest zapoznanie się z algorytmem głębokiego uczenia aktywnego – REINFORCE – oraz jego zaimplementowanie. Zaimplementowany algorytm będzie testowany z wykorzystaniem środowiska *CartPole* z biblioteki OpenAI Gym.


Dołączenie standardowych bibliotek

In [1]:
import gym
import numpy as np
from tqdm import tqdm

Dołączenie bibliotek do obsługi sieci neuronowych

In [2]:
import tensorflow as tf

## Zadanie 1 - REINFORCE

<p style='text-align: justify;'>
Celem ćwiczenia jest zaimplementowanie algorytmu REINFORCE. Wagi sieci aktualizowane są zgodnie ze wzorem:
\begin{equation*}
    \theta \leftarrow \theta + \alpha G_t \nabla_\theta \log \pi_{\theta}(a_t \mid s_t)
\end{equation*}
</p>

Czas przygotować model sieci, która będzie się uczyła działania w środowisku [*CartPole*](https://gym.openai.com/envs/CartPole-v0/):

In [3]:
class REINFORCEAgent:
    """Monte-Carlo policy-gradient (REINFORCE) agent.

    The agent records one episode of (state, action, reward) triples via
    ``remember`` and then performs a single gradient step over the whole
    episode in ``replay``.

    ``model`` must provide ``predict_on_batch``, be callable on a state
    tensor, and expose ``trainable_variables`` and ``optimizer`` (all of
    which are used by the methods below). Its output row is treated as a
    probability distribution over ``action_size`` actions.
    """

    def __init__(self, state_size, action_size, model):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99    # discount rate
        # Kept for backward compatibility; no method of this class reads it.
        # The step size actually applied comes from ``model.optimizer``.
        self.learning_rate = 0.001
        self.model = model
        # Per-episode buffers; entries at the same index belong to one step.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

    def remember(self, state, action, reward):
        """Append one environment step to the episode buffers."""
        self.state_memory.append(state)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def get_cumulative_rewards(self):
        """
        Return the discounted return G_t for every stored reward.

        based on https://github.com/yandexdataschool/Practical_RL/blob/spring20/week06_policy_based/reinforce_tensorflow.ipynb

        G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...

        Computed by walking the rewards from the last step to the first and
        applying G_t = r_t + gamma*G_{t+1}.

        Returns:
            A float32 tensor with one element per stored reward. An empty
            reward memory yields an empty tensor instead of raising.
        """
        discounted = []
        running_return = 0.0
        for reward in reversed(self.reward_memory):
            running_return = reward + self.gamma * running_return
            discounted.append(running_return)
        # Values were collected back-to-front; restore chronological order.
        discounted.reverse()
        return tf.convert_to_tensor(discounted, dtype=tf.float32)

    def get_action(self, state):
        """
        Sample an action from the policy the network outputs for ``state``.

        Only row 0 of the prediction is used, so ``state`` is expected to be
        a single-sample batch.

        Returns:
            An action index in ``range(self.action_size)``, drawn with the
            probabilities produced by the model.
        """
        prediction = self.model.predict_on_batch(state)
        # Use the agent's own action count rather than a module-level global,
        # so the class works outside the notebook namespace it was written in.
        return np.random.choice(np.arange(self.action_size), p=prediction[0])

    def replay(self):
        """
        Train the network on the stored episode, then clear the buffers.

        Minimises ``sum_t -G_t * log pi(a_t | s_t)``, whose gradient step is
        the REINFORCE update. Does nothing when no step has been stored.
        """
        if not self.reward_memory:
            # Nothing to learn from; avoids differentiating a constant loss.
            return

        cumulative_rewards = self.get_cumulative_rewards()

        with tf.GradientTape() as tape:
            loss = 0
            steps = zip(cumulative_rewards, self.state_memory, self.action_memory)
            for g, state, action in steps:
                state = tf.convert_to_tensor(state, dtype=tf.float32)

                probabilities = self.model(state)
                # Row 0: each stored state is a single-sample batch.
                log_prob = tf.math.log(probabilities[0, action])
                loss += -g * log_prob

        gradient = tape.gradient(loss, self.model.trainable_variables)
        self.model.optimizer.apply_gradients(zip(gradient, self.model.trainable_variables))

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

In [4]:
# Environment: `.env` takes the environment wrapped by the object that
# gym.make returns (presumably to drop the built-in episode step limit —
# TODO confirm against the installed gym version).
env = gym.make("CartPole-v1").env
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
learning_rate = 0.001

# Policy network: two hidden ReLU layers and a softmax head that yields one
# probability per action.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, input_shape=(state_size,), activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(action_size, activation='softmax'),
])
# REINFORCEAgent.replay builds its own loss and only uses the optimizer
# attached here; the MSE loss is passed to compile() but not used by replay.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.mse,
    run_eagerly=True,
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               640       
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dense_2 (Dense)             (None, 2)                 258       
                                                                 
Total params: 17,410
Trainable params: 17,410
Non-trainable params: 0
_________________________________________________________________


Przygotuj funkcję obliczającą wartość nagrody skumulowanej:

In [5]:
# def get_cumulative_rewards(rewards,  # rewards at each step
#                            gamma=0.99,  # discount for reward
#                            ):
#     """
#     based on https://github.com/yandexdataschool/Practical_RL/blob/spring20/week06_policy_based/reinforce_tensorflow.ipynb
#     take a list of immediate rewards r(s,a) for the whole session
#     compute cumulative rewards R(s,a) (a.k.a. G(s,a) in Sutton '16)
#     R_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...
#
#     The simple way to compute cumulative rewards is to iterate from last to first time tick
#     and compute R_t = r_t + gamma*R_{t+1} recurrently
#
#     You must return an array/list of cumulative rewards with as many elements as in the initial rewards.
#     """
#
#     n_rewards = len(rewards)
#     R = [0] * n_rewards
#     R[-1] = rewards[-1]
#     for t in range(n_rewards - 2, -1, -1):
#         R[t] = rewards[t] + gamma * R[t+1]
#     return R
#
# get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9)
# assert len(get_cumulative_rewards(range(100))) == 100
# assert np.allclose(get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9),
#                    [1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])
# assert np.allclose(get_cumulative_rewards([0, 0, 1, -2, 3, -4, 0], gamma=0.5),
#                    [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0])
# assert np.allclose(get_cumulative_rewards([0, 0, 1, 2, 3, 4, 0], gamma=0), [0, 0, 1, 2, 3, 4, 0])
# assert np.allclose(get_cumulative_rewards(tf.convert_to_tensor([0, 0, 1, 2, 3, 4, 0], dtype=tf.float32), gamma=0), tf.convert_to_tensor([0, 0, 1, 2, 3, 4, 0], dtype=tf.float32))

Czas nauczyć agenta gry w środowisku *CartPole*:

In [6]:
# Single agent instance shared by every training session below; it wraps the
# policy network built above.
agent = REINFORCEAgent(
    state_size=state_size,
    action_size=action_size,
    model=model,
)


def generate_session(t_max=1000):
    """Play one episode with the REINFORCE agent and train on it afterwards.

    Uses the module-level ``env`` and ``agent``. Every step is stored in the
    agent's memory; ``agent.replay()`` is called once after the episode.

    Args:
        t_max: Maximum number of environment steps for this episode.

    Returns:
        The undiscounted sum of rewards collected during the episode.
    """
    total_reward = 0

    observation = env.reset()[0]
    # The agent expects a single-sample batch, hence the added leading axis.
    state = tf.convert_to_tensor(observation[np.newaxis, :], dtype=tf.float32)
    for _ in range(t_max):
        action = agent.get_action(state)
        observation, step_reward, terminated, truncated, _ = env.step(action)
        agent.remember(state, action, step_reward)

        total_reward += step_reward

        # An episode ends either on a terminal state or when the environment
        # truncates it; honouring both avoids stepping a finished episode.
        if terminated or truncated:
            break

        state = tf.convert_to_tensor(observation[np.newaxis, :], dtype=tf.float32)

    agent.replay()

    return total_reward


# Train for up to 100 epochs of 100 sessions each; stop early once the mean
# session reward of an epoch exceeds 300.
for i in range(100):

    rewards = []
    for _ in tqdm(range(100)):
        rewards.append(generate_session())  # each session also trains the agent

    mean_reward = np.mean(rewards)
    print(f"mean reward: {mean_reward.round(3)}, epoch: {i}")

    if mean_reward > 300:
        print("You Win!")
        break

  if not isinstance(terminated, (bool, np.bool8)):
100%|██████████| 100/100 [01:13<00:00,  1.36it/s]


mean reward: 74.44, epoch: 0


100%|██████████| 100/100 [04:35<00:00,  2.76s/it]


mean reward: 283.11, epoch: 1


100%|██████████| 100/100 [02:42<00:00,  1.62s/it]


mean reward: 166.19, epoch: 2


100%|██████████| 100/100 [07:51<00:00,  4.71s/it]

mean reward: 478.59, epoch: 3
You Win!



