https://towardsdatascience.com/policy-gradient-reinforce-algorithm-with-baseline-e95ace11c1c4

In [3]:
import gym
import tensorflow as tf
import keras
from keras import layers
import numpy as np
import tensorflow_probability as tfp
import os
import matplotlib.pyplot as plt

def export_plot(ys, ylabel, title, filename):
    plt.figure()
    plt.plot(range(len(ys)), ys)
    plt.xlabel("Training Episode")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.savefig(filename)
    plt.close()

In [10]:
class PolicyNet():
    def __init__(self, input_size, output_size):
        self.model = keras.Sequential(
            layers=[
                keras.Input(shape=(input_size,)),
                layers.Dense(64, activation="relu", name="relu_layer"),
                layers.Dense(output_size, activation="linear", name="linear_layer")
            ],
            name="policy")

    def action_distribution(self, observations):
        logtis = self.model(observations)
        return tfp.distributions.Categorical(logits=logtis) # softmax

    def sample_action(self, observartions):
        sample_actions = self.action_distribution(observartions).sample().numpy()
        return sample_actions

    def save_policy_net(self):
        self.model.save("D:\RL\policy-gradient\results")

class BaselineNet():
    def __init__(self, input_size, output_size):
        self.model = keras.Sequential(
            layers=[
                keras.Input(shape=(input_size,)),
                layers.Dense(64, activation="relu", name="relu_layer"),
                layers.Dense(output_size, activation="linear", name="linear_layer")
            ],
            name="baseline")

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=3e-2)

    # predict V(s)
    def forward(self, observations):
        output = tf.squeeze(self.model(observations))
        return output

    def update(self, observartions, target):
        with tf.GradientTape() as tape:
            predictions = self.forward(observartions)
            loss = tf.keras.losses.mean_squared_error(y_true=target, y_pred=predictions)

        # backprob
        grads = tape.gradient(loss, self.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))

    def save_baseline_net(self):
        self.model.save("D:\RL\policy-gradient\results")

class PolicyGradient(object):
    def __init__(self, env, num_iterations=300, batch_size=2000, max_ep_len=200, output_path="../results/"):
        self.output_path = output_path
        if not os.path.exists(output_path):
            os.makedirs(output_path)

        self.env = env
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n
        self.gamma = 0.9
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.max_ep_len = max_ep_len
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=3e-2)
        self.policy_net = PolicyNet(input_size=self.observation_dim, output_size=self.action_dim) # output action probability distribution
        self.baseline_net = BaselineNet(input_size=self.observation_dim, output_size=1) # output V(s)

    def play_games(self, env=None, num_episodes=None):
        episode = 0
        episode_rewards = []
        paths = []
        t = 0
        if not env:
            env = self.env

        while (num_episodes or t < self.batch_size):
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0

            for step in range(self.max_ep_len):
                states.append(state)
                action = self.policy_net.sample_action(np.atleast_2d(state))[0] # creating batch
                state, reward, done, _ = env.step(action)
                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                t += 1

                if (done or step == self.max_ep_len-1):
                    episode_rewards.append(episode_reward)
                    break
                if (not num_episodes) and t == self.batch_size:
                    break

            # each episode's path
            path = {"observation": np.array(states),
                    "reward": np.array(rewards),
                    "action": np.array(actions)}
            paths.append(path)
            episode += 1
            if num_episodes and episode >= num_episodes:
                break

        return paths, episode_rewards # paths is the transitions of all episodes

    def get_returns(self, paths):
        all_returns = []
        for path in paths:
            rewards = path["reward"] # a list of rewards in an episode
            returns = []
            reversed_rewards = np.flip(rewards,0)
            g_t = 0

            # discounted return of an episode
            for r in reversed_rewards:
                g_t = r + self.gamma * g_t
                returns.insert(0, g_t)

            # discounted return of all episodes
            all_returns.append(returns)

        returns = np.concatenate(all_returns)
        return returns

    def get_advantage(self, returns, observations):
        values = self.baseline_net.forward(observations).numpy() # calculate b
        advantages = returns - values # G - b

        # normalize
        advantages = (advantages-np.mean(advantages)) / np.sqrt(np.sum(advantages**2))
        return advantages

    def update_policy(self, observations, actions, advantages):
        observations = tf.convert_to_tensor(observations)
        actions = tf.convert_to_tensor(actions)
        advantages = tf.convert_to_tensor(advantages)

        with tf.GradientTape() as tape:
            log_prob = self.policy_net.action_distribution(observations).log_prob(actions)
            loss = -tf.math.reduce_mean(log_prob * tf.cast(advantages, tf.float32)) # mean of batch

        # backprob
        grads = tape.gradient(loss, self.policy_net.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.policy_net.model.trainable_weights))

    def train(self):
        all_total_rewards = []
        averaged_total_rewards = []

        for t in range(self.num_iterations):
            paths, total_rewards = self.play_games()
            all_total_rewards.extend(total_rewards)
            observations = np.concatenate([path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            returns = self.get_returns(paths)
            advantages = self.get_advantage(returns, observations)
            self.baseline_net.update(observations, returns)
            self.update_policy(observations, actions, advantages)
            avg_reward = np.mean(total_rewards)
            averaged_total_rewards.append(avg_reward)
            print("Average reward for batch {}: {:04.2f}".format(t,avg_reward))

        print("Training complete")
        np.save(self.output_path+ "rewards.npy", averaged_total_rewards)
        export_plot(averaged_total_rewards, "Reward", "CartPole-v0", self.output_path + "rewards.png")

In [12]:
env = gym.make("CartPole-v0")
model = PolicyGradient(env)
model.train()

Average reward for batch 0: 18.64
Average reward for batch 1: 31.22
Average reward for batch 2: 44.09
Average reward for batch 3: 52.16
Average reward for batch 4: 64.19
Average reward for batch 5: 72.11
Average reward for batch 6: 142.08
Average reward for batch 7: 144.85
Average reward for batch 8: 181.36
Average reward for batch 9: 185.30
Average reward for batch 10: 173.82
Average reward for batch 11: 141.69
Average reward for batch 12: 164.17
Average reward for batch 13: 196.50
Average reward for batch 14: 191.00
Average reward for batch 15: 196.60
Average reward for batch 16: 195.50
Average reward for batch 17: 180.36
Average reward for batch 18: 200.00
Average reward for batch 19: 176.18
Average reward for batch 20: 197.00
Average reward for batch 21: 193.50
Average reward for batch 22: 194.80
Average reward for batch 23: 168.82
Average reward for batch 24: 192.70
Average reward for batch 25: 193.90
Average reward for batch 26: 200.00
Average reward for batch 27: 200.00
Average 

In [None]:
state = env.reset()
done = False

while not done:
    action = model.policy_net.sample_action(np.atleast_2d(state))[0]
    state, reward, done, _ = env.step(action)
    env.render()

env.close()

In [11]:
model.policy_net.save_policy_net()
model.baseline_net.save_baseline_net()

AttributeError: 'PolicyNet' object has no attribute 'save_policy_net'