In [1]:
import numpy as np
import gym
import random
import matplotlib.pyplot as plt
%matplotlib inline 
import tensorflow as tf
import tensorflow_probability as tfp
from collections import deque 
from tqdm import tqdm
import time

  for external in metadata.entry_points().get(self.group, []):


In [2]:
import os

In [3]:
class PolicyNet():
    """Small feed-forward policy network over a discrete action set.

    The Keras model maps an observation vector of length ``input_size``
    through one 64-unit ReLU layer to ``output_size`` linear outputs, which
    are interpreted as the logits of a categorical action distribution.
    """

    def __init__(self, input_size, output_size):
        """Build the underlying ``tf.keras.Sequential`` model.

        Args:
            input_size: Length of a single observation vector.
            output_size: Number of logits produced (one per action).
        """
        self.model = tf.keras.Sequential(
            layers=[
                tf.keras.Input(shape=(input_size,)),
                tf.keras.layers.Dense(64, activation="relu", name="relu_layer"),
                tf.keras.layers.Dense(output_size, activation="linear", name="linear_layer")
            ],
            name="policy")

    def action_distribution(self, observations):
        """Return the categorical action distribution for ``observations``.

        Args:
            observations: Batch of observations fed straight to the model
                (assumed to be shaped ``(batch, input_size)`` -- TODO confirm
                against callers).

        Returns:
            A ``tfp.distributions.Categorical`` parameterised by the model's
            output logits.
        """
        logits = self.model(observations)
        return tfp.distributions.Categorical(logits=logits)

    def sample_action(self, observations):
        """Sample one action per observation from the current policy.

        Args:
            observations: Batch of observations, as accepted by
                :meth:`action_distribution`.

        Returns:
            The sampled actions converted with ``.numpy()``.
        """
        return self.action_distribution(observations).sample().numpy()

    def sampel_action(self, observations):
        """Backward-compatible alias for :meth:`sample_action`.

        The original method name was misspelled; it is kept so existing
        callers continue to work unchanged.
        """
        return self.sample_action(observations)

In [4]:

class BaselineNet():
    """Regression network used as a baseline, with its own Adam optimizer.

    Same architecture as the policy model in this notebook: one 64-unit ReLU
    layer followed by a linear layer with ``output_size`` units.
    """

    def __init__(self, input_size, output_size):
        """Create the Keras model and the optimizer that trains it.

        Args:
            input_size: Length of a single observation vector.
            output_size: Number of units in the final linear layer.
        """
        stack = [
            tf.keras.Input(shape=(input_size,)),
            tf.keras.layers.Dense(64, activation="relu", name="relu_layer"),
            tf.keras.layers.Dense(output_size, activation="linear", name="linear_layer"),
        ]
        self.model = tf.keras.Sequential(layers=stack, name="baseline")
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)

    def forward(self, observations):
        """Run the model on ``observations`` and squeeze the result.

        ``tf.squeeze`` is called without an axis, so every size-1 dimension
        of the model output is removed.
        """
        return tf.squeeze(self.model(observations))

    def update(self, observations, target):
        """Take one optimizer step on the mean-squared error to ``target``.

        Args:
            observations: Inputs passed to :meth:`forward`.
            target: Regression targets compared against the predictions.
        """
        with tf.GradientTape() as tape:
            mse = tf.keras.losses.mean_squared_error(
                y_true=target, y_pred=self.forward(observations))
        weights = self.model.trainable_weights
        gradients = tape.gradient(mse, weights)
        self.optimizer.apply_gradients(zip(gradients, weights))

In [5]:
def export_plot(ys, ylabel, title, filename):
    """Plot ``ys`` against their index and save the figure to ``filename``.

    Args:
        ys: Sequence of values to plot; the x axis is ``range(len(ys))``.
        ylabel: Label for the y axis.
        title: Figure title.
        filename: Path passed to ``savefig``.
    """
    # Use an explicit figure/axes pair rather than pyplot's implicit
    # "current figure", so this function only touches the figure it created.
    fig, ax = plt.subplots()
    try:
        ax.plot(range(len(ys)), ys)
        ax.set_xlabel("Training Episode")
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        fig.savefig(filename)
    finally:
        # Close the figure even when plotting or saving raises, so failed
        # exports do not accumulate open figures in the kernel.
        plt.close(fig)

In [6]:
class PolicyGradient(object):
    """Monte-Carlo policy-gradient trainer with a learned baseline.

    Each training iteration collects roughly ``batch_size`` environment steps
    with the current policy, computes discounted returns per path, subtracts
    the baseline network's predictions to obtain advantages, then updates the
    baseline and the policy once each.
    """

    def __init__(self, env, num_iterations=300, batch_size=2000, max_ep_len=500, output_path="results/",
                 gamma=0.9, learning_rate=3e-2, render=True):
        """Set up the environment, hyper-parameters and networks.

        Args:
            env: Environment exposing ``observation_space.shape``,
                ``action_space.n``, ``reset()``, ``step()`` and ``render()``.
            num_iterations: Number of collect-and-update iterations in
                :meth:`train`.
            batch_size: Number of environment steps collected per iteration.
            max_ep_len: Maximum number of steps in a single episode.
            output_path: Directory where rewards and the plot are written;
                created if missing.
            gamma: Discount factor used by :meth:`get_returns`.
            learning_rate: Learning rate of the policy's Adam optimizer.
            render: Default for whether :meth:`play_games` calls
                ``env.render()`` on every step.
        """
        self.output_path = output_path
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_path, exist_ok=True)
        self.env = env
        self.observation_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n
        self.gamma = gamma
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.max_ep_len = max_ep_len
        self.render = render
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        self.policy_net = PolicyNet(input_size=self.observation_dim, output_size=self.action_dim)
        self.baseline_net = BaselineNet(input_size=self.observation_dim, output_size=1)

    def play_games(self, env=None, num_episodes=None, render=None):
        """Roll out the current policy and record the visited paths.

        Without ``num_episodes`` the rollout stops once ``batch_size`` steps
        have been collected, so the last path may be a truncated episode.
        With ``num_episodes`` it stops after that many episodes instead.

        Args:
            env: Environment to act in; defaults to ``self.env``.
            num_episodes: Optional number of episodes to play.
            render: Whether to call ``env.render()`` each step; defaults to
                ``self.render``.

        Returns:
            ``(paths, episode_rewards)``. ``paths`` is a list of dicts with
            ``"observation"``, ``"reward"`` and ``"action"`` arrays.
            ``episode_rewards`` holds the total reward of every episode that
            ended (``done`` or ``max_ep_len`` reached); an episode cut off by
            the batch-size limit contributes a path but no total.
        """
        if env is None:
            env = self.env
        if render is None:
            render = self.render

        episode = 0
        episode_rewards = []
        paths = []
        t = 0  # total number of environment steps taken in this call

        while num_episodes or t < self.batch_size:
            # NOTE(review): reset() is used as returning the observation and
            # step() as returning a 4-tuple; presumably the pre-0.26 gym API
            # -- confirm against the installed gym version.
            state = env.reset()
            states, actions, rewards = [], [], []
            episode_reward = 0

            for step in range(self.max_ep_len):
                if render:
                    env.render()
                states.append(state)
                action = self.policy_net.sampel_action(np.atleast_2d(state))[0]

                state, reward, done, _ = env.step(action)

                actions.append(action)
                rewards.append(reward)
                episode_reward += reward
                t += 1

                if done or step == self.max_ep_len - 1:
                    episode_rewards.append(episode_reward)
                    break
                # Batch is full: stop mid-episode (only in batch-size mode).
                if (not num_episodes) and t == self.batch_size:
                    break

            paths.append({"observation": np.array(states),
                          "reward": np.array(rewards),
                          "action": np.array(actions)})
            episode += 1
            if num_episodes and episode >= num_episodes:
                break
        return paths, episode_rewards

    def get_returns(self, paths):
        """Compute discounted returns for every step of every path.

        For each path, ``G_t = r_t + gamma * G_{t+1}`` with the return after
        the final recorded step taken as zero.

        Args:
            paths: List of path dicts as produced by :meth:`play_games`; only
                the ``"reward"`` entry is read.

        Returns:
            1-D float array with the returns of all paths concatenated in
            order (empty when ``paths`` is empty).
        """
        all_returns = []
        for path in paths:
            rewards = path["reward"]
            # Fill a preallocated array back-to-front instead of repeatedly
            # inserting at the head of a list, which is quadratic in length.
            returns = np.empty(len(rewards), dtype=np.float64)
            g_t = 0.0
            for i in range(len(rewards) - 1, -1, -1):
                g_t = rewards[i] + self.gamma * g_t
                returns[i] = g_t
            all_returns.append(returns)
        if not all_returns:
            # np.concatenate raises on an empty list.
            return np.zeros(0, dtype=np.float64)
        return np.concatenate(all_returns)

    def get_advantage(self, returns, observations):
        """Return standardized advantages ``returns - baseline(observations)``.

        The advantages are shifted to zero mean and scaled to unit standard
        deviation. A small epsilon in the denominator keeps the result finite
        when all advantages are identical.

        Args:
            returns: Array of returns, one per observation.
            observations: Observations passed to the baseline network.

        Returns:
            Array of normalized advantages with the same length as ``returns``.
        """
        values = self.baseline_net.forward(observations).numpy()
        advantages = returns - values
        return (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

    def update_policy(self, observations, actions, advantages):
        """Apply one policy-gradient step.

        Minimizes ``-mean(log_prob(action) * advantage)`` with respect to the
        policy model's trainable weights.

        Args:
            observations: Observations for every collected step.
            actions: Actions taken at those observations.
            advantages: Advantage estimate for every step.
        """
        observations = tf.convert_to_tensor(observations)
        actions = tf.convert_to_tensor(actions)
        advantages = tf.convert_to_tensor(advantages)
        with tf.GradientTape() as tape:
            log_prob = self.policy_net.action_distribution(observations).log_prob(actions)
            # Cast so the advantages match the float32 log-probabilities.
            loss = -tf.math.reduce_mean(log_prob * tf.cast(advantages, tf.float32))
        grads = tape.gradient(loss, self.policy_net.model.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.policy_net.model.trainable_weights))

    def train(self):
        """Run ``num_iterations`` collect/update iterations and save the results.

        After training, the per-iteration average episode reward is written
        to ``rewards.npy`` and plotted to ``rewards.png`` inside
        ``output_path``.
        """
        averaged_total_rewards = []
        for t in range(self.num_iterations):
            paths, total_rewards = self.play_games()
            observations = np.concatenate([path["observation"] for path in paths])
            actions = np.concatenate([path["action"] for path in paths])
            returns = self.get_returns(paths)
            # Advantages use the baseline as it was before this iteration's update.
            advantages = self.get_advantage(returns, observations)
            self.baseline_net.update(observations=observations, target=returns)
            self.update_policy(observations, actions, advantages)
            # No episode may have finished inside the batch; report NaN
            # explicitly instead of triggering numpy's empty-mean warning.
            avg_reward = np.mean(total_rewards) if total_rewards else float("nan")
            averaged_total_rewards.append(avg_reward)
            print("Average reward for batch {}: {:04.2f}".format(t,avg_reward))
        print("Training complete")
        # os.path.join keeps this correct when output_path has no trailing separator.
        np.save(os.path.join(self.output_path, "rewards.npy"), averaged_total_rewards)
        export_plot(averaged_total_rewards, "Reward", "CartPole-v1",
                    os.path.join(self.output_path, "rewards.png"))
            
            

In [7]:
# Seed the random number generators before any network is built, so weight
# initialisation and action sampling do not vary between runs.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

env = gym.make("CartPole-v1")
# NOTE(review): env.seed() exists only in older gym releases, so it is guarded;
# on versions without it the environment's initial states stay unseeded.
if hasattr(env, "seed"):
    env.seed(SEED)
env.reset()
model = PolicyGradient(env, num_iterations=100)
model.train()

Average reward for batch 0: 21.97
Average reward for batch 1: 35.84
Average reward for batch 2: 45.14
Average reward for batch 3: 58.53
Average reward for batch 4: 88.95
Average reward for batch 5: 98.00
Average reward for batch 6: 118.88
Average reward for batch 7: 177.09
Average reward for batch 8: 162.17
Average reward for batch 9: 188.20
Average reward for batch 10: 192.20
Average reward for batch 11: 260.71
Average reward for batch 12: 306.83
Average reward for batch 13: 383.75
Average reward for batch 14: 231.88
Average reward for batch 15: 197.10
Average reward for batch 16: 155.50
Average reward for batch 17: 117.50
Average reward for batch 18: 125.47
Average reward for batch 19: 158.83
Average reward for batch 20: 189.30
Average reward for batch 21: 230.38
Average reward for batch 22: 248.62
Average reward for batch 23: 222.00
Average reward for batch 24: 212.33
Average reward for batch 25: 282.00
Average reward for batch 26: 500.00
Average reward for batch 27: 500.00
Average 

In [8]:
# Training is finished: close the environment (rollouts call env.render(), so
# this is presumably what releases the render window -- confirm).
env.close()

In [10]:
# Save the trained policy network's Keras model under "REINFORCE_cartpole_policy".
model.policy_net.model.save("REINFORCE_cartpole_policy")

INFO:tensorflow:Assets written to: REINFORCE_cartpole_policy\assets
