# Policy Gradient Setup

In the following we will introduce our policy gradient setup. It consists of:

* Network
* Agent
* The script to run the training.

## Network

The network is a self-contained component in this setup: it holds only the neural network and its compiled training and evaluation functions.

In [None]:
import lasagne
import theano
import theano.tensor as T
from lasagne.init import Constant
from lasagne.layers import InputLayer, DenseLayer, Conv2DLayer, set_all_param_values
from lasagne.nonlinearities import softmax, rectify
import numpy as np


class Network(object):
    """
    Convolutional softmax policy network with compiled Theano train/eval functions.

    The constructor builds the Lasagne layers, optionally loads stored weights,
    defines the policy-gradient loss and compiles two functions:

    * ``f_train(states, actions, advantages, learning_rate)`` performs one Adam
      update and returns the loss.
    * ``f_eval(states)`` returns the action probabilities for each state.

    :param resolution: indexable whose entries 0 and 1 are the frame dimensions
        before cropping (further entries are ignored).
    :param number_of_outputs: number of units in the softmax output layer.
    :param cropping: four values; the first two are subtracted from
        ``resolution[0]`` and the last two from ``resolution[1]`` to obtain the
        input shape of the network.
    :param weights_file: optional path passed to ``np.load``; the loaded values
        are installed as the parameters of all layers.
    """

    def __init__(self, resolution, number_of_outputs, cropping, weights_file=None):

        # symbolic variables for state, action, and advantage
        self.sym_state = T.tensor4()
        self.sym_action = T.vector("Actions", dtype="int32")
        # Advantages are real-valued (return minus baseline). With an integer
        # dtype, allow_input_downcast=True would silently cast them to integers
        # before they enter the loss, so use the configured float type instead.
        self.sym_advantage = T.vector("Advantages", dtype=theano.config.floatX)
        self.sym_r = T.vector()
        self.sym_q2 = T.vector()
        self.shape = (None, 1,
                      (resolution[0] - cropping[0] - cropping[1]),
                      (resolution[1] - cropping[2] - cropping[3]))

        self.cropping = cropping

        # Same policy network as Deep Q
        l_in = InputLayer(shape=self.shape, input_var=self.sym_state)
        l_conv1 = Conv2DLayer(l_in, num_filters=16, filter_size=[8, 8], nonlinearity=rectify, stride=4)
        l_conv2 = Conv2DLayer(l_conv1, num_filters=32, filter_size=[4, 4], nonlinearity=rectify, stride=2)
        l_hid1 = DenseLayer(l_conv2, num_units=256, nonlinearity=rectify)
        self.l_out = DenseLayer(incoming=l_hid1, W=Constant(1), num_units=number_of_outputs, nonlinearity=softmax,
                                name='outputlayer')

        if weights_file:
            # NOTE(review): np.load only reads pickled parameter lists when
            # pickles are allowed by the installed numpy version - confirm that
            # this matches how the weight files are written.
            set_all_param_values(self.l_out, np.load(str(weights_file)))

        # get network output
        eval_out = lasagne.layers.get_output(self.l_out, {l_in: self.sym_state}, deterministic=True)

        # get total number of timesteps
        total_timesteps = self.sym_state.shape[0]

        # loss function that we'll differentiate to get the policy gradient:
        # the negative advantage-weighted log-probability of the taken actions,
        # averaged over all timesteps in the batch
        loss = -T.log(eval_out[T.arange(total_timesteps), self.sym_action]).dot(self.sym_advantage) / total_timesteps

        # learning_rate is a function input so it can be changed between updates
        learning_rate = T.fscalar()

        # get trainable parameters in the network.
        params = lasagne.layers.get_all_params(self.l_out, trainable=True)

        # get gradients
        grads = T.grad(loss, params)

        # update function
        updates = lasagne.updates.adam(grads, params, learning_rate=learning_rate)

        print("Compiling the network ...")
        self.f_train = theano.function(
            [
                self.sym_state,
                self.sym_action,
                self.sym_advantage,
                learning_rate
            ],
            loss,
            updates=updates,
            allow_input_downcast=True
        )
        self.f_eval = theano.function([self.sym_state], eval_out, allow_input_downcast=True)
        print("Network compiled.")

    def train(self, all_states, all_actions, all_advantages, learning_rate):
        """Run one update step on the given batch and return the loss."""
        return self.f_train(all_states, all_actions, all_advantages, learning_rate)

    def evaluate(self, state):
        """Return the action probabilities of the network for a batch of states."""
        return self.f_eval(state)


## Agent

The policy agent is shown below; it is heavily inspired by the agent given in class.

In [None]:
import numpy as np
from random import random, randint
from lasagne.layers import get_all_param_values
import pickle


class AgentPolicy(object):
    """
    Policy-gradient agent: plays an environment, collects trajectories and
    trains the given network on the resulting advantages.

    :param environment: object providing ``reset()``, ``step(action)``,
        ``render()`` and ``spec.timestep_limit``.
    :param network: object providing ``train``, ``evaluate``, ``shape``,
        ``cropping`` and ``l_out``.
    """

    def __init__(self, environment, network):
        self.environment = environment
        self.network = network

    def learn(self,
              epochs=100,
              states_per_batch=10000,
              time_limit=None,
              learning_rate=0.01,
              discount_factor=1.0,
              early_stop=None):
        """
        Learn the given environment by the policy gradient method.

        Each epoch collects at least ``states_per_batch`` timesteps, performs
        one network update, plays one deterministic validation game and stores
        the current (and, if improved, the best) weights. Losses are kept in
        ``self.loss``; mean rewards in ``self.mean_train_rs`` and
        ``self.mean_val_rs``.

        :param epochs: number of update steps.
        :param states_per_batch: minimum number of timesteps per update.
        :param time_limit: maximum timesteps per game; a falsy value falls back
            to the environment's own limit.
        :param learning_rate: learning rate passed to the network update.
        :param discount_factor: discount applied to future rewards.
        :param early_stop: if set, stop once the last ``early_stop`` validation
            rewards are all identical.
        """
        self.mean_train_rs = mean_train_rs = []
        self.mean_val_rs = mean_val_rs = []
        self.loss = []
        self.epochs = epochs
        self.epoch = 1
        # time_limit is formatted with %s because its default is None
        print("Start training using %d epochs, %d states per batch, %s timelimit, %1.5f learning rate" % (
            epochs, states_per_batch, time_limit, learning_rate))

        best_result = float("-inf")  # any real validation reward is better

        for epoch in range(epochs):
            self.epoch = epoch
            # 1. collect trajectories until we have at least states_per_batch total timesteps
            trajectories = []
            total_timesteps = 0
            total_games = 0

            while total_timesteps < states_per_batch:
                print("%s / %s" % (total_timesteps, states_per_batch))
                trajectory = self.get_trajectory(epoch, epochs, time_limit, deterministic=False)
                trajectories.append(trajectory)
                total_timesteps += len(trajectory["reward"])
                total_games += 1

            all_states = np.concatenate([trajectory["state"] for trajectory in trajectories])

            # 2. compute cumulative discounted rewards (returns)
            rewards = [self._cumulative_discount(trajectory["reward"], discount_factor) for trajectory in trajectories]
            maxlen = max(len(reward) for reward in rewards)
            padded_rewards = [np.concatenate([reward, np.zeros(maxlen - len(reward))]) for reward in rewards]

            # 3. compute time-dependent baseline
            baseline = np.mean(padded_rewards, axis=0)

            # 4. compute advantages
            advs = [reward - baseline[:len(reward)] for reward in rewards]
            all_actions = np.concatenate([trajectory["action"] for trajectory in trajectories])
            all_advantages = np.concatenate(advs)

            print('Updating network...')
            # 5. do policy gradient update step
            loss = self.network.train(all_states, all_actions, all_advantages, learning_rate)

            train_rs = np.array([trajectory["reward"].sum() for trajectory in trajectories])  # trajectory total rewards

            # The result files are opened with "w", so they only hold the latest epoch.
            print("Saving training results...")
            with open("train_results.txt", "w") as train_result_file:
                train_result_file.write(str((train_rs.mean())))

            print("\nTesting...")
            # compute validation reward from a single deterministic game
            val_reward = np.array(
                [self.get_trajectory(epoch, epochs, time_limit, deterministic=True, render=False)['reward'].sum()
                 for _ in range(1)]
            )

            # update stats
            mean_train_rs.append(train_rs.mean())
            mean_val_rs.append(val_reward.mean())
            self.loss.append(loss)

            if val_reward.max() > best_result:
                print("New best result. Storing weights.")
                best_result = val_reward.max()
                self._save_weights('best_weights.dump')

            print("Saving test results...")
            with open("test_results.txt", "w") as test_result_file:
                test_result_file.write(str((val_reward.mean())))

            print("Saving the network weights...")
            self._save_weights('weights.dump')

            # print stats
            print('%3d mean_train_r: %6.2f mean_val_r: %6.2f loss: %f games played: %3d' % (
                epoch + 1, train_rs.mean(), val_reward.mean(), loss, total_games))

            # check for early stopping: true if the validation reward has not changed in early_stop epochs
            if early_stop and len(mean_val_rs) >= early_stop and \
                    all([x == mean_val_rs[-1] for x in mean_val_rs[-early_stop:-1]]):
                break

    def _save_weights(self, path):
        """Pickle all parameter values of the network's output layer stack to ``path``."""
        # binary mode and a context manager: pickles are bytes, and the handle must be closed
        with open(path, "wb") as weights_file:
            pickle.dump(get_all_param_values(self.network.l_out), weights_file)

    def get_trajectory(self, epoch, epochs, time_limit=None, deterministic=True, render=False):
        """
        Play one game by iteratively evaluating the agent policy on the environment.

        :return: dict with numpy arrays ``'state'`` (the preprocessed state each
            action was chosen in), ``'action'`` and ``'reward'``, one entry per
            timestep.
        """
        time_limit = time_limit or self.environment.spec.timestep_limit

        # Preprocess the initial frame so it has the shape the network expects
        current_state = self._state_reshape(self.environment.reset())

        trajectory = {'state': [], 'action': [], 'reward': []}

        for _ in range(time_limit):
            action = self.get_action(epoch, epochs, current_state, deterministic)
            # network outputs are shifted by one to obtain the environment action id
            (next_frame, reward, done, _info) = self.environment.step(action + 1)

            if render:
                self.environment.render()

            # store the state the action was actually taken in
            trajectory['state'].append(current_state)
            trajectory['action'].append(action)
            trajectory['reward'].append(reward)

            if done:
                break

            # advance: the next action must be chosen from the new observation
            current_state = self._state_reshape(next_frame)

        return {'state': np.array(trajectory['state']),
                'action': np.array(trajectory['action']),
                'reward': np.array(trajectory['reward'])}

    def get_action(self, epoch, epochs, state, deterministic=True):
        """
        Evaluate the agent policy to choose an action, a, given state, s.

        In deterministic mode the most probable action is returned. Otherwise a
        uniformly random action is returned with the current exploration rate
        and the most probable action the rest of the time.
        """
        if not deterministic and random() <= self.exploration_rate(epoch, epochs):
            # NOTE(review): hard-coded for a network with three outputs - confirm
            return randint(0, 2)

        # Choose the best action according to the network.
        action_probabilities = self.network.evaluate(np.expand_dims(state, 0))
        return action_probabilities.argmax()

    def exploration_rate(self, epoch, epochs):
        """Return the exploration rate for ``epoch`` out of ``epochs`` epochs."""
        start_eps = 1.0
        end_eps = 0.1
        const_eps_epochs = 0.01 * epochs  # first 1% of the epochs: constant start_eps
        eps_decay_epochs = 0.9 * epochs  # decay runs until 90% of the epochs

        if epoch < const_eps_epochs:
            return start_eps
        elif epoch < eps_decay_epochs:
            # Linear decay from start_eps to end_eps
            return start_eps - (epoch - const_eps_epochs) / \
                               (eps_decay_epochs - const_eps_epochs) * (start_eps - end_eps)
        else:
            return end_eps

    def _cumulative_discount(self, reward, gamma):
        """
        Compute the cumulative discounted rewards.

        Entry ``i`` of the result is ``reward[i] + gamma * result[i + 1]``; an
        empty ``reward`` gives an empty array.
        """
        reward_out = np.zeros(len(reward), 'float64')
        running_return = 0.0
        # walk backwards so each step can reuse the return of its successor
        for i in reversed(range(len(reward))):
            running_return = reward[i] + gamma * running_return
            reward_out[i] = running_return
        return reward_out

    def get_state_shape(self):
        """Return the last three entries of the network input shape."""
        return self.network.shape[1], self.network.shape[2], self.network.shape[3]

    def _state_reshape(self, state):
        """
        Convert an RGB frame to a cropped greyscale array with a leading axis of size 1.

        The frame is cropped by ``self.network.cropping`` so that its size equals
        the frame size minus the cropping, which is how the network derives its
        input shape.
        """
        # NOTE(review): `Image` is not imported in the visible cells; presumably
        # `from PIL import Image` is expected - confirm.
        img = Image.fromarray(state, 'RGB').convert('L')
        pixels = np.array(img)
        # assumes cropping is ordered (top, bottom, left, right); only the pairing
        # rows/columns follows from the network shape - TODO confirm the order
        top, bottom, left, right = self.network.cropping
        height, width = pixels.shape
        cropped = pixels[top:height - bottom, left:width - right]
        return np.expand_dims(cropped, 0)

## Training

All that is left is to run the training with the given hyperparameters.

In [None]:
import gym

# Create the Pong environment the agent will play.
env = gym.make('Pong-v0')

# Four values that Network subtracts from the observation resolution
# to obtain its input shape.
cropping = (30, 10, 6, 6)

# Build the policy network (three outputs) and wrap it in the agent.
network = Network(
    resolution=env.observation_space.shape,
    number_of_outputs=3,
    cropping=cropping
)
print('Completed network')
agent = AgentPolicy(env, network)
print('Completed policy')

# Start training with the chosen hyperparameters.
agent.learn(epochs=2500,
            states_per_batch=10000,
            time_limit=10000,
            learning_rate=0.00025,
            discount_factor=0.99)