"""
Handles construction of policy nets.
Should more or less wrap the tensorflow business required.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import numpy as np
import tensorflow as tf
def _filter_summary(filters):
    """Write greyscale image summaries of the convolutional filters.

    `filters` is a `[height, width, in_channels, out_channels]` weight
    tensor; it is transposed and unpacked so each filter channel gets its
    own single-channel (greyscale) image summary.
    """
    with tf.name_scope('summaries'):
        # [h, w, in, out] -> unpack along the leading (out) axis, then add
        # a trailing singleton dimension so each slice renders greyscale.
        shuffled = tf.transpose(filters, [3, 2, 0, 1])
        per_channel = [tf.expand_dims(single, [3])
                       for single in tf.unpack(shuffled)]
        base_name = filters.op.name
        for index, channel in enumerate(per_channel):
            tf.image_summary(base_name + '/filter_{}'.format(index), channel)
def _activation_summary(act):
    """Attach histogram and sparsity summaries for a layer's activations."""
    with tf.name_scope('summaries'):
        layer_name = act.op.name
        tf.histogram_summary(layer_name + '/activations', act)
        tf.scalar_summary(layer_name + '/sparsity', tf.nn.zero_fraction(act))
def _conv_layer(input_var, shape, stride, name, summarise=True, averager=None):
    """Make a single convolutional layer (conv -> bias add -> relu).

    Args:
        input_var: `[batch, height, width, channels]` input tensor.
        shape: shape of the weight tensor required:
            `[filter_height, filter_width, in_channels, out_channels]`.
        stride: an int, used as both the vertical and horizontal stride.
        name: name of the variable scope holding the layer's variables.
        summarise: if True, attach activation summaries to the output.
        averager: optional moving average (e.g.
            `tf.train.ExponentialMovingAverage`); if given, the layer reads
            the averaged (shadow) variables rather than the raw ones.

    Returns:
        the post-relu `[batch, height, width, out_channels]` activations.
    """
    with tf.variable_scope(name) as scope:
        # identity check rather than truthiness: an explicitly supplied
        # averager should always be honoured.
        if averager is None:
            filters = tf.get_variable('weights', shape)
            biases = tf.get_variable('biases', [shape[-1]],
                                     initializer=tf.constant_initializer(0.0))
        else:
            filters = averager.average(tf.get_variable('weights', shape))
            biases = averager.average(tf.get_variable('biases', [shape[-1]]))
        conv = tf.nn.conv2d(input_var, filters, [1, stride, stride, 1],
                            padding='SAME')
        conv = tf.nn.bias_add(conv, biases)
        conv = tf.nn.relu(conv, name=scope.name)
        if summarise:
            _activation_summary(conv)
        return conv
def _fc_layer(input_var, size, name, nonlin=tf.nn.relu, summarise=True,
              averager=None):
    """Make a fully connected layer.

    Args:
        input_var: `[batch, inputs]` tensor. If it has more dimensions it
            is flattened, assuming the first dimension is the batch.
        size: scalar number of outputs.
        name: name of the variable scope holding the layer's variables.
        nonlin: nonlinearity applied to the affine output.
        summarise: if True, attach activation summaries to the output.
        averager: optional moving average; if given, the layer reads the
            averaged (shadow) variables rather than the raw ones.

    Returns:
        the `[batch, size]` activations.
    """
    with tf.variable_scope(name) as scope:
        input_shape = input_var.get_shape().as_list()
        if len(input_shape) != 2:
            # flatten everything except the leading batch dimension
            input_var = tf.reshape(input_var, [input_shape[0], -1])
            input_dim = input_var.get_shape()[1]
        else:
            input_dim = input_shape[1]
        if averager is None:  # get variables
            weights = tf.get_variable('weights', shape=[input_dim, size],
                                      trainable=True)
            biases = tf.get_variable('biases', shape=[size],
                                     initializer=tf.constant_initializer(0.0),
                                     trainable=True)
        else:  # get shadow variables from the moving average
            # pass the shapes here too (consistent with _conv_layer):
            # get_variable without a shape is only valid under reuse
            weights = averager.average(
                tf.get_variable('weights', shape=[input_dim, size]))
            biases = averager.average(
                tf.get_variable('biases', shape=[size]))
        activation = nonlin(tf.matmul(input_var, weights) + biases,
                            name=scope.name)
        if summarise:
            _activation_summary(activation)
        return activation
def convolutional_inference(input_var, shape, averager=None, summarise=False,
                            dropout=1.0):
    """Build the feedforward part of the model minus the final softmax.

    Args:
        input_var: input tensor fed to the first layer.
        shape: list of layer specs. A length-4 sequence builds a conv
            layer; a bare int or a length-1 sequence builds a fully
            connected layer. The last layer gets no nonlinearity.
        averager: optional moving average, forwarded to the layer builders.
        summarise: whether each layer attaches summaries.
        dropout: keep probability; applied after every layer except the
            last when != 1.0.

    Returns:
        the output of the final layer (the logits).

    Raises:
        ValueError: if a layer spec sequence has an unsupported length.
    """
    layer_input = input_var
    nonlin = tf.nn.relu
    num_layers = len(shape)
    for i, layer in enumerate(shape):
        if i == num_layers - 1:
            # no relu on the final (logit) layer
            nonlin = tf.identity
        # Probe the spec's kind *before* building the layer: the old code
        # wrapped the builder calls in the try, so a TypeError raised
        # inside _conv_layer/_fc_layer was silently mistaken for
        # "layer is an int" and re-dispatched.
        try:
            spec_len = len(layer)
        except TypeError:
            spec_len = None  # a bare int: fully connected
        if spec_len == 4:
            # then it is a conv layer
            layer_input = _conv_layer(layer_input, layer, 1,
                                      'conv{}'.format(i+1),
                                      averager=averager,
                                      summarise=summarise)
        elif spec_len == 1 or spec_len is None:
            width = layer if spec_len is None else layer[0]
            layer_input = _fc_layer(layer_input, width,
                                    'full{}'.format(i+1),
                                    averager=averager,
                                    nonlin=nonlin,
                                    summarise=summarise)
        else:
            raise ValueError("Can't deal with shape {}".format(layer))
        if dropout != 1.0 and i < num_layers - 1:
            layer_input = tf.nn.dropout(layer_input, keep_prob=dropout)
    return layer_input  # the final one is in fact the output
def policy_gradient_loss(logits, actions, rewards):
    """Loss that encourages good moves and discourages bad ones.

    Organising trajectories into batches is left to the caller.

    Args:
        logits: network outputs, `[batch_size, num_actions]`.
        actions: the actions actually chosen, `[batch_size]` of integers.
        rewards: `[batch_size]` tensor of whatever advantage function
            you want.

    Returns:
        scalar tensor: the negative advantage-weighted mean log
        probability of the chosen actions (something to minimise).
    """
    with tf.name_scope('loss'):
        log_probs = tf.nn.log_softmax(logits)
        # Index the chosen action per row by flattening; direct 2-d
        # gathering is awkward, see
        # https://github.com/tensorflow/tensorflow/issues/206
        batch_size, num_actions = log_probs.get_shape().as_list()
        flat_indices = tf.range(0, batch_size) * num_actions + actions
        chosen_log_probs = tf.gather(tf.reshape(log_probs, [-1]),
                                     flat_indices)
        return -tf.reduce_mean(chosen_log_probs * rewards)
def get_placeholders(batch_size, image_size):
    """Get the three placeholders needed for training.

    Args:
        batch_size: leading dimension of every placeholder.
        image_size: list giving the per-example image dimensions.

    Returns:
        tuple of placeholders:
        - `inputs`: float images, `[batch_size] + image_size`.
        - `actions`: `[batch_size]` ints, the final choices made.
        - `advantages`: `[batch_size]` floats of whatever advantage is
          used (nicer to keep in tensorflow eventually, but this is the
          simplest for now).
    """
    input_shape = [batch_size] + image_size
    inputs = tf.placeholder(tf.float32, shape=input_shape, name='inputs')
    actions = tf.placeholder(tf.int32, shape=[batch_size], name='actions')
    advantages = tf.placeholder(tf.float32, shape=[batch_size],
                                name='advantages')
    return inputs, actions, advantages
def get_training_op(loss, learning_rate=0.01,
                    collection=None,
                    global_step=None):
    """Get an op that runs a step of RMSProp on the given variables.

    Args:
        loss: the (scalar) loss tensor to minimise.
        learning_rate: learning rate (a tensor or a float).
        collection: list of variables to train; defaults to
            tf.trainable_variables().
        global_step: integer tensor, incremented per step if supplied.

    Returns:
        an op that runs a step of training.
    """
    optimiser = tf.train.RMSPropOptimizer(learning_rate)
    train_step = optimiser.minimize(loss, global_step=global_step,
                                    var_list=collection)
    return train_step
if __name__ == '__main__':
    import logging
    logging.getLogger().setLevel(logging.DEBUG)
    # smoke test: make sure the graph builds and trains
    shape = [[3, 3, 3, 16], [6, 6, 16, 4], 3]
    # make some random 9x9 3 channel images
    input_var = tf.Variable(tf.random_normal([10, 9, 9, 3]), trainable=False)
    logits = convolutional_inference(input_var, shape)
    # make some random +/-1 rewards
    advantages = tf.Variable(tf.random_uniform([10], maxval=2, dtype=tf.int32),
                             trainable=False)
    advantages = 2.0 * tf.cast(advantages, tf.float32) - 1.0
    # assume policy is as greedy as can be
    actions = tf.cast(tf.argmax(logits, 1), tf.int32)
    loss = policy_gradient_loss(logits, actions, advantages)
    print([var.name for var in tf.trainable_variables()])
    print(tf.gradients(loss, tf.trainable_variables()))
    global_step = tf.Variable(0, trainable=False)
    train_op = get_training_op(loss, global_step=global_step)
    sess = tf.Session()
    writer = tf.train.SummaryWriter('/tmp/test/logs', sess.graph)
    # NOTE: merge_all_summaries() returns None when no summary ops exist —
    # which is the case here, since convolutional_inference was built with
    # the default summarise=False. Fetching None from sess.run raises, so
    # guard for it instead of fetching unconditionally.
    all_summaries = tf.merge_all_summaries()
    with sess.as_default():
        sess.run(tf.initialize_all_variables())
        print(sess.run(loss))
        for _ in range(100):
            if all_summaries is not None:
                loss_val, _, summary = sess.run(
                    [loss, train_op, all_summaries])
                writer.add_summary(summary, global_step=global_step.eval())
            else:
                loss_val, _ = sess.run([loss, train_op])
            print('\r{}'.format(-loss_val), end='')
        print('\r{}'.format(-loss_val))