-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
vgsl_model.py
executable file
·601 lines (530 loc) · 22.8 KB
/
vgsl_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""String network description language to define network layouts."""
from __future__ import print_function
import re
import time
import decoder
import errorcounter as ec
import shapes
import tensorflow as tf
import vgsl_input
import vgslspecs
import tensorflow.contrib.slim as slim
from tensorflow.core.framework import summary_pb2
from tensorflow.python.platform import tf_logging as logging
# Learning-rate decay parameters.
# The requested half-life is split into DECAY_STEPS_FACTOR equal intervals,
# and the decaying part of the learning rate is multiplied by DECAY_RATE once
# per interval. DECAY_RATE is the DECAY_STEPS_FACTOR-th root of 1/2, so after
# DECAY_STEPS_FACTOR intervals (one half-life) the decaying part has halved.
DECAY_STEPS_FACTOR = 16
DECAY_RATE = 0.5 ** (1.0 / DECAY_STEPS_FACTOR)
def Train(train_dir,
          model_str,
          train_data,
          max_steps,
          master='',
          task=0,
          ps_tasks=0,
          initial_learning_rate=0.001,
          final_learning_rate=0.001,
          learning_rate_halflife=160000,
          optimizer_type='Adam',
          num_preprocess_threads=1,
          reader=None):
  """Trains a network for max_steps steps, with no dependence on FLAGS.

  Builds the training graph with InitNetwork and runs training steps under a
  tf.train.Supervisor until the global step reaches max_steps. If the managed
  session is aborted, the error is logged and a new session is requested.

  Args:
    train_dir: Directory to write checkpoints.
    model_str: Network specification string.
    train_data: Training data file pattern.
    max_steps: Number of training steps to run.
    master: Name of the TensorFlow master to use.
    task: Task id of this replica running the training. (0 will be master).
    ps_tasks: Number of tasks in ps job, or 0 if no ps job.
    initial_learning_rate: Learning rate at start of training.
    final_learning_rate: Asymptotic minimum learning rate.
    learning_rate_halflife: Number of steps over which to halve the difference
      between initial and final learning rate.
    optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.
    num_preprocess_threads: Number of input threads.
    reader: Function that returns an actual reader to read Examples from input
      files. If None, uses tf.TFRecordReader().
  """
  # Pick the device (or device function) under which the graph is built.
  device = (tf.ReplicaDeviceSetter(ps_tasks)
            if master.startswith('local') else '/cpu:0')
  with tf.Graph().as_default():
    with tf.device(device):
      network = InitNetwork(train_data, model_str, 'train',
                            initial_learning_rate, final_learning_rate,
                            learning_rate_halflife, optimizer_type,
                            num_preprocess_threads, reader)

      # The Supervisor handles initialization, summaries, checkpoints and
      # recovery. When several replicas of this program run, only the one
      # with task == 0 is the 'chief' that takes care of initialization etc.
      supervisor = tf.train.Supervisor(
          logdir=train_dir,
          is_chief=(task == 0),
          saver=network.saver,
          save_summaries_secs=10,
          save_model_secs=30,
          recovery_wait_secs=5)

      step = 0
      while step < max_steps:
        try:
          # Obtain an initialized (possibly recovered) session; this also
          # launches the checkpointing, summary and step-counting services.
          with supervisor.managed_session(master) as sess:
            while step < max_steps:
              _, step = network.TrainAStep(sess)
              if supervisor.coord.should_stop():
                break
        except tf.errors.AbortedError as err:
          # Log and fall through to the outer loop, which asks for a new
          # session while steps remain.
          logging.error('Received error:%s', err)
def Eval(train_dir,
         eval_dir,
         model_str,
         eval_data,
         decoder_file,
         num_steps,
         graph_def_file=None,
         eval_interval_secs=0,
         reader=None):
  """Restores a model from a checkpoint and evaluates it.

  Evaluates the latest checkpoint found in train_dir. With
  eval_interval_secs == 0 this is done once; otherwise the evaluation is
  repeated indefinitely, sleeping eval_interval_secs between rounds.

  Args:
    train_dir: Directory to find checkpoints.
    eval_dir: Directory to write summary events.
    model_str: Network specification string.
    eval_data: Evaluation data file pattern.
    decoder_file: File to read to decode the labels.
    num_steps: Number of eval steps to run.
    graph_def_file: File to write graph definition to for freezing. It is only
      written if it does not already exist.
    eval_interval_secs: How often to run evaluations, or once if 0.
    reader: Function that returns an actual reader to read Examples from input
      files. If None, uses tf.TFRecordReader().

  Returns:
    The error rates produced by the most recent evaluation (label, word
    recall, word precision and sequence error). If no checkpoint was found,
    an ec.ErrorRates whose fields are all None.

  Raises:
    ValueError: If a checkpoint is found but no decoder_file was given, since
      non-softmax decoder evaluation is not implemented.
  """
  decode = None
  if decoder_file:
    decode = decoder.Decoder(decoder_file)

  # Placeholder result, returned unchanged when nothing could be evaluated.
  rates = ec.ErrorRates(
      label_error=None,
      word_recall_error=None,
      word_precision_error=None,
      sequence_error=None)
  with tf.Graph().as_default():
    model = InitNetwork(eval_data, model_str, 'eval', reader=reader)
    sw = tf.summary.FileWriter(eval_dir)
    try:
      while True:
        # A fresh session is used for every round. The context manager closes
        # it when the round ends, so repeated evaluation does not accumulate
        # open sessions and the resources they own.
        with tf.Session('') as sess:
          if (graph_def_file is not None and
              not tf.gfile.Exists(graph_def_file)):
            # Write the eval version of the graph to a file for freezing.
            graph_def = sess.graph.as_graph_def(add_shapes=True)
            with tf.gfile.FastGFile(graph_def_file, 'w') as f:
              f.write(graph_def.SerializeToString())
          ckpt = tf.train.get_checkpoint_state(train_dir)
          if ckpt and ckpt.model_checkpoint_path:
            step = model.Restore(ckpt.model_checkpoint_path, sess)
            if decode:
              rates = decode.SoftmaxEval(sess, model, num_steps)
              for tag, rate in (
                  ('Label error rate', rates.label_error),
                  ('Word recall error rate', rates.word_recall_error),
                  ('Word precision error rate', rates.word_precision_error),
                  ('Sequence error rate', rates.sequence_error)):
                _AddRateToSummary(tag, rate, step, sw)
              sw.flush()
              print('Error rates=', rates)
            else:
              raise ValueError(
                  'Non-softmax decoder evaluation not implemented!')
        # Sleep (if repeating) only after the session has been released.
        if eval_interval_secs:
          time.sleep(eval_interval_secs)
        else:
          break
    finally:
      # Release the event file, on success and on error alike.
      sw.close()
  return rates
def InitNetwork(input_pattern,
                model_spec,
                mode='eval',
                initial_learning_rate=0.00005,
                final_learning_rate=0.00005,
                halflife=1600000,
                optimizer_type='Adam',
                num_preprocess_threads=1,
                reader=None):
  """Creates and builds a VGSLImageModel from a full model_spec string.

  The spec is cut at the first '[' and the last ']': the text before the
  brackets is the input spec, the bracketed part (brackets included) is the
  layer spec, and whatever follows is the output spec.

  Eval tasks need only specify input_pattern and model_spec.

  Args:
    input_pattern: File pattern of the data in tfrecords of Example.
    model_spec: Concatenation of input spec, model spec and output spec.
      See Build below for input/output spec. For model spec, see vgslspecs.py
    mode: One of 'train', 'eval'
    initial_learning_rate: Initial learning rate for the network.
    final_learning_rate: Final learning rate for the network.
    halflife: Number of steps over which to halve the difference between
      initial and final learning rate for the network.
    optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.
    num_preprocess_threads: Number of threads to use for image processing.
    reader: Function that returns an actual reader to read Examples from input
      files. If None, uses tf.TFRecordReader().

  Returns:
    A VGSLImageModel class.

  Raises:
    ValueError: if the model spec syntax is incorrect.
  """
  network = VGSLImageModel(mode, model_spec, initial_learning_rate,
                           final_learning_rate, halflife)
  open_pos = model_spec.find('[')
  close_pos = model_spec.rfind(']')
  # find/rfind return -1 when the bracket is absent.
  if min(open_pos, close_pos) < 0:
    raise ValueError('Failed to find [] in model spec! ', model_spec)
  layers_end = close_pos + 1
  network.Build(input_pattern, model_spec[:open_pos],
                model_spec[open_pos:layers_end], model_spec[layers_end:],
                optimizer_type, num_preprocess_threads, reader)
  return network
class VGSLImageModel(object):
  """Class that builds a tensor flow model for training or evaluation.

  Construct the object, then call Build once to create the graph. Afterwards
  use TrainAStep in 'train' mode, or Restore followed by RunAStep in 'eval'
  mode.
  """

  def __init__(self, mode, model_spec, initial_learning_rate,
               final_learning_rate, halflife):
    """Constructs a VGSLImageModel.

    Only stores the configuration; graph ops are created later by Build.

    Args:
      mode: One of "train", "eval"
      model_spec: Full model specification string, for reference only.
      initial_learning_rate: Initial learning rate for the network.
      final_learning_rate: Final learning rate for the network.
      halflife: Number of steps over which to halve the difference between
        initial and final learning rate for the network.
    """
    # The string that was used to build this model.
    self.model_spec = model_spec
    # The layers between input and output.
    self.layers = None
    # The train/eval mode.
    self.mode = mode
    # Learning rate schedule parameters, consumed by _AddOptimizer.
    self.initial_learning_rate = initial_learning_rate
    self.final_learning_rate = final_learning_rate
    self.decay_steps = halflife / DECAY_STEPS_FACTOR
    self.decay_rate = DECAY_RATE
    # Tensor for the labels.
    self.labels = None
    self.sparse_labels = None
    # Debug data containing the truth text.
    self.truths = None
    # Tensor for loss
    self.loss = None
    # Train operation
    self.train_op = None
    # Tensor for the global step counter
    self.global_step = None
    # Tensor for the output predictions (usually softmax)
    self.output = None
    # True if we are using CTC training mode.
    self.using_ctc = False
    # Saver object to load or restore the variables.
    self.saver = None

  def Build(self, input_pattern, input_spec, model_spec, output_spec,
            optimizer_type, num_preprocess_threads, reader):
    """Builds the model from the separate input/layers/output spec strings.

    Args:
      input_pattern: File pattern of the data in tfrecords of TF Example format.
      input_spec: Specification of the input layer:
        batchsize,height,width,depth (4 comma-separated integers)
          Training will run with batches of batchsize images, but runtime can
          use any batch size.
          height and/or width can be 0 or -1, indicating variable size,
          otherwise all images must be the given size.
          depth must be 1 or 3 to indicate greyscale or color.
          NOTE 1-d image input, treating the y image dimension as depth, can
          be achieved using S1(1x0)1,3 as the first op in the model_spec, but
          the y-size of the input must then be fixed.
      model_spec: Model definition. See vgslspecs.py
      output_spec: Output layer definition:
        O(2|1|0)(l|s|c)n output layer with n classes.
          2 (heatmap) Output is a 2-d vector map of the input (possibly at
            different scale).
          1 (sequence) Output is a 1-d sequence of vector values.
          0 (value) Output is a 0-d single vector value.
          l uses a logistic non-linearity on the output, allowing multiple
            hot elements in any output vector value.
          s uses a softmax non-linearity, with one-hot output in each value.
          c uses a softmax with CTC. Can only be used with 1 (sequence).
          NOTE Only O1s and O1c are currently supported.
      optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.
      num_preprocess_threads: Number of threads to use for image processing.
      reader: Function that returns an actual reader to read Examples from input
        files. If None, uses tf.TFRecordReader().
    """
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    shape = _ParseInputSpec(input_spec)
    out_dims, out_func, num_classes = _ParseOutputSpec(output_spec)
    self.using_ctc = out_func == 'c'
    # The sixth value returned by ImageInput is not used here.
    images, heights, widths, labels, sparse, _ = vgsl_input.ImageInput(
        input_pattern, num_preprocess_threads, shape, self.using_ctc, reader)
    self.labels = labels
    self.sparse_labels = sparse
    self.layers = vgslspecs.VGSLSpecs(widths, heights, self.mode == 'train')
    last_layer = self.layers.Build(images, model_spec)
    self._AddOutputs(last_layer, out_dims, out_func, num_classes)
    if self.mode == 'train':
      self._AddOptimizer(optimizer_type)

    # For saving the model across training and evaluation
    self.saver = tf.train.Saver()

  def TrainAStep(self, sess):
    """Runs a training step in the session.

    Args:
      sess: Session in which to train the model.

    Returns:
      loss, global_step.
    """
    _, loss, step = sess.run([self.train_op, self.loss, self.global_step])
    return loss, step

  def Restore(self, checkpoint_path, sess):
    """Restores the model from the given checkpoint path into the session.

    Args:
      checkpoint_path: File pathname of the checkpoint.
      sess: Session in which to restore the model.

    Returns:
      global_step of the model.
    """
    self.saver.restore(sess, checkpoint_path)
    return tf.train.global_step(sess, self.global_step)

  def RunAStep(self, sess):
    """Runs a step for eval in the session.

    Args:
      sess: Session in which to run the model.

    Returns:
      output tensor result, labels tensor result.
    """
    return sess.run([self.output, self.labels])

  def _AddOutputs(self, prev_layer, out_dims, out_func, num_classes):
    """Adds the output layer and loss function.

    Sets self.output, and in 'train' mode also self.loss (with a 'loss'
    scalar summary). self.labels may be reshaped to match the output.

    Args:
      prev_layer: Output of last layer of main network.
      out_dims: Number of output dimensions, 0, 1 or 2.
      out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic.
      num_classes: Number of outputs/size of last output dimension.
    """
    # Height of the last layer, used by the CTC loss to compute the sequence
    # lengths.
    height_in = shapes.tensor_dim(prev_layer, dim=1)
    logits, outputs = self._AddOutputLayer(prev_layer, out_dims, out_func,
                                           num_classes)
    if self.mode == 'train':
      # Setup loss for training.
      self.loss = self._AddLossFunction(logits, height_in, out_dims, out_func)
      tf.summary.scalar('loss', self.loss)
    elif out_dims == 0:
      # Be sure the labels match the output, even in eval mode.
      self.labels = tf.slice(self.labels, [0, 0], [-1, 1])
      self.labels = tf.reshape(self.labels, [-1])

    logging.info('Final output=%s', outputs)
    logging.info('Labels tensor=%s', self.labels)
    self.output = outputs

  def _AddOutputLayer(self, prev_layer, out_dims, out_func, num_classes):
    """Add the fully-connected logits and SoftMax/Logistic output Layer.

    Args:
      prev_layer: Output of last layer of main network.
      out_dims: Number of output dimensions, 0, 1 or 2.
      out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic.
      num_classes: Number of outputs/size of last output dimension.

    Returns:
      logits:  Pre-softmax/logistic fully-connected output shaped to out_dims.
      outputs: Post-softmax/logistic shaped to out_dims.

    Raises:
      ValueError: if syntax is incorrect.
    """
    # Reduce dimensionality appropriate to the output dimensions.
    batch_in = shapes.tensor_dim(prev_layer, dim=0)
    height_in = shapes.tensor_dim(prev_layer, dim=1)
    width_in = shapes.tensor_dim(prev_layer, dim=2)
    depth_in = shapes.tensor_dim(prev_layer, dim=3)
    if out_dims:
      # Combine any remaining height and width with batch and unpack after.
      shaped = tf.reshape(prev_layer, [-1, depth_in])
    else:
      # Everything except batch goes to depth, and therefore has to be known.
      shaped = tf.reshape(prev_layer, [-1, height_in * width_in * depth_in])
    logits = slim.fully_connected(shaped, num_classes, activation_fn=None)
    if out_func == 'l':
      raise ValueError('Logistic not yet supported!')
    else:
      output = tf.nn.softmax(logits)
    # Reshape to the desired output.
    if out_dims == 2:
      output_shape = [batch_in, height_in, width_in, num_classes]
    elif out_dims == 1:
      output_shape = [batch_in, height_in * width_in, num_classes]
    else:
      output_shape = [batch_in, num_classes]
    output = tf.reshape(output, output_shape, name='Output')
    logits = tf.reshape(logits, output_shape)
    return logits, output

  def _AddLossFunction(self, logits, height_in, out_dims, out_func):
    """Add the appropriate loss function.

    For softmax outputs, self.labels is padded, clipped or reshaped so that
    it matches the logits.

    Args:
      logits:  Pre-softmax/logistic fully-connected output shaped to out_dims.
      height_in:  Height of logits before going into the softmax layer.
      out_dims: Number of output dimensions, 0, 1 or 2.
      out_func: Output non-linearity. 's' or 'c'=softmax, 'l'=logistic.

    Returns:
      loss: That which is to be minimized.

    Raises:
      ValueError: if logistic is used.
    """
    if out_func == 'c':
      # Transpose batch to the middle.
      ctc_input = tf.transpose(logits, [1, 0, 2])
      # Compute the widths of each batch element from the input widths.
      widths = self.layers.GetLengths(dim=2, factor=height_in)
      # Pass the arguments by keyword: the positional order of labels and
      # inputs differs between TensorFlow releases, the names do not.
      cross_entropy = tf.nn.ctc_loss(
          labels=self.sparse_labels, inputs=ctc_input, sequence_length=widths)
    elif out_func == 's':
      if out_dims == 2:
        self.labels = _PadLabels3d(logits, self.labels)
      elif out_dims == 1:
        self.labels = _PadLabels2d(
            shapes.tensor_dim(
                logits, dim=1), self.labels)
      else:
        self.labels = tf.slice(self.labels, [0, 0], [-1, 1])
        self.labels = tf.reshape(self.labels, [-1])
      cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=self.labels, name='xent')
    else:
      # TODO(rays) Labels need an extra dimension for logistic, so different
      # padding functions are needed, as well as a different loss function.
      raise ValueError('Logistic not yet supported!')
    return tf.reduce_sum(cross_entropy)

  def _AddOptimizer(self, optimizer_type):
    """Adds an optimizer with learning rate decay to minimize self.loss.

    Sets self.train_op and adds a 'learn_rate' scalar summary.

    Args:
      optimizer_type: One of 'GradientDescent', 'AdaGrad', 'Momentum', 'Adam'.

    Raises:
      ValueError: if the optimizer type is unrecognized.
    """
    # Only the excess of the initial rate over the final rate decays, so the
    # learning rate approaches final_learning_rate instead of zero.
    learn_rate_delta = self.initial_learning_rate - self.final_learning_rate
    learn_rate_dec = tf.add(
        tf.train.exponential_decay(learn_rate_delta, self.global_step,
                                   self.decay_steps, self.decay_rate),
        self.final_learning_rate)
    if optimizer_type == 'GradientDescent':
      opt = tf.train.GradientDescentOptimizer(learn_rate_dec)
    elif optimizer_type == 'AdaGrad':
      opt = tf.train.AdagradOptimizer(learn_rate_dec)
    elif optimizer_type == 'Momentum':
      opt = tf.train.MomentumOptimizer(learn_rate_dec, momentum=0.9)
    elif optimizer_type == 'Adam':
      opt = tf.train.AdamOptimizer(learning_rate=learn_rate_dec)
    else:
      raise ValueError('Invalid optimizer type: ' + optimizer_type)
    tf.summary.scalar('learn_rate', learn_rate_dec)

    self.train_op = opt.minimize(
        self.loss, global_step=self.global_step, name='train')
def _PadLabels3d(logits, labels):
  """Pads or slices 3-d labels so they match the size of the logits.

  Handles 2-d softmax output, where labels is [batch, height, width] and
  logits is [batch, height, width, onehot]. The work is done in two passes
  of _PadLabels2d: first on the width of every row, then on the flattened
  height * width of every batch element.

  Args:
    logits: 4-d Pre-softmax fully-connected output.
    labels: 3-d, but not necessarily matching in size.

  Returns:
    labels: Resized by padding or clipping to match logits.
  """
  logit_dims = shapes.tensor_shape(logits)
  label_dims = shapes.tensor_shape(labels)
  # Pass 1: treat every image row as a batch entry and fix its width.
  rows = _PadLabels2d(
      labels=tf.reshape(labels, [-1, label_dims[2]]),
      logits_size=logit_dims[2])
  # Pass 2: flatten each batch element and fix its total number of labels.
  flat = _PadLabels2d(
      labels=tf.reshape(rows, [label_dims[0], -1]),
      logits_size=logit_dims[1] * logit_dims[2])
  return tf.reshape(flat, [label_dims[0], logit_dims[1], logit_dims[2]])
def _PadLabels2d(logits_size, labels):
  """Pads or slices the 2nd dimension of 2-d labels to match logits_size.

  Handles 1-d softmax output, where labels is [batch, seq] and logits is
  [batch, seq, onehot].

  Args:
    logits_size: Tensor returned from tf.shape giving the target size.
    labels: 2-d, but not necessarily matching in size.

  Returns:
    labels: Resized by padding or clipping the last dimension to logits_size.
  """
  # Positive when labels is shorter than the target and must be extended.
  shortfall = logits_size - tf.shape(labels)[1]
  return tf.cond(
      tf.greater(shortfall, 0),
      lambda: tf.pad(labels, [[0, 0], [0, shortfall]]),
      lambda: tf.slice(labels, [0, 0], [-1, logits_size]))
def _ParseInputSpec(input_spec):
  """Parses input_spec and returns the numbers obtained therefrom.

  The spec must start with batchsize,height,width,depth. A height or width
  that is zero or negative (e.g. 0 or -1) means "variable size" and is
  passed on as None. Any text after the four numbers is ignored.

  Args:
    input_spec:  Specification of the input layer. See Build.

  Returns:
    shape:      ImageShape with the desired shape of the input.

  Raises:
    ValueError: if syntax is incorrect.
  """
  # Height and width accept an optional leading '-' so that the documented
  # -1 for variable size parses; batch size and depth must be unsigned.
  m = re.match(r'(\d+),(-?\d+),(-?\d+),(\d+)', input_spec)
  if m is None:
    raise ValueError('Failed to parse input spec:' + input_spec)
  batch_size, y_size, x_size, depth = (int(group) for group in m.groups())
  if depth not in (1, 3):
    raise ValueError('Depth must be 1 or 3, had:', depth)
  return vgsl_input.ImageShape(batch_size,
                               y_size if y_size > 0 else None,
                               x_size if x_size > 0 else None,
                               depth)
def _ParseOutputSpec(output_spec):
  """Splits an output spec of the form O(2|1|0)(l|s|c)n into its parts.

  Only the start of output_spec has to match; trailing text is ignored.

  Args:
    output_spec: Output layer definition. See Build.

  Returns:
    out_dims:     2|1|0 for 2-d, 1-d, 0-d.
    out_func:     l|s|c for logistic, softmax, softmax+CTC
    num_classes:  Number of classes in output.

  Raises:
    ValueError: if syntax is incorrect.
  """
  parsed = re.match(r'(O)(0|1|2)(l|s|c)(\d+)', output_spec)
  if parsed is None:
    raise ValueError('Failed to parse output spec:' + output_spec)
  _, dims_text, out_func, classes_text = parsed.groups()
  out_dims = int(dims_text)
  # CTC needs a sequence to align against, so only 1-d output is accepted.
  if out_func == 'c' and out_dims != 1:
    raise ValueError('CTC can only be used with a 1-D sequence!')
  return out_dims, out_func, int(classes_text)
def _AddRateToSummary(tag, rate, step, sw):
  """Writes a single scalar value to the summary writer under the given tag.

  Args:
    tag:   Name for this value.
    rate:  Value to add to the summary. Perhaps an error rate.
    step:  Global step of the graph for the x-coordinate of the summary.
    sw:    Summary writer to which to write the rate value.
  """
  value = summary_pb2.Summary.Value(tag=tag, simple_value=rate)
  summary = summary_pb2.Summary(value=[value])
  sw.add_summary(summary, step)