
Commit ea4ae51
Merge pull request #1 from SimulatedANeal/seq-baseline
OCR Sequence Baseline
SimulatedANeal committed May 7, 2018
2 parents ef4f47f + 967d6e3 commit ea4ae51
Showing 11 changed files with 196 additions and 32 deletions.
7 changes: 6 additions & 1 deletion CHANGES.md
@@ -2,5 +2,10 @@
 
 Notable changes between releases.
 
+## 0.1.2 (2018-05-07)
+* Added 3 Kana OCR tasks and baseline model
+* Vocabulary save `as_unicode` option
+* nn.conv.CNN now accessible via registry
+
 ## 0.1.1 (2018-05-03)
-* Initial commit of core functionality
+* Initial commit of core functionality
2 changes: 1 addition & 1 deletion carpedm/__init__.py
@@ -11,4 +11,4 @@
 from carpedm import nn
 from carpedm import util
 
-__version__ = '0.1.1'
+__version__ = '0.1.2'
15 changes: 11 additions & 4 deletions carpedm/data/lang.py
@@ -189,13 +189,20 @@ def __init__(self, reserved, vocab):
         self._vocab[self.UNK] = len(self._vocab)
         self._rev_vocab = {idx: key for key, idx in self._vocab.items()}
 
-    def save(self, out_dir):
-        vocab_sorted = [self._rev_vocab[idx]
-                        for idx in sorted(self._rev_vocab.keys())]
+    def save(self, out_dir, as_unicode=False):
+        types = self.types()
         with open(os.path.join(out_dir, 'vocab.txt'), 'w') as f:
-            for token in vocab_sorted:
+            for token in types:
+                if as_unicode:
+                    try:
+                        token = code2char(token)
+                    except ValueError:
+                        token = token
                 f.write(token + '\n')
 
+    def types(self):
+        return [self._rev_vocab[idx] for idx in sorted(self._rev_vocab.keys())]
+
     def char_to_id(self, char):
         """Returns the integer id of a character string."""
         if char in self._vocab:
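
A minimal usage sketch for the new `as_unicode` option, given a vocabulary instance `vocab`; the output paths and the code-point token format (e.g. "U+3042") are assumptions, only `save` and `types` appear in this diff:

    # Sketch: write the vocabulary once as raw character-code tokens
    # (e.g. "U+3042") and once decoded to unicode characters (e.g. "あ").
    vocab.save('/tmp/vocab_raw')
    vocab.save('/tmp/vocab_unicode', as_unicode=True)

    # types() lists tokens sorted by integer id, so the line number in
    # vocab.txt corresponds to the id used elsewhere in the pipeline.
    tokens = vocab.types()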
2 changes: 1 addition & 1 deletion carpedm/data/providers.py
@@ -165,7 +165,7 @@ def _parser(self, serialized, distort=False):
         if self.sparse_labels:
             tensor_dict['image/seq/char/id_sparse'] = tf.serialize_sparse(
                 ops.sparsify_label(tensor_dict['image/seq/char/id'],
-                                   tensor_dict['image/num_chars'])
+                                   tensor_dict['image/char/count'])
             )
 
         # if distort: image = distort_image(image)
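
`ops.sparsify_label` itself is not part of this diff; a rough sketch of what such an op plausibly does (signature taken from the call site, behavior assumed):

    import tensorflow as tf

    def sparsify_label(label, num_chars):
        # Assumed behavior: trim the padded dense id sequence to the true
        # character count and return the tf.SparseTensor format that
        # tf.nn.ctc_loss expects for labels.
        label = label[:num_chars]
        indices = tf.where(tf.ones_like(label, dtype=tf.bool))
        values = tf.gather_nd(label, indices)
        shape = tf.cast(tf.shape(label), tf.int64)
        return tf.SparseTensor(indices, values, shape)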
58 changes: 57 additions & 1 deletion carpedm/models/baseline.py
@@ -37,8 +37,9 @@ def name(self):
 
     def _forward_pass(self, features, data_format, axes_order,
                       is_training, reuse):
+        x = features['image/data']
         x = self._cnn.forward_pass(
-            features, data_format, axes_order, is_training, False, reuse)
+            x, data_format, axes_order, is_training, False, reuse)
         x = tf.layers.flatten(x)
         tf.logging.info('image after flatten: %s', x.get_shape())
 
@@ -51,3 +52,58 @@ def _forward_pass(self, features, data_format, axes_order,
         logits = tf.layers.dense(
             inputs=x, units=self._num_classes, name='logits')
         return logits
+
+
+@registry.register_model
+class SequenceBaseline(TFModel):
+    """A simple baseline CNN-LSTM model."""
+
+    def __init__(self, num_classes, lstm_layers=2, lstm_units=100,
+                 feature_extractor=nn.conv.CNN(), *args, **kwargs):
+        """Initializer.
+
+        Overrides TFModel.
+
+        Args:
+            num_classes (int): Number of possible character classes.
+            lstm_layers (int): Number of LSTM layers.
+            lstm_units (int): Number of units in each LSTM cell.
+            feature_extractor: Model that extracts visual features.
+            *args: Unused arguments.
+            **kwargs: Unused arguments.
+        """
+        self._num_classes = num_classes + 1  # Add CTC null label.
+        self._layers = lstm_layers
+        self._units = lstm_units
+        self._feature_extractor = feature_extractor
+
+    @property
+    def name(self):
+        return 'Baseline_seq_' + self._feature_extractor.name
+
+    def _forward_pass(self, features, data_format, axes_order,
+                      is_training, reuse):
+        x = self._feature_extractor.forward_pass(
+            features['image/data'], data_format, axes_order,
+            is_training, False, reuse)
+        if axes_order == [0, 3, 1, 2]:
+            x = tf.transpose(x, [0, 2, 3, 1])
+        x = tf.reshape(x, [-1, x.shape[1], x.shape[2] * x.shape[3]])
+        x = nn.rnn.bi_lstm(x, n_layers=self._layers, n_units=self._units)
+        seq_len = tf.tile(tf.expand_dims(tf.to_int32(tf.shape(x)[1]), 0),
+                          [tf.to_int32(tf.shape(x)[0])])
+        logits = tf.layers.dense(inputs=x, units=self._num_classes)
+
+        return {'logits': logits, 'seq_len': seq_len}
+
+    def initialize_pretrained(self, pretrained_dir):
+
+        submodel = 'Baseline_' + self._feature_extractor.name
+
+        variable_mapping = dict()
+
+        for i in range(5):
+            variable_mapping[submodel + '/conv{}/'.format(i)] \
+                = self.name + '/conv{}/'.format(i)
+
+        return variable_mapping
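
For intuition, a short sketch (sizes hypothetical) of the CNN-to-LSTM handoff in `_forward_pass`: width and channels are folded into the feature axis, and the height axis becomes the LSTM's time axis, presumably matching the vertical reading order of the kana sequences:

    import tensorflow as tf

    # Hypothetical CNN output in NHWC order: (batch, H', W', C).
    x = tf.zeros([8, 48, 16, 64])

    # Fold width and channels into features; height is the time axis.
    x = tf.reshape(x, [-1, x.shape[1], x.shape[2] * x.shape[3]])
    print(x.shape)  # (8, 48, 1024)

    # Every example in the batch shares the same sequence length.
    seq_len = tf.tile(tf.expand_dims(tf.to_int32(tf.shape(x)[1]), 0),
                      [tf.to_int32(tf.shape(x)[0])])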
7 changes: 3 additions & 4 deletions carpedm/nn/conv.py
@@ -11,9 +11,10 @@
 import tensorflow as tf
 
 from carpedm.models.generic import TFModel
+from carpedm.util import registry
 from carpedm.nn import util
 
 
+@registry.register_model
 class CNN(TFModel):
     """Modular convolutional neural network layer class."""
 
@@ -80,9 +81,7 @@ def name(self):
             p += 1
         return name[:-1]
 
-    def _forward_pass(self, features, data_format, axes_order,
-                      is_training, reuse):
-        x = features['image/data']
+    def _forward_pass(self, x, data_format, axes_order, is_training, reuse):
        x = tf.transpose(x, axes_order) if axes_order else x
        p = 0
        for c in range(len(self._kernels)):
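
With `CNN` now registered, it should be retrievable from the model registry by name; a hypothetical lookup sketch (the exact helper name is an assumption, not shown in this diff):

    from carpedm.util import registry

    # Hypothetical: registries typically map a snake_case name to the
    # registered class, so configs can reference models by string.
    model_cls = registry.model('cnn')
    feature_extractor = model_cls()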
7 changes: 4 additions & 3 deletions carpedm/tasks/generic.py
@@ -299,6 +299,7 @@ def sparse_labels(self):
             (bool): Use sparse labels.
         """
         return False
+
     # ====================== END TASK INTERFACE ====================== #
 
@@ -347,9 +348,9 @@ def input_fn(self, batch_size, subset, num_shards, overwrite=False):
             out_dir=self.task_data_dir, subset=subset, target_id=self.target,
             num_shards=self._num_shards, num_threads=self._num_threads,
             format_store=self._dataset_format, shape_store=self._shape_store,
-            shape_in=self._shape_in, chunk=self.chunk, character=self.character,
-            line=self.line, label=self.label, bbox=self.bbox,
-            overwrite=overwrite)
+            shape_in=self._shape_in, sparse_labels=self.sparse_labels,
+            chunk=self.chunk, character=self.character, line=self.line,
+            label=self.label, bbox=self.bbox, overwrite=overwrite)
 
         self._original_format = dataset.format
91 changes: 91 additions & 0 deletions carpedm/tasks/ocr.py
@@ -117,3 +117,94 @@ def regularization(self, hparams):
     @property
     def sparse_labels(self):
         return False
+
+
+@registry.register_task
+class OCRSeqKana3(OCRTask):
+
+    def __init__(self, beam_width=100, **kwargs):
+        self._beam_width = beam_width
+        super(OCRSeqKana3, self).__init__(**kwargs)
+
+    @property
+    def character_set(self):
+        return 'kana'
+
+    @property
+    def image_scope(self):
+        return 'seq'
+
+    @property
+    def sequence_length(self):
+        return 3
+
+    @property
+    def sparse_labels(self):
+        return True
+
+    @property
+    def target(self):
+        return 'image/seq/char/id_sparse'
+
+    def loss_fn(self, features, model_output, targets, is_training):
+        return tf.nn.ctc_loss(labels=targets,
+                              inputs=model_output['logits'],
+                              sequence_length=model_output['seq_len'],
+                              time_major=False)
+
+    def results(self, loss, tower_features, tower_preds, tower_targets,
+                is_training):
+
+        tf.summary.image("sample_input", tower_features[0]['image/data'])
+
+        all_logits = tf.concat([p['logits'] for p in tower_preds], axis=0)
+        seq_lens = tf.concat([p['seq_len'] for p in tower_preds], axis=0)
+
+        # TODO: fix when seqs are different lengths from multiple GPUs
+        all_labels = tf.sparse_concat(0, [p for p in tower_targets])
+        decoded, log_prob = tf.nn.ctc_beam_search_decoder(
+            inputs=tf.transpose(all_logits, [1, 0, 2]),
+            sequence_length=seq_lens,
+            beam_width=self._beam_width)
+        decoded = decoded[0]  # best path
+
+        edit_distance = tf.edit_distance(decoded, tf.to_int64(all_labels),
+                                         normalize=False)
+
+        Z = tf.cast(tf.size(all_labels), tf.float32)
+        ler = tf.reduce_sum(edit_distance) / Z
+        S = tf.cast(tf.size(edit_distance), tf.float32)
+        num_wrong_seqs = tf.cast(tf.count_nonzero(edit_distance), tf.float32)
+        ser = num_wrong_seqs / S
+
+        metrics = {
+            'ler': tf.metrics.mean(ler),
+            'ser': tf.metrics.mean(ser)
+        }
+
+        tensors_to_log = {'loss': loss, 'ler': ler, 'ser': ser}
+
+        mapping_string = tf.constant(self._meta.vocab.types())
+        table = tf.contrib.lookup.index_to_string_table_from_tensor(
+            mapping_string, default_value='NULL')
+        decoding = table.lookup(tf.to_int64(tf.sparse_tensor_to_dense(decoded)))
+        gt = table.lookup(tf.to_int64(tf.sparse_tensor_to_dense(all_labels)))
+
+        tf.summary.text('decoded', decoding)
+        tf.summary.text('gt', gt)
+
+        predictions = {
+            'classes': tf.argmax(input=all_logits, axis=1),
+            'probabilities': tf.nn.softmax(all_logits),
+            'decoded': decoding,
+        }
+
+        return tensors_to_log, predictions, metrics
+
+    def regularization(self, hparams):
+        model_params = tf.trainable_variables()
+        weight_loss = tf.multiply(
+            hparams.weight_decay,
+            tf.add_n([tf.nn.l2_loss(v) for v in model_params]),
+            name='weight_loss')
+        return weight_loss
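
A worked example (numbers hypothetical) of the `ler`/`ser` arithmetic in `results`: `Z` counts all ground-truth labels, `S` counts sequences:

    # Three 3-kana sequences -> Z = 9 labels, S = 3 sequences.
    edit_distance = [0, 1, 2]       # per-sequence unnormalized distances
    ler = sum(edit_distance) / 9    # 0.33: average edit errors per label
    ser = sum(d > 0 for d in edit_distance) / 3  # 0.67: wrong-sequence rate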
19 changes: 19 additions & 0 deletions docs/source/benchmarks.rst
@@ -0,0 +1,19 @@
+.. _benchmarks:
+
+Benchmarks
+==========
+
+Single Kana OCR
+---------------
+
+Running the example :ref:`main.py <main>` for the full PMJTC dataset (171,944 training examples, 131 character classes, as of 2 May 2018)
+
+* On a 2017 MacBook Pro:
+    * Generating the (train & dev) data: 1 hour, 20 minutes
+    * Training the model for 5 epochs: 2 hours, 27 minutes
+    * Dev Accuracy: 94.67%
+
+* On a Linux Machine using 1 Titan X (Pascal) GPU:
+    * Generating the (train & dev) data: 31 minutes
+    * Training the model for 5 epochs: 21 minutes
+    * Dev Accuracy: 95.23%
19 changes: 2 additions & 17 deletions docs/source/guides/usage.rst
@@ -150,27 +150,12 @@ At the end of 30 epochs, it achieved a development set accuracy of **65.27%**. N
 And considering the **70** character classes and **4.19%** majority class for this task and specific dataset, we are already doing much better than chance!
 
 Running this same code for the *full* currently available PMJTC dataset takes much longer but---as you would expect when
-adding more data---achieves a higher accuracy. Though certainly indicative of the benefit of more data,
-note that the accuracies presented below are not a fair comparison to the one above for two reasons:
+adding more data---achieves a higher accuracy (see :ref:`benchmarks`). Though certainly indicative of the benefit of more data,
+note that the accuracies presented in the benchmarks are not a fair comparison to the one above for two reasons:
 
 1. There are more kana character classes in the full dataset: **131**
 2. The development sets on which accuracies are reported are different.
 
-Benchmarks
-~~~~~~~~~~
-
-Running the example :ref:`main.py <main>` for the full PMJTC dataset (171,944 training examples, as of 2 May 2018)
-
-* On a 2017 MacBook Pro:
-    * Generating the (train & dev) data: 1 hour, 20 minutes
-    * Training the model for 5 epochs: 2 hours, 27 minutes
-    * Dev Accuracy: 94.67%
-
-* On a Linux Machine using 1 Titan X (Pascal) GPU:
-    * Generating the (train & dev) data: 31 minutes
-    * Training the model for 5 epochs: 21 minutes
-    * Dev Accuracy: 95.23%
-
 Conclusion
 ~~~~~~~~~~
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -20,6 +20,7 @@ Documentation
    :maxdepth: 1
    :caption: Other
 
+   benchmarks
    contribution
    conduct
    license
