
Commit ea4ae51
Merge pull request #1 from SimulatedANeal/seq-baseline
OCR Sequence Baseline
SimulatedANeal committed May 7, 2018
2 parents ef4f47f + 967d6e3 commit ea4ae51
Showing 11 changed files with 196 additions and 32 deletions.
7 changes: 6 additions & 1 deletion CHANGES.md
@@ -2,5 +2,10 @@
 
 Notable changes between releases.
 
+## 0.1.2 (2018-05-07)
+* Added 3 Kana OCR tasks and baseline model
+* Vocabulary save `as_unicode` option
+* nn.conv.CNN now accessible via registry
+
 ## 0.1.1 (2018-05-03)
-* Initial commit of core functionality
+* Initial commit of core functionality
2 changes: 1 addition & 1 deletion carpedm/__init__.py
@@ -11,4 +11,4 @@
 from carpedm import nn
 from carpedm import util
 
-__version__ = '0.1.1'
+__version__ = '0.1.2'
15 changes: 11 additions & 4 deletions carpedm/data/lang.py
@@ -189,13 +189,20 @@ def __init__(self, reserved, vocab):
         self._vocab[self.UNK] = len(self._vocab)
         self._rev_vocab = {idx: key for key, idx in self._vocab.items()}
 
-    def save(self, out_dir):
-        vocab_sorted = [self._rev_vocab[idx]
-                        for idx in sorted(self._rev_vocab.keys())]
+    def save(self, out_dir, as_unicode=False):
+        types = self.types()
         with open(os.path.join(out_dir, 'vocab.txt'), 'w') as f:
-            for token in vocab_sorted:
+            for token in types:
+                if as_unicode:
+                    try:
+                        token = code2char(token)
+                    except ValueError:
+                        token = token
                 f.write(token + '\n')
 
+    def types(self):
+        return [self._rev_vocab[idx] for idx in sorted(self._rev_vocab.keys())]
+
     def char_to_id(self, char):
         """Returns the integer id of a character string."""
         if char in self._vocab:
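
A minimal usage sketch for the new `as_unicode` option, given a vocabulary instance `vocab`; the output paths and the code-point token format (e.g. "U+3042") are assumptions, only `save` and `types` appear in this diff:

    # Sketch: write the vocabulary once as raw character-code tokens
    # (e.g. "U+3042") and once decoded to unicode characters (e.g. "あ").
    vocab.save('/tmp/vocab_raw')
    vocab.save('/tmp/vocab_unicode', as_unicode=True)

    # types() lists tokens sorted by integer id, so the line number in
    # vocab.txt corresponds to the id used elsewhere in the pipeline.
    tokens = vocab.types()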
2 changes: 1 addition & 1 deletion carpedm/data/providers.py
@@ -165,7 +165,7 @@ def _parser(self, serialized, distort=False):
         if self.sparse_labels:
             tensor_dict['image/seq/char/id_sparse'] = tf.serialize_sparse(
                 ops.sparsify_label(tensor_dict['image/seq/char/id'],
-                                   tensor_dict['image/num_chars'])
+                                   tensor_dict['image/char/count'])
             )
 
         # if distort: image = distort_image(image)
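
`ops.sparsify_label` itself is not part of this diff; a rough sketch of what such an op plausibly does (signature taken from the call site, behavior assumed):

    import tensorflow as tf

    def sparsify_label(label, num_chars):
        # Assumed behavior: trim the padded dense id sequence to the true
        # character count and return the tf.SparseTensor format that
        # tf.nn.ctc_loss expects for labels.
        label = label[:num_chars]
        indices = tf.where(tf.ones_like(label, dtype=tf.bool))
        values = tf.gather_nd(label, indices)
        shape = tf.cast(tf.shape(label), tf.int64)
        return tf.SparseTensor(indices, values, shape)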
58 changes: 57 additions & 1 deletion carpedm/models/baseline.py
@@ -37,8 +37,9 @@ def name(self):
 
     def _forward_pass(self, features, data_format, axes_order,
                       is_training, reuse):
+        x = features['image/data']
         x = self._cnn.forward_pass(
-            features, data_format, axes_order, is_training, False, reuse)
+            x, data_format, axes_order, is_training, False, reuse)
         x = tf.layers.flatten(x)
         tf.logging.info('image after flatten: %s', x.get_shape())
 
@@ -51,3 +52,58 @@ def _forward_pass(self, features, data_format, axes_order,
         logits = tf.layers.dense(
             inputs=x, units=self._num_classes, name='logits')
         return logits
+
+
+@registry.register_model
+class SequenceBaseline(TFModel):
+    """A simple baseline CNN-LSTM model."""
+
+    def __init__(self, num_classes, lstm_layers=2, lstm_units=100,
+                 feature_extractor=nn.conv.CNN(), *args, **kwargs):
+        """Initializer.
+
+        Overrides TFModel.
+
+        Args:
+            num_classes (int): Number of possible character classes.
+            lstm_layers (int): Number of LSTM layers.
+            lstm_units (int): Number of units in each LSTM cell.
+            feature_extractor: Model that extracts visual features.
+            *args: Unused arguments.
+            **kwargs: Unused arguments.
+        """
+        self._num_classes = num_classes + 1  # Add CTC null label.
+        self._layers = lstm_layers
+        self._units = lstm_units
+        self._feature_extractor = feature_extractor
+
+    @property
+    def name(self):
+        return 'Baseline_seq_' + self._feature_extractor.name
+
+    def _forward_pass(self, features, data_format, axes_order,
+                      is_training, reuse):
+        x = self._feature_extractor.forward_pass(
+            features['image/data'], data_format, axes_order,
+            is_training, False, reuse)
+        if axes_order == [0, 3, 1, 2]:
+            x = tf.transpose(x, [0, 2, 3, 1])
+        x = tf.reshape(x, [-1, x.shape[1], x.shape[2] * x.shape[3]])
+        x = nn.rnn.bi_lstm(x, n_layers=self._layers, n_units=self._units)
+        seq_len = tf.tile(tf.expand_dims(tf.to_int32(tf.shape(x)[1]), 0),
+                          [tf.to_int32(tf.shape(x)[0])])
+        logits = tf.layers.dense(inputs=x, units=self._num_classes)
+
+        return {'logits': logits, 'seq_len': seq_len}
+
+    def initialize_pretrained(self, pretrained_dir):
+
+        submodel = 'Baseline_' + self._feature_extractor.name
+
+        variable_mapping = dict()
+
+        for i in range(5):
+            variable_mapping[submodel + '/conv{}/'.format(i)] \
+                = self.name + '/conv{}/'.format(i)
+
+        return variable_mapping
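
For intuition, a short sketch (sizes hypothetical) of the CNN-to-LSTM handoff in `_forward_pass`: width and channels are folded into the feature axis, and the height axis becomes the LSTM's time axis, presumably matching the vertical reading order of the kana sequences:

    import tensorflow as tf

    # Hypothetical CNN output in NHWC order: (batch, H', W', C).
    x = tf.zeros([8, 48, 16, 64])

    # Fold width and channels into features; height is the time axis.
    x = tf.reshape(x, [-1, x.shape[1], x.shape[2] * x.shape[3]])
    print(x.shape)  # (8, 48, 1024)

    # Every example in the batch shares the same sequence length.
    seq_len = tf.tile(tf.expand_dims(tf.to_int32(tf.shape(x)[1]), 0),
                      [tf.to_int32(tf.shape(x)[0])])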
7 changes: 3 additions & 4 deletions carpedm/nn/conv.py
@@ -11,9 +11,10 @@
 import tensorflow as tf
 
 from carpedm.models.generic import TFModel
+from carpedm.util import registry
 from carpedm.nn import util
 
 
+@registry.register_model
 class CNN(TFModel):
     """Modular convolutional neural network layer class."""
 
@@ -80,9 +81,7 @@ def name(self):
             p += 1
         return name[:-1]
 
-    def _forward_pass(self, features, data_format, axes_order,
-                      is_training, reuse):
-        x = features['image/data']
+    def _forward_pass(self, x, data_format, axes_order, is_training, reuse):
        x = tf.transpose(x, axes_order) if axes_order else x
        p = 0
        for c in range(len(self._kernels)):
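
With `CNN` now registered, it should be retrievable from the model registry by name; a hypothetical lookup sketch (the exact helper name is an assumption, not shown in this diff):

    from carpedm.util import registry

    # Hypothetical: registries typically map a snake_case name to the
    # registered class, so configs can reference models by string.
    model_cls = registry.model('cnn')
    feature_extractor = model_cls()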
7 changes: 4 additions & 3 deletions carpedm/tasks/generic.py
@@ -299,6 +299,7 @@ def sparse_labels(self):
             (bool): Use sparse labels.
         """
         return False
+
     # ====================== END TASK INTERFACE ====================== #
 
@@ -347,9 +348,9 @@ def input_fn(self, batch_size, subset, num_shards, overwrite=False):
             out_dir=self.task_data_dir, subset=subset, target_id=self.target,
             num_shards=self._num_shards, num_threads=self._num_threads,
             format_store=self._dataset_format, shape_store=self._shape_store,
-            shape_in=self._shape_in, chunk=self.chunk, character=self.character,
-            line=self.line, label=self.label, bbox=self.bbox,
-            overwrite=overwrite)
+            shape_in=self._shape_in, sparse_labels=self.sparse_labels,
+            chunk=self.chunk, character=self.character, line=self.line,
+            label=self.label, bbox=self.bbox, overwrite=overwrite)
 
         self._original_format = dataset.format
91 changes: 91 additions & 0 deletions carpedm/tasks/ocr.py
@@ -117,3 +117,94 @@ def regularization(self, hparams):
     @property
     def sparse_labels(self):
         return False
+
+
+@registry.register_task
+class OCRSeqKana3(OCRTask):
+
+    def __init__(self, beam_width=100, **kwargs):
+        self._beam_width = beam_width
+        super(OCRSeqKana3, self).__init__(**kwargs)
+
+    @property
+    def character_set(self):
+        return 'kana'
+
+    @property
+    def image_scope(self):
+        return 'seq'
+
+    @property
+    def sequence_length(self):
+        return 3
+
+    @property
+    def sparse_labels(self):
+        return True
+
+    @property
+    def target(self):
+        return 'image/seq/char/id_sparse'
+
+    def loss_fn(self, features, model_output, targets, is_training):
+        return tf.nn.ctc_loss(labels=targets,
+                              inputs=model_output['logits'],
+                              sequence_length=model_output['seq_len'],
+                              time_major=False)
+
+    def results(self, loss, tower_features, tower_preds, tower_targets,
+                is_training):
+
+        tf.summary.image("sample_input", tower_features[0]['image/data'])
+
+        all_logits = tf.concat([p['logits'] for p in tower_preds], axis=0)
+        seq_lens = tf.concat([p['seq_len'] for p in tower_preds], axis=0)
+
+        # TODO: fix when seqs are different lengths from multiple GPUs
+        all_labels = tf.sparse_concat(0, [p for p in tower_targets])
+        decoded, log_prob = tf.nn.ctc_beam_search_decoder(
+            inputs=tf.transpose(all_logits, [1, 0, 2]),
+            sequence_length=seq_lens,
+            beam_width=self._beam_width)
+        decoded = decoded[0]  # best path
+
+        edit_distance = tf.edit_distance(decoded, tf.to_int64(all_labels),
+                                         normalize=False)
+
+        Z = tf.cast(tf.size(all_labels), tf.float32)
+        ler = tf.reduce_sum(edit_distance) / Z
+        S = tf.cast(tf.size(edit_distance), tf.float32)
+        num_wrong_seqs = tf.cast(tf.count_nonzero(edit_distance), tf.float32)
+        ser = num_wrong_seqs / S
+
+        metrics = {
+            'ler': tf.metrics.mean(ler),
+            'ser': tf.metrics.mean(ser)
+        }
+
+        tensors_to_log = {'loss': loss, 'ler': ler, 'ser': ser}
+
+        mapping_string = tf.constant(self._meta.vocab.types())
+        table = tf.contrib.lookup.index_to_string_table_from_tensor(
+            mapping_string, default_value='NULL')
+        decoding = table.lookup(tf.to_int64(tf.sparse_tensor_to_dense(decoded)))
+        gt = table.lookup(tf.to_int64(tf.sparse_tensor_to_dense(all_labels)))
+
+        tf.summary.text('decoded', decoding)
+        tf.summary.text('gt', gt)
+
+        predictions = {
+            'classes': tf.argmax(input=all_logits, axis=1),
+            'probabilities': tf.nn.softmax(all_logits),
+            'decoded': decoding,
+        }
+
+        return tensors_to_log, predictions, metrics
+
+    def regularization(self, hparams):
+        model_params = tf.trainable_variables()
+        weight_loss = tf.multiply(
+            hparams.weight_decay,
+            tf.add_n([tf.nn.l2_loss(v) for v in model_params]),
+            name='weight_loss')
+        return weight_loss
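
A worked example (numbers hypothetical) of the `ler`/`ser` arithmetic in `results`: `Z` counts all ground-truth labels, `S` counts sequences:

    # Three 3-kana sequences -> Z = 9 labels, S = 3 sequences.
    edit_distance = [0, 1, 2]       # per-sequence unnormalized distances
    ler = sum(edit_distance) / 9    # 0.33: average edit errors per label
    ser = sum(d > 0 for d in edit_distance) / 3  # 0.67: wrong-sequence rate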
19 changes: 19 additions & 0 deletions docs/source/benchmarks.rst
@@ -0,0 +1,19 @@
+.. _benchmarks:
+
+Benchmarks
+==========
+
+Single Kana OCR
+---------------
+
+Running the example :ref:`main.py <main>` for the full PMJTC dataset (171,944 training examples, 131 character classes, as of 2 May 2018)
+
+* On a 2017 MacBook Pro:
+    * Generating the (train & dev) data: 1 hour, 20 minutes
+    * Training the model for 5 epochs: 2 hours, 27 minutes
+    * Dev Accuracy: 94.67%
+
+* On a Linux Machine using 1 Titan X (Pascal) GPU:
+    * Generating the (train & dev) data: 31 minutes
+    * Training the model for 5 epochs: 21 minutes
+    * Dev Accuracy: 95.23%
19 changes: 2 additions & 17 deletions docs/source/guides/usage.rst
@@ -150,27 +150,12 @@ At the end of 30 epochs, it achieved a development set accuracy of **65.27%**. N
 And considering the **70** character classes and **4.19%** majority class for this task and specific dataset, we are already doing much better than chance!
 
 Running this same code for the *full* currently available PMJTC dataset takes much longer but---as you would expect when
-adding more data---achieves a higher accuracy. Though certainly indicative of the benefit of more data,
-note that the accuracies presented below are not a fair comparison to the one above for two reasons:
+adding more data---achieves a higher accuracy (see :ref:`benchmarks`). Though certainly indicative of the benefit of more data,
+note that the accuracies presented in the benchmarks are not a fair comparison to the one above for two reasons:
 
 1. There are more kana character classes in the full dataset: **131**
 2. The development sets on which accuracies are reported are different.
 
-Benchmarks
-~~~~~~~~~~
-
-Running the example :ref:`main.py <main>` for the full PMJTC dataset (171,944 training examples, as of 2 May 2018)
-
-* On a 2017 MacBook Pro:
-    * Generating the (train & dev) data: 1 hour, 20 minutes
-    * Training the model for 5 epochs: 2 hours, 27 minutes
-    * Dev Accuracy: 94.67%
-
-* On a Linux Machine using 1 Titan X (Pascal) GPU:
-    * Generating the (train & dev) data: 31 minutes
-    * Training the model for 5 epochs: 21 minutes
-    * Dev Accuracy: 95.23%
-
 Conclusion
 ~~~~~~~~~~
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -20,6 +20,7 @@ Documentation
    :maxdepth: 1
    :caption: Other
 
+   benchmarks
    contribution
    conduct
    license
