
Input vectors #168

Merged: 21 commits, Apr 18, 2017
Changes from all commits
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,8 @@

### New features
* Display sentence length distribution in preprocess
* Support vectors as inputs using [Kaldi](http://kaldi-asr.org/) input format
* Support parallel file alignment by index in addition to line-by-line

### Fixes and improvements

49 changes: 48 additions & 1 deletion docs/data/preparation.md
@@ -1,12 +1,59 @@
The data preparation (or preprocessing) passes over the data to generate word vocabularies and sequences of indices used by the training.

## Data type

By default, the data type is `bitext`, which consists of aligned source and target files. Alignment is done at the line level by default, but can also be done by index (see [Index files](#index-files)).
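
For example, a line-aligned `bitext` corpus is a pair of files in which line *N* of the source corresponds to line *N* of the target (the contents below are purely illustrative):

Source file:

```text
Hello world !
How are you ?
```

Target file:

```text
Bonjour le monde !
Comment allez-vous ?
```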

For training language models, the data type is `monotext`, which is a single file in one language.

Finally, you can also use the `feattext` data type (see [Input vectors](#input-vectors)), which encodes sequences of vectors (e.g. sequences of features generated by a device).

!!! note "Note"
Input vectors can only be used for the source.


## Delimiters

Training data (for the `bitext` and `monotext` data types) are expected to be in the following format (see the example below):

* sentences are newline-separated
* tokens are space-separated
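
For instance, a `monotext` training file with two tokenized sentences (punctuation is split into separate tokens) would look like:

```text
The cat sat on the mat .
Where is the cat ?
```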

## Index files

Index files align different files by index rather than by line. For instance, the following two files are aligned by index:

```text
line1 First line
line2 Second line
```

```text
line2 Deuxième ligne
line1 Première ligne
```

where the first token of each line is an index that must have a matching entry (at any position) in the aligned files.

The `-idx_files` option enables this feature (in both `preprocess.lua` and `translate.lua`).
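
For example, a `preprocess.lua` run on index-aligned files might look like the following sketch (only `-idx_files` is specific to this feature; the other flags are the usual preprocessing options, and the file paths are illustrative):

```text
th preprocess.lua -train_src data/train-src.txt -train_tgt data/train-tgt.txt \
                  -valid_src data/valid-src.txt -valid_tgt data/valid-tgt.txt \
                  -idx_files -save_data data/demo
```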

## Input vectors

OpenNMT supports the use of vector sequences instead of word sequences on the source side.

The data type is `feattext` and uses the [Kaldi](http://kaldi-asr.org) text format (`.ark` files). For instance, the following entry, indexed by `KEY`, represents a sequence of `m` vectors of `n` values:

```text
KEY [
FEAT1.1 FEAT1.2 FEAT1.3 ... FEAT1.n
...
FEATm.1 FEATm.2 FEATm.3 ... FEATm.n ]
```

!!! warning "Warning"
Note that you need to use index files for representing input vectors.
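
For example, a source `feattext` file with one entry and its index-aligned target file could look like this (the key `utt1`, the 3-dimensional features and the target sentence are purely illustrative):

```text
utt1 [
0.12 0.34 0.56
0.23 0.45 0.67
0.31 0.52 0.73 ]
```

```text
utt1 a first reference sentence
```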

## Vocabularies

The main goal of the preprocessing is to build the word vocabularies and assign each word to an index within these dictionaries.
8 changes: 8 additions & 0 deletions onmt/Factory.lua
@@ -70,6 +70,14 @@ local function resolveEmbSizes(opt, dicts, wordSizes)
end

local function buildInputNetwork(opt, dicts, wordSizes, pretrainedWords, fixWords, verbose)

if not dicts then
-- input vectors: skip the word embedding layer (vectors are passed through unchanged)
local inputNetwork = nn.Identity()
inputNetwork.inputSize = opt.dimInputSize
return inputNetwork
end

local wordEmbSize, featEmbSizes = resolveEmbSizes(opt, dicts, wordSizes)

local wordEmbedding = onmt.WordEmbedding.new(dicts.words:size(), -- vocab size
26 changes: 17 additions & 9 deletions onmt/Seq2Seq.lua
@@ -95,6 +95,11 @@ function Seq2Seq:__init(args, dicts, verbose)
onmt.utils.Table.merge(self.args, onmt.utils.ExtendedCmdLine.getModuleOpts(args, options))
self.args.uneven_batches = args.uneven_batches

if not dicts.src then
-- the input is already a sequence of vectors; pass its dimension to the encoder factory
args.dimInputSize = dicts.srcInputSize
end

self.models.encoder = onmt.Factory.buildWordEncoder(args, dicts.src, verbose)
self.models.decoder = onmt.Factory.buildWordDecoder(args, dicts.tgt, verbose)
self.criterion = onmt.ParallelClassNLLCriterion(onmt.Factory.getOutputSizes(dicts.tgt))
@@ -146,24 +151,27 @@ function Seq2Seq:getOutput(batch)
end

function Seq2Seq:maskPadding(batch)
self.models.encoder:maskPadding()
if batch and batch.uneven then
self.models.decoder:maskPadding(self.models.encoder:contextSize(batch.sourceSize, batch.sourceLength))
else
self.models.decoder:maskPadding()
end
end

function Seq2Seq:forwardComputeLoss(batch)
if self.args.uneven_batches then
self:maskPadding(batch)
end

local encoderStates, context = self.models.encoder:forward(batch)
return self.models.decoder:computeLoss(batch, encoderStates, context, self.criterion)
end

function Seq2Seq:trainNetwork(batch, dryRun)
if self.args.uneven_batches then
self:maskPadding(batch)
end

local encStates, context = self.models.encoder:forward(batch)

20 changes: 15 additions & 5 deletions onmt/data/Batch.lua
@@ -55,7 +55,7 @@ local Batch = torch.class('Batch')

Parameters:

* `src` - 2D table of source batch indices or prebuilt source batch vectors
* `srcFeatures` - 2D table of source batch features (opt)
* `tgt` - 2D table of target batch indices
* `tgtFeatures` - 2D table of target batch features (opt)
@@ -74,14 +74,24 @@ function Batch:__init(src, srcFeatures, tgt, tgtFeatures)

self.sourceLength, self.sourceSize, self.uneven = getLength(src)

-- if input vectors (speech for instance)
self.inputVectors = src[1]:dim() > 1

Review comment: This fails with a default constructor like `local batch = Batch()`.

Collaborator: Good catch! It has been fixed.


local sourceSeq = torch.LongTensor(self.sourceLength, self.size):fill(onmt.Constants.PAD)

if not self.inputVectors then
self.sourceInput = sourceSeq:clone()
self.sourceInputRev = sourceSeq:clone()
-- will be used to return extra padded value
self.padTensor = torch.LongTensor(self.size):fill(onmt.Constants.PAD)
else
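-- dense input vectors: sourceLength x batch size x vector dimension; padding uses zero vectors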
self.sourceInput = torch.Tensor(self.sourceLength, self.size, src[1]:size(2))
self.sourceInputRev = torch.Tensor(self.sourceLength, self.size, src[1]:size(2))
self.padTensor = torch.Tensor(self.size, src[1]:size(2)):zero()
end

self.sourceInputFeatures = {}
self.sourceInputRevFeatures = {}

if #srcFeatures > 0 then
for _ = 1, #srcFeatures[1] do
4 changes: 2 additions & 2 deletions onmt/data/Dataset.lua
@@ -6,11 +6,11 @@ local Dataset = torch.class("Dataset")
--]]
function Dataset:__init(srcData, tgtData)

self.src = srcData.words or srcData.vectors
self.srcFeatures = srcData.features

if tgtData ~= nil then
self.tgt = tgtData.words or tgtData.vectors
self.tgtFeatures = tgtData.features
end
end