Romance Multi way recipe

OpenNMT · Jan 17, 2017 · 480b4c9 · 480b4c9
1 parent 9679010
commit 480b4c9
Show file tree

Hide file tree

Showing 2 changed files with 246 additions and 0 deletions.
diff --git a/romance-multi-way/local/parse_options.sh b/romance-multi-way/local/parse_options.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
+#                 Arnab Ghoshal, Karel Vesely
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the 
+# $help_message variable (if defined).
+
+
+###
+### The --config file options have lower priority to command line 
+### options, so we need to import them first...
+###
+
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
+    . $config  # source the config file.
+  fi
+done
+
+
+###
+### No we process the command line options
+###
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help 
+    # message and exit.  Scripts should put help messages in $help_message
+  --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+	  else printf "$help_message\n" 1>&2 ; fi; 
+	  exit 0 ;; 
+  --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+       exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar), 
+    # then work out the variable name as $name, which will equal "foo_bar".
+  --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; 
+    # Next we test whether the variable in question is undefned-- if so it's 
+    # an invalid option and we die.  Note: $0 evaluates to the name of the 
+    # enclosing script.
+    # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+    # is undefined.  We then have to wrap this test inside "eval" because 
+    # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+
+      oldval="`eval echo \\$$name`";
+    # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then 
+	was_bool=true;
+      else 
+	was_bool=false;
+      fi
+
+    # Set the variable to the right value-- the escaped quotes make it work if
+    # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\"; 
+
+    # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      shift 2;
+      ;;
+  *) break;
+  esac
+done
+
+
+# Check for an empty argument to the --cmd option, which can easily occur as a 
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+
+
+true; # so this script returns exit code 0.
diff --git a/romance-multi-way/run.sh b/romance-multi-way/run.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+#
+# Copyright 2017 Ubiqus   (Author: Vincent Nguyen)
+#		 Systran  (Author: Jean Senellart)
+# License MIT
+#
+# This recipe shows how to build an openNMT translation model for Romance Multi way languages
+# based on 200 000 parallel sentences for each pair
+#
+# Based on the tuto from the OpenNMT forum
+
+
+# TODO test is GPU is present or not
+CUDA_VISIBLE_DEVICES=0
+decode_cpu=false
+
+# Make symlinks to access OpenNMT scripts - change this line if needed
+OPENNMT_PATH=../../OpenNMT
+[ ! -h tools ] && ln -s $OPENNMT_PATH/tools tools
+[ ! -h preprocess.lua ] && ln -s $OPENNMT_PATH/preprocess.lua preprocess.lua
+[ ! -h train.lua ] && ln -s $OPENNMT_PATH/train.lua train.lua
+[ ! -h translate.lua ] && ln -s $OPENNMT_PATH/translate.lua translate.lua
+[ ! -h onmt ] && ln -s $OPENNMT_PATH/onmt onmt
+
+# this is usefull to skip some stages during step by step execution
+stage=0
+
+# if you want to run without training and use an existing model in the "exp" folder set notrain to true
+notrain=false
+
+# At the moment only "stage" option is available anyway
+. local/parse_options.sh
+
+# Data download and preparation
+
+if [ $stage -le 0 ]; then
+# TODO put this part in a local/download_data.sh script ?
+  mkdir -p data
+  cd data
+  if [ ! -f multi-esfritptro-parallel.tgz ]; then
+    echo "$0: downloading the baseline corpus from amazon s3"
+    wget https://s3.amazonaws.com/opennmt-trainingdata/multi-esfritptro-parallel.tgz
+    tar xzfv multi-esfritptro-parallel.tgz
+  fi
+  cd ../local
+  if [ ! -f mteval-v13a.pl ]; then
+    wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/mteval-v13a.pl
+  fi
+  if [ ! -f input-from-sgm.perl ]; then
+    wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/ems/support/input-from-sgm.perl
+  fi
+  if [ ! -f wrap-xml.perl ]; then
+    wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/ems/support/wrap-xml.perl
+  fi
+  if [ ! -f multi-bleu.perl ]; then
+    wget https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl
+  fi
+  if [ ! -f learn_bpe.py ]; then
+    wget https://raw.githubusercontent.com/rsennrich/subword-nmt/master/learn_bpe.py
+  fi
+  cd ..
+fi
+
+# Tokenize and prepare the Corpus
+if [ $stage -le 1 ]; then
+  echo "$0: tokenizing corpus"
+  for f in data/train*.?? ; do th tools/tokenize.lua < $f > $f.rawtok ; done
+  cat data/train*.rawtok | python local/learn_bpe.py -s 32000 > data/esfritptro.bpe32000
+  for f in data/*-????.?? ; do \
+    th tools/tokenize.lua -case_feature -joiner_annotate -nparrallel 4 -bpe_model data/esfritptro.bpe32000 < $f > $f.tok
+  done
+  for set in train valid test ; do rm data/$set-multi.???.tok ; done
+  for src in es fr it pt ro ; do
+    for tgt in es fr it pt ro ; do
+      [ ! $src = $tgt ] && perl -i.bak -pe "s//__opt_tgt_$tgt\xEF\xBF\xA8N /" data/*-$src$tgt.$src.tok
+      for set in train valid test ; do
+        [ ! $src = $tgt ] && cat data/$set-$src$tgt.$src.tok >> data/$set-multi.src.tok
+        [ ! $src = $tgt ] && cat data/$set-$src$tgt.$tgt.tok >> data/$set-multi.tgt.tok
+      done
+    done
+  done
+  paste data/valid-multi.src.tok data/valid-multi.tgt.tok | shuf > data/valid-multi.srctgt.tok
+  head -2000 data/valid-multi.srctgt.tok | cut -f1 > data/valid-multi2000.src.tok
+  head -2000 data/valid-multi.srctgt.tok | cut -f2 > data/valid-multi2000.tgt.tok
+fi
+
+# Preprocess the data - decide here the vocabulary size 50000 default value
+if [ $stage -le 2 ]; then
+  mkdir -p exp
+  echo "$0: preprocessing corpus"
+  th preprocess.lua -src_vocab_size 50000 -tgt_vocab_size 50000 \
+  -train_src data/train-multi.src.tok -train_tgt data/train-multi.tgt.tok \
+  -valid_src data/valid-multi2000.src.tok -valid_tgt data/valid-multi2000.tgt.tok \
+  -save_data exp/model-multi
+fi
+
+# Train the model !!!! even if OS cuda device ID is 0 you need -gpuid=1
+# Decide here the number of epochs, learning rate, which epoch to start decay, decay rate
+# if you change number of epochs do not forget to change the model name too
+# This example has a smaller topology compared to tuto for faster training (worse results)
+if [ $stage -le 3 ]; then
+  if [ $notrain = false ]; then
+    echo "$0: training starting, will take a while."
+    th train.lua -layers 2 -rnn_size 500 -brnn -word_vec_size 600 \
+    -end_epoch 13 -learning_rate 1 -start_decay_at 5 -learning_rate_decay 0.65 \
+    -data  exp/model-multi-train.t7 -save_model exp/model-multi-2-500-600 -gpuid 1
+    cp -f exp/model-multi-2-500-600"_epoch13_"*".t7" exp/model-multi-2-500-600"_final.t7"
+  else
+    echo "$0: using an existing model"
+    if [ ! -f exp/model-multi-2-500-600"_final.t7" ]; then
+      echo "$0: mode file does not exist"
+      exit 1
+    fi
+  fi
+fi
+
+# Deploy model for CPU usage
+if [ $stage -le 4 ]; then
+  if [ $decode_cpu = true ]; then
+    th tools/release_model.lua -force -model exp/model-multi-2-500-600"_final.t7" \
+    -output_model exp/model-multi-2-500-600"_cpu.t7" -gpuid 1
+  fi
+fi
+
+# Translate using gpu
+# you can change this by changing the model name from _final to _cpu and remove -gpuid 1
+if [ $stage -le 5 ]; then
+  [ $decode_cpu = true ] && dec_opt="" || dec_opt="-gpuid 1"
+  for src in es fr it pt ro ; do
+    for tgt in es fr it pt ro ; do
+      [ ! $src = $tgt ] && th translate.lua -replace_unk -model exp/model-multi-2-500-600"_final"*".t7" \
+       -src data/test-$src$tgt.$src.tok -output exp/test-$src$tgt.hyp.$tgt.tok $dec_opt
+    done
+  done
+fi
+
+# Evaluate the generic test set with multi-bleu
+if [ $stage -le 6 ]; then
+  for src in es fr it pt ro ; do
+    for tgt in es fr it pt ro ; do
+      [ ! $src = $tgt ] && th tools/detokenize.lua -case_feature < exp/test-$src$tgt.hyp.$tgt.tok \
+      > exp/test-$src$tgt.hyp.$tgt.detok
+      [ ! $src = $tgt ] && local/multi-bleu.perl data/test-$src$tgt.$tgt \
+      < exp/test-$src$tgt.hyp.$tgt.detok > exp/test-$src$tgt"_multibleu".txt
+    done
+  done
+fi
+
+