This repository has been archived by the owner on May 24, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
246 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#!/bin/bash | ||
|
||
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey); | ||
# Arnab Ghoshal, Karel Vesely | ||
|
||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED | ||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, | ||
# MERCHANTABLITY OR NON-INFRINGEMENT. | ||
# See the Apache 2 License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
# Parse command-line options. | ||
# To be sourced by another script (as in ". parse_options.sh"). | ||
# Option format is: --option-name arg | ||
# and shell variable "option_name" gets set to value "arg." | ||
# The exception is --help, which takes no arguments, but prints the | ||
# $help_message variable (if defined). | ||
|
||
|
||
### | ||
### The --config file options have lower priority to command line | ||
### options, so we need to import them first... | ||
### | ||
|
||
# Now import all the configs specified by command-line, in left-to-right order.
# The scan stops before the last argument because "--config" must be followed
# by the name of the file to source.
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    # Quote the path so that config files whose name contains spaces work.
    if [ ! -r "$config" ]; then
      echo "$0: missing config '$config'" 1>&2
      exit 1
    fi
    . "$config"  # source the config file.
  fi
done
|
||
|
||
###
### Now we process the command line options
###
while true; do
  [ -z "${1:-}" ] && break  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message
    --help|-h)
      if [ -z "${help_message:-}" ]; then
        echo "No help found." 1>&2
      else
        printf "$help_message\n" 1>&2
      fi
      exit 0 ;;
    --*=*)
      echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*)
      name=${1#--}
      name=${name//-/_}
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The name is first checked to be a plain identifier, so that it can be
      # used safely for indirect expansion; ${!name+xxx} then expands to "xxx"
      # only if the variable called $name is defined.
      if [[ ! "$name" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || [ -z "${!name+xxx}" ]; then
        echo "$0: invalid option $1" 1>&2
        exit 1
      fi

      # Every option takes a value.  Without this check "shift 2" below would
      # fail without shifting anything and the loop would never terminate.
      if [ $# -lt 2 ]; then
        echo "$0: option $1 requires an argument" 1>&2
        exit 1
      fi

      oldval=${!name}
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true
      else
        was_bool=false
      fi

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1
      fi

      # Set the variable to the given value.  printf -v stores the value
      # verbatim, so spaces, quotes and "$" in the argument (as in
      # --cmd "queue.pl -sync y") are kept as-is and never re-evaluated.
      printf -v "$name" '%s' "$2"
      shift 2
      ;;
    *) break ;;
  esac
done
|
||
|
||
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.  Only complain when "cmd" is defined but empty;
# an undefined "cmd" is fine.
if [ -n "${cmd+xxx}" ] && [ -z "$cmd" ]; then
  echo "$0: empty argument to --cmd option" 1>&2
  exit 1
fi

true  # so this script returns exit code 0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
#!/bin/bash | ||
# | ||
# Copyright 2017 Ubiqus (Author: Vincent Nguyen) | ||
# Systran (Author: Jean Senellart) | ||
# License MIT | ||
# | ||
# This recipe shows how to build an openNMT translation model for Romance Multi way languages | ||
# based on 200 000 parallel sentences for each pair | ||
# | ||
# Based on the tuto from the OpenNMT forum | ||
|
||
|
||
# TODO: test if a GPU is present or not
# Exported so that the commands launched by this script inherit the setting;
# a plain assignment would only be visible inside this shell.
export CUDA_VISIBLE_DEVICES=0
# When set to true, the model is released for CPU use and translation is run
# without the -gpuid option.
decode_cpu=false
|
||
# Make symlinks to access OpenNMT scripts - change this line if needed
OPENNMT_PATH=../../OpenNMT
# Create each link only if it is not already there; paths are quoted so that
# an OPENNMT_PATH containing spaces still works.
for item in tools preprocess.lua train.lua translate.lua onmt; do
  [ -h "$item" ] || ln -s "$OPENNMT_PATH/$item" "$item"
done
|
||
# This is useful to skip some stages during step by step execution
stage=0

# If you want to run without training and use an existing model in the "exp"
# folder, set notrain to true
notrain=false

# Parse the command line.  Options have the form "--name value" and are only
# accepted for variables that are already defined at this point
# (e.g. --stage 2, --notrain true).
. local/parse_options.sh
|
||
# Data download and preparation

if [ "$stage" -le 0 ]; then
  # TODO put this part in a local/download_data.sh script ?
  # Files are written to data/ and local/ with explicit paths instead of
  # changing directory, so a failure cannot leave later commands running in
  # the wrong place.
  mkdir -p data local || exit 1
  corpus=multi-esfritptro-parallel.tgz
  if [ ! -f "data/$corpus" ]; then
    echo "$0: downloading the baseline corpus from amazon s3"
    # Remove a partial download on failure, otherwise the next run would
    # find the file and skip both the download and the extraction.
    wget -P data "https://s3.amazonaws.com/opennmt-trainingdata/$corpus" \
      || { rm -f "data/$corpus"; exit 1; }
    tar -xzvf "data/$corpus" -C data || exit 1
  fi
  # Helper scripts: each one is fetched only if it is not already in local/.
  for url in \
    https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/mteval-v13a.pl \
    https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/ems/support/input-from-sgm.perl \
    https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/ems/support/wrap-xml.perl \
    https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl \
    https://raw.githubusercontent.com/rsennrich/subword-nmt/master/learn_bpe.py
  do
    if [ ! -f "local/${url##*/}" ]; then
      wget -P local "$url" || { rm -f "local/${url##*/}"; exit 1; }
    fi
  done
fi
|
||
# Tokenize and prepare the Corpus
if [ "$stage" -le 1 ]; then
  echo "$0: tokenizing corpus"
  # First pass: plain tokenization of the training files; its output is only
  # used to learn the BPE model.
  for f in data/train*.??; do
    th tools/tokenize.lua < "$f" > "$f.rawtok"
  done
  cat data/train*.rawtok | python local/learn_bpe.py -s 32000 > data/esfritptro.bpe32000
  # Second pass: tokenize every language-pair file with the BPE model.
  # NOTE(review): "-nparrallel" is kept exactly as found; confirm the option
  # name against tools/tokenize.lua (it may be meant to be "-nparallel").
  for f in data/*-????.??; do
    th tools/tokenize.lua -case_feature -joiner_annotate -nparrallel 4 \
      -bpe_model data/esfritptro.bpe32000 < "$f" > "$f.tok"
  done
  # The multi-way files are built by appending below, so remove leftovers
  # from a previous run first (-f: nothing to remove on the first run).
  for set in train valid test; do
    rm -f "data/$set-multi."???.tok
  done
  for src in es fr it pt ro; do
    for tgt in es fr it pt ro; do
      [ "$src" = "$tgt" ] && continue
      # Insert the target-language token (__opt_tgt_<lang>) into the source
      # side of this language pair.
      perl -i.bak -pe "s//__opt_tgt_$tgt\xEF\xBF\xA8N /" data/*-"$src$tgt.$src".tok
      for set in train valid test; do
        cat "data/$set-$src$tgt.$src.tok" >> "data/$set-multi.src.tok"
        cat "data/$set-$src$tgt.$tgt.tok" >> "data/$set-multi.tgt.tok"
      done
    done
  done
  # Shuffle source and target together, then keep 2000 pairs for validation.
  paste data/valid-multi.src.tok data/valid-multi.tgt.tok | shuf > data/valid-multi.srctgt.tok
  head -2000 data/valid-multi.srctgt.tok | cut -f1 > data/valid-multi2000.src.tok
  head -2000 data/valid-multi.srctgt.tok | cut -f2 > data/valid-multi2000.tgt.tok
fi
|
||
# Preprocess the data - decide here the vocabulary size 50000 default value
if [ "$stage" -le 2 ]; then
  mkdir -p exp
  echo "$0: preprocessing corpus"
  th preprocess.lua -src_vocab_size 50000 -tgt_vocab_size 50000 \
    -train_src data/train-multi.src.tok -train_tgt data/train-multi.tgt.tok \
    -valid_src data/valid-multi2000.src.tok -valid_tgt data/valid-multi2000.tgt.tok \
    -save_data exp/model-multi
fi
|
||
# Train the model !!!! even if OS cuda device ID is 0 you need -gpuid=1
# Decide here the number of epochs, learning rate, which epoch to start decay, decay rate
# if you change number of epochs do not forget to change the model name too
# This example has a smaller topology compared to tuto for faster training (worse results)
if [ "$stage" -le 3 ]; then
  if [ "$notrain" = false ]; then
    echo "$0: training starting, will take a while."
    # Stop here if training fails, rather than carrying on to the later
    # stages without a model.
    th train.lua -layers 2 -rnn_size 500 -brnn -word_vec_size 600 \
      -end_epoch 13 -learning_rate 1 -start_decay_at 5 -learning_rate_decay 0.65 \
      -data exp/model-multi-train.t7 -save_model exp/model-multi-2-500-600 -gpuid 1 \
      || { echo "$0: training failed" 1>&2; exit 1; }
    # Keep the model of the last epoch under a fixed name for the next stages.
    cp -f exp/model-multi-2-500-600_epoch13_*.t7 exp/model-multi-2-500-600_final.t7 \
      || exit 1
  else
    echo "$0: using an existing model"
    if [ ! -f exp/model-multi-2-500-600_final.t7 ]; then
      echo "$0: model file does not exist" 1>&2
      exit 1
    fi
  fi
fi
|
||
# Deploy model for CPU usage (only done when decode_cpu is true)
if [ "$stage" -le 4 ] && [ "$decode_cpu" = true ]; then
  th tools/release_model.lua -force -model exp/model-multi-2-500-600_final.t7 \
    -output_model exp/model-multi-2-500-600_cpu.t7 -gpuid 1
fi
|
||
# Translate the test sets for every language pair.
# By default the _final model is used on the GPU (-gpuid 1); when decode_cpu
# is true, the _cpu model released in the previous stage is used instead and
# -gpuid is left out.
if [ "$stage" -le 5 ]; then
  if [ "$decode_cpu" = true ]; then
    model=exp/model-multi-2-500-600_cpu.t7
    dec_opt=""
  else
    model=exp/model-multi-2-500-600_final.t7
    dec_opt="-gpuid 1"
  fi
  for src in es fr it pt ro; do
    for tgt in es fr it pt ro; do
      [ "$src" = "$tgt" ] && continue
      # $dec_opt is deliberately unquoted: it holds zero or two words.
      th translate.lua -replace_unk -model "$model" \
        -src "data/test-$src$tgt.$src.tok" \
        -output "exp/test-$src$tgt.hyp.$tgt.tok" $dec_opt
    done
  done
fi
|
||
# Evaluate the generic test set with multi-bleu
if [ "$stage" -le 6 ]; then
  for src in es fr it pt ro; do
    for tgt in es fr it pt ro; do
      [ "$src" = "$tgt" ] && continue
      th tools/detokenize.lua -case_feature \
        < "exp/test-$src$tgt.hyp.$tgt.tok" \
        > "exp/test-$src$tgt.hyp.$tgt.detok"
      # Run the scorer through perl: the script is fetched with wget, which
      # does not set the execute bit, so calling it directly would fail.
      perl local/multi-bleu.perl "data/test-$src$tgt.$tgt" \
        < "exp/test-$src$tgt.hyp.$tgt.detok" \
        > "exp/test-$src${tgt}_multibleu.txt"
    done
  done
fi
|
||
|