#!/bin/bash
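# Pipeline sketch: preprocess CoVoST2 data, train a multi-modality (audio + text
# source) translation model, then evaluate it on the ST test set (and optionally
# on ASR and MT).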
# Setting environment
# Change the path below to point to your own conda setup script
source /c/Users/TuAhnDinh/Anaconda3/etc/profile.d/conda.sh
conda activate BachelorThesisST
# ------------------------- Manual variable setting -------------------------
CONT_FROM_CHECKPOINT="no" # yes or no
SRC_LANG=en
TGT_LANG=de
SUB_DATA_NAME=one_fourth
# EXPERIMENT_NAME contains the task names (i.e., asr, mt, st) and feature types (DEPI, SE, JoinEmbedding, AuxLoss)
EXPERIMENT_NAME=${SUB_DATA_NAME}_asr_mt_SE_JoinEmbedding_AuxLoss
# ------------------------- End of manual variable setting -------------------------
FINAL_MODEL="best" # if "best", evaluate the best model; if "latest", evaluate the latest model
EVALUATE_ADDITIONAL_TASKS="yes" # whether to also evaluate the additional tasks on the test set
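# SRC_MODALITY=mix trains a single model on both audio-source (ASR) and
# text-source (MT) data; targets are always text here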
SRC_MODALITY=mix
TGT_MODALITY=text
TGT_EXTENSION=txt
if [ "${SRC_LANG}" = "en" ]; then
DATA_DIR=data/CoVoST2/preprocessed/${SUB_DATA_NAME}/en-X
else
DATA_DIR=data/CoVoST2/preprocessed/${SUB_DATA_NAME}/${SRC_LANG}-${TGT_LANG}
fi
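# CONCAT consecutive audio feature frames are stacked into one input vector
# (input_size below is 80 * CONCAT, assuming 80-dimensional audio features)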
CONCAT=4
SUB_DIR=${SRC_MODALITY}_${SRC_LANG}_${TGT_MODALITY}_${TGT_LANG}
# Preprocess data
if [ -d ${DATA_DIR}/${SUB_DIR} ]; then
echo "${SUB_DIR} already preprocessed"
else
echo "Preprocessing ${SUB_DIR} data"
mkdir ${DATA_DIR}/${SUB_DIR}
# Create a vocabulary for all text sources and targets
python vocab_generator.py -filenames "$DATA_DIR/${SRC_LANG}_text_train.txt|${DATA_DIR}/${TGT_LANG}_text_train.txt" \
-out_file $DATA_DIR/${SUB_DIR}/src_tgt_vocab
# Use the above vocabs while preprocessing
# Preprocess ASR data
python preprocess.py -train_src $DATA_DIR/${SRC_LANG}_audio_train.scp \
-train_tgt $DATA_DIR/${SRC_LANG}_text_train.txt \
-valid_src $DATA_DIR/${SRC_LANG}_audio_val.scp \
-valid_tgt $DATA_DIR/${SRC_LANG}_text_val.txt \
-train_src_lang ${SRC_LANG} \
-train_tgt_lang ${SRC_LANG} \
-valid_src_lang ${SRC_LANG} \
-valid_tgt_lang ${SRC_LANG} \
-all_langs "${SRC_LANG}|${TGT_LANG}" \
-src_seq_length 1024 \
-tgt_seq_length 512 \
-concat $CONCAT \
-asr \
-src_type audio \
-asr_format scp \
-save_data $DATA_DIR/${SUB_DIR}/asr_data \
-format scp \
-tgt_vocab $DATA_DIR/${SUB_DIR}/src_tgt_vocab
# # Preprocess ST data
# python preprocess.py -train_src $DATA_DIR/${SRC_LANG}_audio_train.scp \
# -train_tgt $DATA_DIR/${TGT_LANG}_text_train.txt \
# -valid_src $DATA_DIR/${SRC_LANG}_audio_val.scp \
# -valid_tgt $DATA_DIR/${TGT_LANG}_text_val.txt \
# -train_src_lang ${SRC_LANG} \
# -train_tgt_lang ${TGT_LANG} \
# -valid_src_lang ${SRC_LANG} \
# -valid_tgt_lang ${TGT_LANG} \
# -all_langs "${SRC_LANG}|${TGT_LANG}" \
# -src_seq_length 1024 \
# -tgt_seq_length 512 \
# -concat $CONCAT \
# -asr \
# -src_type audio \
# -asr_format scp \
# -save_data $DATA_DIR/${SUB_DIR}/st_data \
# -format scp \
# -tgt_vocab $DATA_DIR/${SUB_DIR}/src_tgt_vocab
# Preprocess MT data
python preprocess.py -train_src $DATA_DIR/${SRC_LANG}_text_train.txt \
-train_tgt $DATA_DIR/${TGT_LANG}_text_train.txt \
-valid_src $DATA_DIR/${SRC_LANG}_text_val.txt \
-valid_tgt $DATA_DIR/${TGT_LANG}_text_val.txt \
-train_src_lang ${SRC_LANG} \
-train_tgt_lang ${TGT_LANG} \
-valid_src_lang ${SRC_LANG} \
-valid_tgt_lang ${TGT_LANG} \
-all_langs "${SRC_LANG}|${TGT_LANG}" \
-src_seq_length 512 \
-tgt_seq_length 512 \
-concat 1 \
-src_type text \
-save_data $DATA_DIR/${SUB_DIR}/mt_data \
-format mmem \
-src_vocab $DATA_DIR/${SUB_DIR}/src_tgt_vocab \
-tgt_vocab $DATA_DIR/${SUB_DIR}/src_tgt_vocab
fi
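# Note: the ASR and MT data share the same src_tgt_vocab; a shared vocabulary is
# presumably a prerequisite for -join_embedding and -tie_weights in the training
# command below.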
# Whether to continue from a checkpoint
MODEL_DIR=models/${SUB_DIR}_${EXPERIMENT_NAME}
EXPERIMENT_DIR=experiments/${SUB_DIR}_${EXPERIMENT_NAME}
TOTAL_EPOCHS=64
if [ "$CONT_FROM_CHECKPOINT" = "yes" ]; then
# Find latest model to continue from
LATEST_CHECKPOINTED=${MODEL_DIR}/$(python finding_latest_model.py -model_dir $MODEL_DIR)
# Set the number of remaining epochs to be run
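# Extract the epoch number from the checkpoint filename (assuming names that end
# in e<epoch>.00.pt, e.g. ..._e32.00.pt -> CURRENT_EPOCH=32)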
CURRENT_EPOCH=$(echo $LATEST_CHECKPOINTED | sed -nr 's/.*e(.*).00.pt.*/\1/p')
N_EPOCHS=$(($TOTAL_EPOCHS-$CURRENT_EPOCH))
cont_checkpoint_str="-load_from ${LATEST_CHECKPOINTED}"
else
# Delete old models and log files if any and create new ones
if [ -d ${MODEL_DIR} ]; then
rm -r ${MODEL_DIR}
fi
mkdir ${MODEL_DIR}
if [ -d ${EXPERIMENT_DIR} ]; then
rm -r ${EXPERIMENT_DIR}
fi
mkdir ${EXPERIMENT_DIR}
# No checkpointed model to train from
LATEST_CHECKPOINTED=""
# Set the number of epochs to be run
N_EPOCHS=$TOTAL_EPOCHS
fi
# Train model
echo "Training model..."
# Define some argument values
# NOTE: the main data should have audio, not text, on the source side: with the same
# number of sentences, audio needs more batches, and we want all data to be covered
DATA=${DATA_DIR}/${SUB_DIR}/asr_data
DATA_FORMAT=scp
ADDITIONAL_DATA="${DATA_DIR}/${SUB_DIR}/mt_data"
ADDITIONAL_DATA_FORMAT="mmem"
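# DATA_RATIO=-1 presumably keeps the natural size ratio between the main and
# additional datasets rather than forcing a fixed sampling ratio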
DATA_RATIO="-1"
input_size=$((80*$CONCAT))
LAYER=12
TRANSFORMER=transformer
OPTIM=Adam
LR=0.001
size=512
innersize=$((size*4))
AUDIO_ENC_LAYERS=32
TEXT_ENC_LAYERS=$LAYER
optim_str="-optim adam"
BATCH_SIZE_WORDS=2048
BATCH_SIZE_SENT=9999
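# DEATH_RATE is presumably the stochastic-depth layer drop rate; 0.0 disables it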
DEATH_RATE=0.0
SHARE_ENCODERS="all_text_enc"
# Setting for share encoders (SE)
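# (all_text_enc presumably shares the text encoder parameters across all text-input tasks)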
if [[ "$EXPERIMENT_NAME" = *"SE"* ]]; then
share_encoder_str="-share_encoders_parameter ${SHARE_ENCODERS}"
fi
# Setting for disentangling positional info (DEPI)
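# (assumption from the flag names: the residual connection is changed at the middle
# text-encoder layer, presumably so positional information stops propagating upward)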
if [[ "$EXPERIMENT_NAME" = *"DEPI"* ]]; then
text_enc_depi_layer_str="-text_enc_change_residual_at $((TEXT_ENC_LAYERS/2))"
text_enc_depi_type_str="-text_enc_change_residual 2"
fi
# Setting for JoinEmbedding
if [[ "$EXPERIMENT_NAME" = *"JoinEmbedding"* ]]; then
join_embedding_str="-join_embedding"
fi
# Setting for AuxLoss
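# (assumption from the flag names: an auxiliary similarity loss, type 11 with weight 5,
# applied from the first epoch, presumably aligning audio and text encoder outputs)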
if [[ "$EXPERIMENT_NAME" = *"AuxLoss"* ]]; then
aux_loss_start_from_str="-aux_loss_start_from 1"
sim_loss_type_str="-sim_loss_type 11"
aux_loss_weight_str="-aux_loss_weight 5"
fi
# Run training process
python -u train.py -data $DATA \
$cont_checkpoint_str \
-data_format $DATA_FORMAT \
-additional_data $ADDITIONAL_DATA \
-additional_data_format $ADDITIONAL_DATA_FORMAT \
-data_ratio $DATA_RATIO \
-use_language_embedding \
-language_embedding_type concat \
$text_enc_depi_layer_str \
$text_enc_depi_type_str \
$aux_loss_start_from_str \
$sim_loss_type_str \
$aux_loss_weight_str \
-save_model ${MODEL_DIR}/model \
-model $TRANSFORMER \
-batch_size_words $BATCH_SIZE_WORDS \
-batch_size_update 24568 \
-batch_size_sents $BATCH_SIZE_SENT \
-batch_size_multiplier 8 \
-encoder_type $SRC_MODALITY \
-checkpointing 0 \
-input_size $input_size \
-concat $CONCAT \
-layers $LAYER \
-audio_encoder_layers $AUDIO_ENC_LAYERS \
-text_encoder_layers $TEXT_ENC_LAYERS \
$share_encoder_str \
-death_rate $DEATH_RATE \
-model_size $size \
-inner_size $innersize \
-n_heads 8 \
-dropout 0.2 \
-attn_dropout 0.2 \
-word_dropout 0.1 \
-emb_dropout 0.2 \
-label_smoothing 0.1 \
-epochs $N_EPOCHS \
$optim_str \
-learning_rate $LR \
-normalize_gradient \
-warmup_steps 8000 \
-tie_weights \
$join_embedding_str \
-seed 8877 \
-log_interval 1000 \
-update_frequency -1 \
-gpus 0 | tee -a ${EXPERIMENT_DIR}/train.log
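# Build a shortened log: everything before the first "Validation perplexity" line,
# followed by all per-epoch Train/Validation perplexity lines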
sed '/.*Validation perplexity.*/{s///;q;}' ${EXPERIMENT_DIR}/train.log > ${EXPERIMENT_DIR}/shortened_train.log
grep -e "Train perplexity" -e "Validation perplexity" ${EXPERIMENT_DIR}/train.log >> ${EXPERIMENT_DIR}/shortened_train.log
if [ "${FINAL_MODEL}" = "best" ]; then
# Run best model on test set
CHOSEN_MODEL_NAME=$(python finding_best_model.py -model_dir ${MODEL_DIR})
else
# Run latest model on test set
CHOSEN_MODEL_NAME=$(python finding_latest_model.py -model_dir ${MODEL_DIR})
fi
echo "Running ${FINAL_MODEL} model: ${CHOSEN_MODEL_NAME} on test set..." | tee ${EXPERIMENT_DIR}/note.txt
# Here we set -encoder_type audio since we are only interested in the Speech Translation task
echo "Evaluating ST"
python translate.py -model ${MODEL_DIR}/$CHOSEN_MODEL_NAME \
-src $DATA_DIR/${SRC_LANG}_audio_test.scp \
-src_lang $SRC_LANG \
-tgt_lang $TGT_LANG \
-concat $CONCAT \
-asr_format scp \
-encoder_type audio \
-tgt $DATA_DIR/${TGT_LANG}_${TGT_MODALITY}_test.${TGT_EXTENSION} \
-output ${EXPERIMENT_DIR}/encoded_translations_st.txt \
-batch_size 5 \
-max_sent_length 1024 \
-gpu 0
# Evaluate the model's translations
TASK=translation
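# -text_encoder_decoder is presumably a trained subword model (e.g. SentencePiece) used
# to decode the encoded hypotheses back to raw text before scoring against the reference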
python translation_evaluation.py -save_data ${EXPERIMENT_DIR} \
-encoded_output_text ${EXPERIMENT_DIR}/encoded_translations_st.txt \
-text_encoder_decoder $DATA_DIR/${TGT_LANG}_${TGT_MODALITY}.model \
-reference_text $DATA_DIR/${TGT_LANG}_raw_${TGT_MODALITY}_test.txt \
-task $TASK \
-specific_task st
# Evaluate ASR
if [[ "$EXPERIMENT_NAME" = *"asr"* ]] && [[ "$EVALUATE_ADDITIONAL_TASKS" = "yes" ]]; then
echo "Evaluating ASR"
python translate.py -model ${MODEL_DIR}/$CHOSEN_MODEL_NAME \
-src $DATA_DIR/${SRC_LANG}_audio_test.scp \
-src_lang $SRC_LANG \
-tgt_lang $SRC_LANG \
-concat $CONCAT \
-asr_format scp \
-encoder_type audio \
-tgt $DATA_DIR/${SRC_LANG}_text_test.txt \
-output ${EXPERIMENT_DIR}/encoded_translations_asr.txt \
-batch_size 5 \
-max_sent_length 1024 \
-gpu 0
# Evaluate the model's translations
TASK=asr
python translation_evaluation.py -save_data ${EXPERIMENT_DIR} \
-encoded_output_text ${EXPERIMENT_DIR}/encoded_translations_asr.txt \
-text_encoder_decoder $DATA_DIR/${SRC_LANG}_text.model \
-reference_text $DATA_DIR/${SRC_LANG}_raw_text_test.txt \
-task $TASK \
-specific_task asr
fi
# Evaluate MT
if [[ "$EXPERIMENT_NAME" = *"mt"* ]] && [[ "$EVALUATE_ADDITIONAL_TASKS" = "yes" ]]; then
echo "Evaluating MT"
python translate.py -model ${MODEL_DIR}/$CHOSEN_MODEL_NAME \
-src $DATA_DIR/${SRC_LANG}_text_test.txt \
-src_lang $SRC_LANG \
-tgt_lang $TGT_LANG \
-concat $CONCAT \
-encoder_type text \
-tgt $DATA_DIR/${TGT_LANG}_text_test.txt \
-output ${EXPERIMENT_DIR}/encoded_translations_mt.txt \
-batch_size 5 \
-max_sent_length 1024 \
-gpu 0
# Evaluate the model's translations
TASK=translation
python translation_evaluation.py -save_data ${EXPERIMENT_DIR} \
-encoded_output_text ${EXPERIMENT_DIR}/encoded_translations_mt.txt \
-text_encoder_decoder $DATA_DIR/${TGT_LANG}_text.model \
-reference_text $DATA_DIR/${TGT_LANG}_raw_text_test.txt \
-task $TASK \
-specific_task mt
fi