In [None]:
# Sanity check: list the NVIDIA GPU(s) and driver visible to this runtime.
!nvidia-smi

Clone the repository (only needed when running on Google Colab)

In [None]:
%%bash
# Clone into `tuanh_thesis` explicitly: a plain `git clone` of MultiModalST.git creates a
# directory called `MultiModalST`, so the `cd tuanh_thesis` that used to follow would fail.
# The clone is skipped when a checkout already exists, so the cell can be re-run.
if [ ! -d tuanh_thesis/.git ]; then
    git clone https://github.com/TuAnh23/MultiModalST.git tuanh_thesis
fi
# Bring an existing checkout up to date (a no-op right after a fresh clone).
# `git -C <dir>` runs the command inside <dir> without changing this shell's directory.
git -C tuanh_thesis pull origin master

Change working directory to the root of the repo

In [None]:
import os

# Name of the checkout directory that holds the project scripts.
REPO_DIR = 'tuanh_thesis'

# os.chdir with a relative path is not idempotent: running this cell a second time
# (when the kernel is already inside the repo) would raise FileNotFoundError.
# Only change directory when we are not there yet.
if os.path.basename(os.getcwd()) != REPO_DIR:
    os.chdir(REPO_DIR)

# Repository usage instruction

## 1. Environment settings

Use a Conda environment with `python=3.7`

In [None]:
# try to run the bare minimum to get a new conda env working
conda_path = ''
try:
    conda_path = !which conda
finally:
    print('')

if (len(conda_path) == 0):
    print('installing miniconda')
    !wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh && bash Miniconda3-py37_4.9.2-Linux-x86_64.sh -bfp /usr/local
    !source /usr/local/etc/profile.d/conda.sh
    !conda init 
    !conda install -n root _license -y -q
else:
    print('found miniconda')

conda_envs = !conda env list
res = [i for i in conda_envs if 'BachelorThesisST' in i]
if (len(res) == 0):
    print('not found BachelorThesisST env', len(res))
    !conda create -y -q --name BachelorThesisST python=3.7 conda=4.9.2 
else:
    print('found BachelorThesisST env', len(res))

Install the necessary packages

In [None]:
%%bash
source activate BachelorThesisST
# Install the project dependencies into the BachelorThesisST environment.
# The packages are installed one by one, in the original order.
conda install -y numpy
conda install -y pandas
conda install -y -c conda-forge sentencepiece
conda install -y pytorch torchvision torchaudio cudatoolkit=10.1 -c pytorch
conda install -y -c anaconda hdf5
conda install -y -c conda-forge nvidia-apex
conda install -y -c conda-forge librosa
conda install -y -c powerai sacrebleu
conda install -y h5py
pip install kaldiio
pip install vizseq
# python_speech_features2 is installed from source.
# Clone only when the checkout is missing so that re-running the cell does not error out.
if [ ! -d python_speech_features2 ]; then
    git clone https://github.com/thomasZen/python_speech_features2
fi
# Subshell + `&&`: setup.py only runs if the `cd` succeeded (never in the wrong directory),
# and the working directory of the rest of the cell is left untouched.
(cd python_speech_features2 && python setup.py install)
conda install -y ipykernel

## 2. Download and prepare [Covost 2](https://github.com/facebookresearch/covost)

Create data folders.

The folder names `full`, `one_half`, `one_fourth`, ... denote the portion of the training data that is used

In [None]:
%%bash
# `-p` creates all missing parent directories (data, data/CoVoST2, ...) in one call and
# does not fail when they already exist, so the cell can be re-run safely.
mkdir -p data/CoVoST2/preprocessed/one_seventy

Let's download and prepare a small data pair as an example: NL speech --> EN text

In [None]:
%%bash
source activate BachelorThesisST
# `python` is started without a script argument, so the interpreter reads the remaining
# lines of this cell from standard input and runs them as Python code. Everything below
# the `python` line is therefore Python, not bash; do not reorder these lines.
python 
# ----------- Activate the conda env in every cell, necessary if using Google Colab. Remove the above lines if not needed -----------

# Project helpers imported from the repository module `covost_data_preparation`.
from covost_data_preparation import download, prepare_X_to_en_data

SRC_LANG = 'nl'
# NOTE(review): TGT_lang is not referenced anywhere else in this cell.
TGT_lang = 'en'

print('Downloading Covost data')

# CommonVoice URL of the audio archive, keyed by source language
urls = {SRC_LANG: f'https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-4-2019-12-10/{SRC_LANG}.tar.gz'}
# Specify language pair (X -> EN direction only; the EN -> X list passed below is empty)
XX_EN_LANGUAGES = [SRC_LANG]

download(urls, en_xx_languages=[], xx_en_languages=XX_EN_LANGUAGES)

print('Preparing Covost data')
prepare_X_to_en_data([SRC_LANG], training_portion=0.0143) # Use 1.43% of the training data only (roughly 1/70; presumably matches the `one_seventy` folder -- TODO confirm)


At this point:
- .mp3 audio is saved in `nl`
- raw transcriptions and translations are saved in `covost2`
- prepared data is saved in `preprocessed`

## 3. Preprocess data and train models

- Preprocess data: `preprocess.py`
- Train models: `train.py`
- Evaluate: `translate.py` and `translation_evaluation.py`

We will train and evaluate a plain zero-shot (ZS) model as an example

Preprocess ASR and MT data.

The preprocessed data is stored at `data/CoVoST2/preprocessed/one_seventy/nl-en/mix_nl_text_de`

In [None]:
%%bash
source activate BachelorThesisST
# ----------- Activate the conda env in every cell, necessary if using Google Colab. Remove the above line if not needed -----------
# Preprocess the ASR (nl audio -> nl text) and MT (nl text -> en text) data sets.
# All outputs are written to ${DATA_DIR}/${SUB_DIR}.
DATA_DIR=data/CoVoST2/preprocessed/one_seventy/nl-en
SUB_DIR=mix_nl_text_de
echo "Preprocessing ${SUB_DIR} data"
# -p: do not fail when the output directory already exists (the cell can be re-run).
mkdir -p "${DATA_DIR}/${SUB_DIR}"
# Create a vocabulary for all text sources and targets
python vocab_generator.py -filenames "$DATA_DIR/nl_text_train.txt|${DATA_DIR}/en_text_train.txt" \
    -out_file $DATA_DIR/${SUB_DIR}/src_tgt_vocab
# Use the above vocabs while preprocessing
# Preprocess ASR data
python preprocess.py -train_src $DATA_DIR/nl_audio_train.scp  \
    -train_tgt $DATA_DIR/nl_text_train.txt  \
    -valid_src $DATA_DIR/nl_audio_val.scp  \
    -valid_tgt $DATA_DIR/nl_text_val.txt  \
    -train_src_lang nl \
    -train_tgt_lang nl \
    -valid_src_lang nl \
    -valid_tgt_lang nl \
    -all_langs "nl|en" \
    -src_seq_length 1024  \
    -tgt_seq_length 512  \
    -concat 4 \
    -asr \
    -src_type audio \
    -asr_format scp \
    -save_data $DATA_DIR/${SUB_DIR}/asr_data \
    -format scp \
    -tgt_vocab $DATA_DIR/${SUB_DIR}/src_tgt_vocab
# Preprocess MT data (source and target share the vocabulary generated above)
python preprocess.py -train_src $DATA_DIR/nl_text_train.txt  \
    -train_tgt $DATA_DIR/en_text_train.txt  \
    -valid_src $DATA_DIR/nl_text_val.txt  \
    -valid_tgt $DATA_DIR/en_text_val.txt  \
    -train_src_lang nl \
    -train_tgt_lang en \
    -valid_src_lang nl \
    -valid_tgt_lang en \
    -all_langs "nl|en" \
    -src_seq_length 512  \
    -tgt_seq_length 512  \
    -concat 1 \
    -src_type text \
    -save_data $DATA_DIR/${SUB_DIR}/mt_data \
    -format mmem \
    -src_vocab $DATA_DIR/${SUB_DIR}/src_tgt_vocab \
    -tgt_vocab $DATA_DIR/${SUB_DIR}/src_tgt_vocab

Train a dummy model.

Model checkpoints and experiment results are stored at `models/mix_nl_text_de_dummy` and `experiments/mix_nl_text_de_dummy` respectively (the directory names are derived from `SUB_DIR=mix_nl_text_de` in the cell below)

In [None]:
%%bash
source activate BachelorThesisST
# ----------- Activate the conda env in every cell, necessary if using Google Colab. Remove the above line if not needed -----------
# Train a small model on the ASR data, with the MT data passed as additional data.
DATA_DIR=data/CoVoST2/preprocessed/one_seventy/nl-en
SUB_DIR=mix_nl_text_de
MODEL_DIR=models/${SUB_DIR}_dummy
EXPERIMENT_DIR=experiments/${SUB_DIR}_dummy
# -p also creates the parent directories (`models`, `experiments`) when they are missing
# and does not fail on re-runs. With a plain `mkdir`, a missing parent would make the
# directory creation fail, and with it -save_model and the `tee` log below.
mkdir -p "${MODEL_DIR}"
mkdir -p "${EXPERIMENT_DIR}"
# $cont_checkpoint_str is not assigned in this cell; unquoted and unset, it expands to
# nothing, so it adds no argument unless it is provided through the environment.
# The training output is shown live and appended to train.log via `tee -a`.
python -u train.py -data ${DATA_DIR}/${SUB_DIR}/asr_data \
  $cont_checkpoint_str \
  -data_format scp \
  -additional_data "${DATA_DIR}/${SUB_DIR}/mt_data" \
  -additional_data_format mmem \
  -data_ratio -1 \
  -use_language_embedding \
  -language_embedding_type concat \
  -save_model ${MODEL_DIR}/model \
  -model transformer \
  -batch_size_words 2048 \
  -batch_size_update 24568 \
  -batch_size_sents 9999 \
  -batch_size_multiplier 8 \
  -encoder_type mix \
  -checkpointing 0 \
  -input_size $((80*4)) \
  -concat 4 \
  -layers 12 \
  -audio_encoder_layers 32 \
  -text_encoder_layers 12 \
  -share_encoders_parameter all_text_enc \
  -death_rate 0.0 \
  -model_size 512 \
  -inner_size $((512*4)) \
  -n_heads 8 \
  -dropout 0.2 \
  -attn_dropout 0.2 \
  -word_dropout 0.1 \
  -emb_dropout 0.2 \
  -label_smoothing 0.1 \
  -epochs 5 \
  -optim adam \
  -learning_rate 0.001 \
  -normalize_gradient \
  -warmup_steps 8000 \
  -tie_weights \
  -join_embedding \
  -seed 8877 \
  -log_interval 1000 \
  -update_frequency -1 \
  -gpus 0 | tee -a ${EXPERIMENT_DIR}/train.log
# Build a condensed log:
#   1) copy train.log up to the first "Validation perplexity" line (that line is blanked
#      by the empty substitution, then sed quits);
#   2) append every "Train perplexity" / "Validation perplexity" line of the full log.
sed '/.*Validation perplexity.*/{s///;q;}' ${EXPERIMENT_DIR}/train.log > ${EXPERIMENT_DIR}/shortened_train.log
grep -e "Train perplexity" -e "Validation perplexity" ${EXPERIMENT_DIR}/train.log >> ${EXPERIMENT_DIR}/shortened_train.log

Perform translation on the test set

In [None]:
%%bash
source activate BachelorThesisST
# ----------- The conda env is activated in every cell (necessary on Google Colab); remove the line above if not needed -----------
SUB_DIR=mix_nl_text_de
DATA_DIR=data/CoVoST2/preprocessed/one_seventy/nl-en
MODEL_DIR="models/${SUB_DIR}_dummy"
EXPERIMENT_DIR="experiments/${SUB_DIR}_dummy"

# The checkpoint file name is whatever finding_best_model.py prints for MODEL_DIR.
CHOSEN_MODEL_NAME="$(python finding_best_model.py -model_dir "${MODEL_DIR}")"

echo "Evaluating ST"
# Translate the nl test audio into en text; the output goes to EXPERIMENT_DIR.
python translate.py \
  -model ${MODEL_DIR}/${CHOSEN_MODEL_NAME} \
  -src "${DATA_DIR}/nl_audio_test.scp" \
  -src_lang nl \
  -tgt_lang en \
  -concat 4 \
  -asr_format scp \
  -encoder_type audio \
  -tgt "${DATA_DIR}/en_text_test.txt" \
  -output "${EXPERIMENT_DIR}/encoded_translations_st.txt" \
  -batch_size 5 \
  -max_sent_length 1024 \
  -gpu 0


Evaluate the translation

In [None]:
%%bash
source activate BachelorThesisST
# ----------- The conda env is activated in every cell (necessary on Google Colab); remove the line above if not needed -----------
SUB_DIR=mix_nl_text_de
DATA_DIR=data/CoVoST2/preprocessed/one_seventy/nl-en
EXPERIMENT_DIR="experiments/${SUB_DIR}_dummy"

# Hand the encoded ST output, the en text model and the raw en test reference to
# translation_evaluation.py; results are saved under EXPERIMENT_DIR.
python translation_evaluation.py \
  -save_data "${EXPERIMENT_DIR}" \
  -encoded_output_text "${EXPERIMENT_DIR}/encoded_translations_st.txt" \
  -text_encoder_decoder "${DATA_DIR}/en_text.model" \
  -reference_text "${DATA_DIR}/en_raw_text_test.txt" \
  -task translation \
  -specific_task st

The following shell scripts run the preprocess-train-eval pipeline:
- `run_translation_pipeline.sh` for single-task models
- `cascaded_ST_evaluation.sh` evaluates cascaded ST using pretrained ASR and MT models
- `run_translation_multi_modalities_pipeline.sh` for multi-task, multi-modality models (including zero-shot)
- `run_zeroshot_with_artificial_data.sh` for zero-shot models using data augmentation
- `run_bidirectional_zeroshot.sh` for zero-shot models using additional opposite training data
- `run_fine_tunning.sh`, `run_fine_tunning_fromASR.sh` for fine-tuning pre-trained models
- `modality_similarity_svcca.sh`, `modality_similarity_classifier.sh` measure text-audio similarity in representation

See the shell script comments to modify the variables as desired.

In [None]:
# Block this cell forever (interrupt the kernel to stop it).
# NOTE(review): presumably this keeps the Colab session busy so that it is not reclaimed
# as idle -- confirm the intent. Sleeping inside the loop keeps the cell running just the
# same, but without pinning a CPU core at 100% the way a bare `while True: pass` does.
import time

while True:
    time.sleep(60)