## Installs / Imports

In [None]:
!pip install fairseq -q
!pip install pandas torchaudio sentencepiece -q
!pip install datasets -q

In [None]:
# Mount Google Drive; every path used later in this notebook lives under
# /content/drive/MyDrive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

## Preprocessing

### Guarani

In [None]:
# Work from fairseq's speech_to_text examples directory: the Guarani prep cells
# invoke prep_librispeech_data.py by bare filename, so it is presumably located here.
%cd /content/drive/MyDrive/hw2_mnlp/fairseq/examples/speech_to_text

In [None]:
vocab_type = 'bpe'     # ["bpe", "unigram", "char"]
vocab_size = 2000


!python prep_librispeech_data.py \
  --data-dir /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn \
  --output-root /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed_{vocab_type}_{vocab_size}/ \
  --vocab-type {vocab_type} \
  --vocab-size {vocab_size}

In [None]:
vocab_type = 'char'     # ["bpe", "unigram", "char"]
vocab_size = 2000


!python prep_librispeech_data.py \
  --data-dir /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn \
  --output-root /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed_{vocab_type}_{vocab_size}/ \
  --vocab-type {vocab_type} \
  --vocab-size {vocab_size}

### Quechua

In [None]:
# Switch to the Quechua data directory: the next cell runs
# preprocess_quechua_data.py by bare filename, so it is presumably located here.
%cd /content/drive/MyDrive/hw2_mnlp/que_spa_clean/

In [None]:
vocab_type = 'bpe'     # ["bpe", "unigram", "char"]
vocab_size = 2000

!python preprocess_quechua_data.py \
  --data-dir /content/drive/MyDrive/hw2_mnlp/que_spa_clean \
  --output-root /content/drive/MyDrive/hw2_mnlp/que_spa_clean_processed_{vocab_type}_{vocab_size}/ \
  --vocab-type {vocab_type} \
  --vocab-size {vocab_size}

## Training

### Guarani

#### Unigram Vocab, 2000 Vocab Size

In [1]:
vocab_type = 'unigram'     # ["bpe", "unigram", "char"]
vocab_size = 2000

%cd /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed

!fairseq-train . --save-dir save \
  --config-yaml config.yaml --train-subset train --valid-subset validation \
  --num-workers 4 --max-tokens 40000 --max-update 300000 \
  --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \
  --arch s2t_transformer_s --share-decoder-input-output-embed \
  --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
  --clip-norm 10.0 --seed 1 --update-freq 8 --max-epoch 30 --log-format tqdm --fp16

LS_ROOT = f'/content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed'
SAVE_DIR = f'{LS_ROOT}/save'
CHECKPOINT_FILENAME = 'checkpoint_best.pt'
SUBSETS = ['test','validation']

# Loop over subsets
for SUBSET in SUBSETS:
    !fairseq-generate {LS_ROOT} --config-yaml config.yaml --gen-subset {SUBSET} \
    --task speech_to_text --path {SAVE_DIR}/{CHECKPOINT_FILENAME} \
    --max-tokens 50000 --beam 10 --scoring wer

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                                                                           [A2023-11-02 02:28:04 | INFO | validation | epoch 022 | valid on 'validation' subset | loss 9.78 | nll_loss 9.546 | total 806.8 | n_correct 23.8 | ppl 747.36 | accuracy 2.95 | wps 11676.3 | wpb 806.8 | bsz 71.8 | num_updates 86 | best_loss 9.78
2023-11-02 02:28:04 | INFO | fairseq.checkpoint_utils | Preparing to save checkpoint for epoch 22 @ 86 updates
2023-11-02 02:28:04 | INFO | fairseq.trainer | Saving checkpoint to /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed/save/checkpoint22.pt
2023-11-02 02:28:11 | INFO | fairseq.trainer | Finished saving checkpoint to /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed/save/checkpoint22.pt
2023-11-02 02:28:30 | INFO | fairseq.checkpoint_utils | Saved checkpoint save/checkpoint22.pt (epoch 22 @ 86 updates, score 9.78) (writing took 25.732196023999677 seconds)
20

#### Char Vocab, 2000 Vocab Size

In [2]:
vocab_type = 'char'     # ["bpe", "unigram", "char"]
vocab_size = 2000

%cd /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed_{vocab_type}_{vocab_size}

!fairseq-train . --save-dir save \
  --config-yaml config.yaml --train-subset train --valid-subset validation \
  --num-workers 4 --max-tokens 40000 --max-update 300000 \
  --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \
  --arch s2t_transformer_s --share-decoder-input-output-embed \
  --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
  --clip-norm 10.0 --seed 1 --update-freq 8 --max-epoch 30 --log-format tqdm --fp16

LS_ROOT = f'/content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed_{vocab_type}_{vocab_size}'
SAVE_DIR = f'{LS_ROOT}/save'
CHECKPOINT_FILENAME = 'checkpoint_best.pt'
SUBSETS = ['test','validation']

# Loop over subsets
for SUBSET in SUBSETS:
    !fairseq-generate {LS_ROOT} --config-yaml config.yaml --gen-subset {SUBSET} \
    --task speech_to_text --path {SAVE_DIR}/{CHECKPOINT_FILENAME} \
    --max-tokens 50000 --beam 10 --scoring wer

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

epoch 022 | valid on 'validation' subset:   0% 0/5 [00:00<?, ?it/s][A
epoch 022 | valid on 'validation' subset:  20% 1/5 [00:00<00:03,  1.21it/s][A
epoch 022 | valid on 'validation' subset:  60% 3/5 [00:01<00:00,  3.50it/s][A
epoch 022 | valid on 'validation' subset: 100% 5/5 [00:01<00:00,  5.83it/s][A
                                                                           [A2023-11-02 02:43:55 | INFO | validation | epoch 022 | valid on 'validation' subset | loss 5.405 | nll_loss 5.243 | total 2406.4 | n_correct 16.6 | ppl 37.86 | accuracy 0.69 | wps 29491.1 | wpb 2406.4 | bsz 71.8 | num_updates 83 | best_loss 5.405
2023-11-02 02:43:55 | INFO | fairseq.checkpoint_utils | Preparing to save checkpoint for epoch 22 @ 83 updates
2023-11-02 02:43:55 | INFO | fairseq.trainer | Saving checkpoint to /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed_char_2000/save/checkpoint22.pt
2023-11-02 02:44:06 |

#### BPE Vocab, 2000 Vocab Size

In [3]:
vocab_type = 'bpe'     # ["bpe", "unigram", "char"]
vocab_size = 2000

%cd /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed_{vocab_type}_{vocab_size}

!fairseq-train . --save-dir save \
  --config-yaml config.yaml --train-subset train --valid-subset validation \
  --num-workers 4 --max-tokens 40000 --max-update 300000 \
  --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \
  --arch s2t_transformer_s --share-decoder-input-output-embed \
  --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
  --clip-norm 10.0 --seed 1 --update-freq 8 --max-epoch 30 --log-format tqdm --fp16

LS_ROOT = f'/content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed_{vocab_type}_{vocab_size}'
SAVE_DIR = f'{LS_ROOT}/save'
CHECKPOINT_FILENAME = 'checkpoint_best.pt'
SUBSETS = ['test','validation']

# Loop over subsets
for SUBSET in SUBSETS:
    !fairseq-generate {LS_ROOT} --config-yaml config.yaml --gen-subset {SUBSET} \
    --task speech_to_text --path {SAVE_DIR}/{CHECKPOINT_FILENAME} \
    --max-tokens 50000 --beam 10 --scoring wer

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
epoch 022 | valid on 'validation' subset:   0% 0/5 [00:00<?, ?it/s][A
epoch 022 | valid on 'validation' subset:  20% 1/5 [00:00<00:03,  1.30it/s][A
epoch 022 | valid on 'validation' subset:  80% 4/5 [00:00<00:00,  5.45it/s][A
                                                                           [A2023-11-02 03:03:04 | INFO | validation | epoch 022 | valid on 'validation' subset | loss 9.747 | nll_loss 9.505 | total 770.8 | n_correct 23.4 | ppl 726.71 | accuracy 3.036 | wps 18069.4 | wpb 770.8 | bsz 71.8 | num_updates 86 | best_loss 9.747
2023-11-02 03:03:04 | INFO | fairseq.checkpoint_utils | Preparing to save checkpoint for epoch 22 @ 86 updates
2023-11-02 03:03:04 | INFO | fairseq.trainer | Saving checkpoint to /content/drive/MyDrive/hw2_mnlp/cv-corpus-15.0-2023-09-08/gn/processed_bpe_2000/save/checkpoint22.pt
2023-11-02 03:03:06 | INFO | fairseq.trainer | Finished saving checkpoint to /content/drive/MyDrive/hw

### Quechua

#### Unigram Vocab, 1636 Vocab Size

In [5]:
vocab_type = 'unigram'     # ["bpe", "unigram", "char"]
vocab_size = 1636

%cd /content/drive/MyDrive/hw2_mnlp/que_spa_clean_processed_{vocab_type}_{vocab_size}

!fairseq-train . --save-dir save \
  --config-yaml config.yaml --train-subset train --valid-subset validation \
  --num-workers 4 --max-tokens 40000 --max-update 300000 \
  --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \
  --arch s2t_transformer_s --share-decoder-input-output-embed \
  --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
  --clip-norm 10.0 --seed 1 --update-freq 8 --max-epoch 30 --log-format tqdm --fp16

LS_ROOT = f'/content/drive/MyDrive/hw2_mnlp/que_spa_clean_processed_{vocab_type}_{vocab_size}'
SAVE_DIR = f'{LS_ROOT}/save'
CHECKPOINT_FILENAME = 'checkpoint_best.pt'
SUBSETS = ['validation']

# Loop over subsets
for SUBSET in SUBSETS:
    !fairseq-generate {LS_ROOT} --config-yaml config.yaml --gen-subset {SUBSET} \
    --task speech_to_text --path {SAVE_DIR}/{CHECKPOINT_FILENAME} \
    --max-tokens 50000 --beam 10 --scoring wer

/content/drive/MyDrive/hw2_mnlp/que_spa_clean_processed_unigram_1636
2023-11-02 03:11:30.987818: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-02 03:11:30.987870: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-02 03:11:30.987906: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-02 03:11:30.995839: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the a

#### Char Vocab, 2000 Vocab Size

In [6]:
vocab_type = 'char'     # ["bpe", "unigram", "char"]
vocab_size = 2000

%cd /content/drive/MyDrive/hw2_mnlp/que_spa_clean_processed_{vocab_type}_{vocab_size}

!fairseq-train . --save-dir save \
  --config-yaml config.yaml --train-subset train --valid-subset validation \
  --num-workers 4 --max-tokens 40000 --max-update 300000 \
  --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \
  --arch s2t_transformer_s --share-decoder-input-output-embed \
  --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
  --clip-norm 10.0 --seed 1 --update-freq 8 --max-epoch 30 --log-format tqdm --fp16

LS_ROOT = f'/content/drive/MyDrive/hw2_mnlp/que_spa_clean_processed_{vocab_type}_{vocab_size}'
SAVE_DIR = f'{LS_ROOT}/save'
CHECKPOINT_FILENAME = 'checkpoint_best.pt'
SUBSETS = ['validation']

# Loop over subsets
for SUBSET in SUBSETS:
    !fairseq-generate {LS_ROOT} --config-yaml config.yaml --gen-subset {SUBSET} \
    --task speech_to_text --path {SAVE_DIR}/{CHECKPOINT_FILENAME} \
    --max-tokens 50000 --beam 10 --scoring wer

/content/drive/MyDrive/hw2_mnlp/que_spa_clean_processed_char_2000
2023-11-02 03:22:07.517448: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-02 03:22:07.517512: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-02 03:22:07.517549: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-02 03:22:07.525456: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appr

#### BPE Vocab, 2000 Vocab Size

In [7]:
vocab_type = 'bpe'     # ["bpe", "unigram", "char"]
vocab_size = 2000

%cd /content/drive/MyDrive/hw2_mnlp/que_spa_clean_processed_{vocab_type}_{vocab_size}

!fairseq-train . --save-dir save \
  --config-yaml config.yaml --train-subset train --valid-subset validation \
  --num-workers 4 --max-tokens 40000 --max-update 300000 \
  --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \
  --arch s2t_transformer_s --share-decoder-input-output-embed \
  --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \
  --clip-norm 10.0 --seed 1 --update-freq 8 --max-epoch 30 --log-format tqdm --fp16

LS_ROOT = f'/content/drive/MyDrive/hw2_mnlp/que_spa_clean_processed_{vocab_type}_{vocab_size}'
SAVE_DIR = f'{LS_ROOT}/save'
CHECKPOINT_FILENAME = 'checkpoint_best.pt'
SUBSETS = ['validation']

# Loop over subsets
for SUBSET in SUBSETS:
    !fairseq-generate {LS_ROOT} --config-yaml config.yaml --gen-subset {SUBSET} \
    --task speech_to_text --path {SAVE_DIR}/{CHECKPOINT_FILENAME} \
    --max-tokens 50000 --beam 10 --scoring wer

/content/drive/MyDrive/hw2_mnlp/que_spa_clean_processed_bpe_2000
2023-11-02 03:33:45.479625: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-02 03:33:45.479683: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-02 03:33:45.479726: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-02 03:33:45.487632: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appro