In [None]:
%%capture
# Install the `leb` helper package from source: clone the repository into the
# working directory, then install the requirements file it ships with.
# Later this will just be 'pip install leb'
# NOTE(review): re-running this cell when ./leb already exists makes the clone
# fail; %%capture hides that message.
!git clone https://github.com/jqug/leb.git
!pip install -r leb/requirements.txt

In [None]:
%%capture
# Third-party packages used by the rest of the notebook.
# NOTE(review): none of these are version-pinned, so a fresh run may resolve to
# different releases than the ones that produced the saved outputs.
!pip install transformers[torch]
!pip install accelerate -U
!pip install jiwer
!pip install omegaconf
!pip install datasets
!pip install sacremoses

In [None]:
from torch import nn
import torch
from transformers import (
    AutoFeatureExtractor,
    AutoModelForCTC,
    AutoProcessor,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    is_apex_available,
    set_seed,
)
from dataclasses import dataclass, field
from typing import Union, List, Dict
import string
import datasets


In [None]:
import leb.dataset
import yaml

# ASR data example

In [None]:
# Declarative spec for the TRAINING data, handed to leb.dataset.create below.
# It names the Hugging Face dataset (Sunbird/salt, config multispeaker-lug,
# split train), a speech source with a set_sample_rate step at 16_000, and a
# text target with lower_case and remove_punctuation steps.
yaml_config = '''
huggingface_load:
    path: Sunbird/salt
    split: train
    name: multispeaker-lug
source:
  type: speech
  language: lug
  preprocessing:
    - set_sample_rate:
        rate: 16_000
target:
  type: text
  language: lug
  preprocessing:
    - lower_case
    - remove_punctuation

'''

# Parse the YAML text into plain Python objects and build the dataset from it.
config = yaml.safe_load(yaml_config)
train_ds = leb.dataset.create(config)

In [None]:
# Declarative spec for the EVALUATION data. Identical to the training spec
# except that it selects the `dev` split. Note that this rebinds the
# `yaml_config` and `config` names used by the training cell.
yaml_config = '''
huggingface_load:
    path: Sunbird/salt
    split: dev
    name: multispeaker-lug
source:
  type: speech
  language: lug
  preprocessing:
    - set_sample_rate:
        rate: 16_000
target:
  type: text
  language: lug
  preprocessing:
    - lower_case
    - remove_punctuation

'''

# Parse the YAML text into plain Python objects and build the dataset from it.
config = yaml.safe_load(yaml_config)
eval_ds = leb.dataset.create(config)

# Preview the first five evaluation examples, rendering `source` as audio.
# NOTE(review): no explicit `import leb.utils` is visible in this notebook;
# presumably it is loaded as a side effect of `import leb.dataset`. Confirm, or
# import it explicitly.
leb.utils.show_dataset(eval_ds.take(5), audio_features=['source'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,source,target
0,Your browser does not support the audio element.,waliwo pulogulaamu nnyingi ezokweggya mu bwavu ezeetooloorera ku byobulimi nobulunzi
1,Your browser does not support the audio element.,bafuna obubaka ku budde bwokusimba enkozesa yebimera ennungi nebiddiria oluvannyuma lwamakungula
2,Your browser does not support the audio element.,enkuyege zifuuse ensonga ennene ennyo mu nnimiro eno
3,Your browser does not support the audio element.,ebikoola byekimera bikwatiddwa obulwadde
4,Your browser does not support the audio element.,ensi yonna eri mu kirwadde bbunansi


In [None]:
# Create dict for vocabulary
def extract_all_chars(batch):
    """Collect the distinct characters used in the ``target`` text of ``batch``.

    Args:
        batch: Mapping with a ``"target"`` entry holding either a single
            transcript (``str``, i.e. one un-batched example) or an iterable
            of transcripts (a batch).

    Returns:
        dict with two keys:
            ``"vocab"``: sorted list of the distinct characters found.
            ``"all_text"``: one-element list containing the transcripts joined
            by single spaces.
    """
    targets = batch["target"]
    if isinstance(targets, str):
        # A bare string is ONE transcript. Joining it directly would iterate
        # over its characters and put a space between each of them, garbling
        # the text and adding a space that the transcript may not contain.
        targets = [targets]

    all_text = " ".join(targets)
    # Sorted so the result does not depend on set iteration order, which for
    # strings varies between interpreter sessions.
    vocab = sorted(set(all_text))
    return {"vocab": vocab, "all_text": [all_text]}

In [None]:
# Gather every character seen in the training transcripts, remembering the
# order of first appearance (a dict is used as an insertion-ordered set).
vocab_dict = {}
for item in train_ds:
    vocab_dict.update(dict.fromkeys(extract_all_chars(item)["vocab"], 1))

# Give each character an integer id equal to its first-seen position.
vocab_list = list(vocab_dict)
vocab_dict = dict(zip(vocab_list, range(len(vocab_list))))

In [None]:
# The tokenizer is created with word_delimiter_token="|", so the word
# boundary is stored under "|" INSTEAD of " ". Moving the id (rather than
# copying it) avoids two tokens sharing one id and keeps the ids contiguous
# when [UNK] and [PAD] are appended.
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")
elif "|" not in vocab_dict:
    # Neither form of the delimiter is present: the corpus contained no
    # spaces, which the rest of the pipeline cannot handle.
    raise KeyError(" ")

# setdefault makes re-running this cell a no-op instead of reassigning ids.
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))

In [None]:
# Nest the character vocabulary under the language code. The same code is
# passed as `target_lang` when the tokenizer is loaded from vocab.json.
target_lang = "lug"
new_vocab_dict = {target_lang: vocab_dict}

In [None]:
# Display the nested vocabulary for a visual sanity check.
new_vocab_dict

{'lug': {'k': 0,
  'b': 1,
  'e': 2,
  'l': 3,
  's': 4,
  'u': 5,
  'i': 6,
  'a': 7,
  'y': 8,
  ' ': 9,
  'g': 10,
  'm': 11,
  'n': 12,
  'r': 13,
  'o': 14,
  'z': 15,
  'd': 16,
  't': 17,
  'w': 18,
  'f': 19,
  'v': 20,
  'j': 21,
  'p': 22,
  'c': 23,
  'h': 24,
  'x': 25,
  '|': 9,
  '[UNK]': 27,
  '[PAD]': 28}}

In [None]:
def prepare_dataset(batch):
    """Add model inputs and CTC labels to a batch of examples.

    Relies on the notebook-level ``processor``, which therefore has to exist
    by the time this function is actually executed.

    Args:
        batch: Mapping with ``"source"`` (audio passed to the processor with
            ``sampling_rate=16000``) and ``"target"`` (transcripts passed to
            the processor as ``text``).

    Returns:
        The same ``batch`` object, with ``"input_values"`` and ``"labels"``
        added.
    """
    audio_features = processor(batch["source"], sampling_rate=16000)
    batch["input_values"] = audio_features.input_values

    text_features = processor(text=batch["target"])
    batch["labels"] = text_features.input_ids

    return batch

In [None]:
# Apply prepare_dataset to the training data, four examples per call.
final_train_dataset = train_ds.map(prepare_dataset, batched=True, batch_size=4)

In [None]:
# Apply prepare_dataset to the evaluation data, four examples per call.
final_val_dataset = eval_ds.map(prepare_dataset, batched=True, batch_size=4)

In [None]:
import json
# Persist the nested vocabulary as vocab.json in the working directory, which
# is where the tokenizer is loaded from ("./").
with open("vocab.json", "w") as vocab_file:
    vocab_file.write(json.dumps(new_vocab_dict))

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate CTC examples into one padded batch.

    Audio inputs and label ids are padded by two separate ``processor.pad``
    calls because they have different lengths and need different padding.
    Padded label positions are then replaced by ``-100``.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor whose ``pad`` method is used for both the audio
            inputs and the labels.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Forwarded unchanged as the ``padding`` argument of both
            ``processor.pad`` calls.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # Separate the two modalities so each can be padded on its own.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(
            audio_items, padding=self.padding, return_tensors="pt"
        )
        padded_labels = self.processor.pad(
            labels=label_items, padding=self.padding, return_tensors="pt"
        )

        # Positions whose attention mask is not 1 are padding; mark them with
        # -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [None]:
# Character-level CTC tokenizer, loaded from the vocab.json in the working
# directory and restricted to the `target_lang` sub-vocabulary.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "./",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
    target_lang=target_lang,
)

# Feature extractor for single-channel 16 kHz audio, with normalisation and
# attention masks switched on.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# Bundle both into the single processor used by prepare_dataset, the data
# collator and compute_metrics.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
# Word-error-rate metric used by compute_metrics.
# trust_remote_code=True is passed explicitly: the warning emitted by this call
# states it will become mandatory for loading this metric in the next major
# `datasets` release.
wer_metric = datasets.load_metric("wer", trust_remote_code=True)

  wer_metric = datasets.load_metric("wer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [None]:
# Collator handed to the Trainer to pad each batch (padding=True is also the
# dataclass default).
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
import numpy as np

def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Uses the notebook-level ``processor`` and ``wer_metric``.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max over the
            last axis is taken as the predicted token id) and ``label_ids``
            (reference ids, with ``-100`` at ignored positions).

    Returns:
        ``{"wer": <value returned by wer_metric.compute>}``.

    Note:
        ``pred.label_ids`` is modified in place: every ``-100`` is overwritten
        with the tokenizer's pad token id so the labels can be decoded.
    """
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    predicted_ids = np.argmax(pred.predictions, axis=-1)
    hypotheses = processor.batch_decode(predicted_ids)
    # References must not have repeated tokens collapsed.
    references = processor.batch_decode(label_ids, group_tokens=False)

    return {
        "wer": wer_metric.compute(predictions=hypotheses, references=references)
    }

In [None]:
# Configuration overrides applied on top of the pretrained checkpoint:
# every dropout/layerdrop rate is 0.0, the CTC loss is averaged, and the pad id
# and vocabulary size are taken from the tokenizer built in this notebook.
mms_ctc_overrides = dict(
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

# ignore_mismatched_sizes=True: presumably required because the new vocab_size
# differs from the checkpoint's output layer (confirm).
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/mms-1b-all",
    ignore_mismatched_sizes=True,
    **mms_ctc_overrides,
)

Some weights of the model checkpoint at facebook/mms-1b-all were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/mms-1b-all and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream

In [None]:
# Prepare the model for adapter-only fine-tuning: turn on gradient
# checkpointing, create the adapter layers, then freeze the base model.
model.gradient_checkpointing_enable()
model.init_adapter_layers()
model.freeze_base_model()

# Mark every tensor returned by model._get_adapters() as trainable again.
adapter_weights = model._get_adapters()
for weight_name in adapter_weights:
    adapter_weights[weight_name].requires_grad = True

In [None]:
training_args = TrainingArguments(
    output_dir="output/mms-lug",
    run_name="mms-lug",
    # group_by_length=True,
    per_device_train_batch_size=2,
    # Training length is bounded by max_steps alone. A positive max_steps
    # overrides num_train_epochs, so no epoch count is given here.
    max_steps=10000,
    gradient_checkpointing=True,
    fp16=True,
    # Optimiser schedule.
    learning_rate=1e-3,
    warmup_steps=100,
    weight_decay=0.01,
    # Evaluate, checkpoint and log on the same 1000-step cadence.
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=1000,
    save_total_limit=2,
    # Reload the checkpoint with the lowest WER once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # push_to_hub=True,
    # report_to="wandb",
)

In [None]:
# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data and batching.
    train_dataset=final_train_dataset,
    eval_dataset=final_val_dataset,
    data_collator=data_collator,
    # Evaluation.
    compute_metrics=compute_metrics,
    # The feature extractor (not the text tokenizer) is what gets passed here.
    tokenizer=processor.feature_extractor,
)

In [None]:
# Run fine-tuning. The table below the cell reports training loss, validation
# loss and WER at each evaluation step.
trainer.train()



Step,Training Loss,Validation Loss,Wer
1000,0.1706,0.270076,0.281981
2000,0.1769,0.249865,0.277854
3000,0.2568,0.258447,0.309491
4000,0.1152,0.236613,0.283356




Step,Training Loss,Validation Loss,Wer
1000,0.1706,0.270076,0.281981
2000,0.1769,0.249865,0.277854
3000,0.2568,0.258447,0.309491
4000,0.1152,0.236613,0.283356
5000,0.2276,0.210257,0.270977
6000,0.1495,0.222611,0.276479
7000,0.1382,0.218585,0.277854


