## Fairness in AI

### Fine-tuning Whisper-Small with AfriSpeech-200 English Accent Data
Apr 16th 2024

Reference: https://huggingface.co/blog/fine-tune-whisper

AfriSpeech-200 dataset: https://huggingface.co/datasets/tobiolatunji/afrispeech-200

### Preparing Environment

In [None]:
# Packages for dataset loading (datasets) and word-error-rate scoring (jiwer, evaluate).
!pip install datasets jiwer
!pip install evaluate



In [None]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install -U accelerate transformers



In [None]:
from transformers import WhisperFeatureExtractor, WhisperProcessor
from transformers import WhisperForConditionalGeneration, WhisperFeatureExtractor
from jiwer import wer
from datasets import load_dataset, DatasetDict
import torch
from evaluate import load
from transformers import WhisperTokenizer
from datasets import Audio, load_dataset
import torch
from transformers import WhisperForConditionalGeneration
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
import librosa
from transformers import WhisperProcessor
from google.colab import drive
# Mount Google Drive at /content/drive (only works inside Google Colab).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import load_dataset, DatasetDict

### Preprocessing AfriSpeech-200 English Accent Data

In [None]:
# Build a two-split DatasetDict from the "english" configuration of AfriSpeech-200.
# NOTE(review): "train" is filled from the dataset's *test* split and "test"
# from its *validation* split -- presumably to keep the experiment small;
# confirm this mapping is intentional.
# trust_remote_code=True opts in explicitly to running the dataset's loading
# code, which `datasets` announces will become mandatory for this dataset.
afrispeech2 = DatasetDict()
afrispeech2['test'] = load_dataset("tobiolatunji/afrispeech-200", "english", split="validation", trust_remote_code=True)
afrispeech2["train"] = load_dataset("tobiolatunji/afrispeech-200", "english", split="test", trust_remote_code=True)
# Drop the metadata columns; only 'audio' and 'transcript' are used below.
afrispeech2 = afrispeech2.remove_columns(['speaker_id', 'path', 'audio_id', 'age_group', 'gender', 'accent', 'domain', 'country', 'duration'])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
# Sanity check: show each split with its columns and row count.
print(afrispeech2)

DatasetDict({
    test: Dataset({
        features: ['audio', 'transcript'],
        num_rows: 44
    })
    train: Dataset({
        features: ['audio', 'transcript'],
        num_rows: 106
    })
})


In [None]:
# Feature extractor of the pretrained checkpoint; prepare_dataset uses it to
# turn raw audio arrays into the model's input features.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [None]:
# Tokenizer configured for English transcription; it encodes transcripts into
# label ids (prepare_dataset) and decodes ids back to text (compute_metrics).
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="en", task="transcribe")

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Round-trip check: encode the first training transcript, decode it again with
# and without special tokens, and confirm the plain decoding equals the input.
input_str = afrispeech2["train"][0]["transcript"]
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

Input:                 Noted to have intermittent snorting sounds with respirations but no upper airway secretions this past shift.
Decoded w/ special:    <|startoftranscript|><|en|><|transcribe|><|notimestamps|>Noted to have intermittent snorting sounds with respirations but no upper airway secretions this past shift.<|endoftext|>
Decoded w/out special: Noted to have intermittent snorting sounds with respirations but no upper airway secretions this past shift.
Are equal:             True


In [None]:
# Processor for the same checkpoint; the data collator reaches the feature
# extractor and tokenizer through processor.feature_extractor / processor.tokenizer.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="en", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Re-inspect the splits before touching the audio column.
print(afrispeech2)

DatasetDict({
    test: Dataset({
        features: ['audio', 'transcript'],
        num_rows: 44
    })
    train: Dataset({
        features: ['audio', 'transcript'],
        num_rows: 106
    })
})


In [None]:
# Declare the "audio" column as 16 kHz audio so examples are served at that rate.
# NOTE(review): the recorded run of this cell ended in a TypeError (cast from
# list<item: float> to struct<bytes, path>) -- possibly the cell was re-run
# after the column had already been transformed; confirm by running the
# notebook top to bottom on a fresh kernel.
afrispeech2 = afrispeech2.cast_column("audio", Audio(sampling_rate=16000))

#print(afrispeech2["train"][0])

TypeError: Couldn't cast array of type
list<item: float>
to
struct<bytes: binary, path: string>

In [None]:
# Show the first training example: decoded audio array, sampling rate, transcript.
print(afrispeech2["train"][0])

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/6252a4255d76ab213f4b25a0809dee5ba40fe1183a493b99d81b712bf6659f22/763f3b6b-f7f9-43ac-8ea0-55d48c802dde/cbd6e3693a746386fe41eb649d144249.wav', 'array': array([ 0.00079287,  0.00117101,  0.00097308, ...,  0.00132559,
       -0.00011966, -0.00118353]), 'sampling_rate': 16000}, 'transcript': 'Noted to have intermittent snorting sounds with respirations but no upper airway secretions this past shift.'}


In [None]:
def prepare_dataset(batch):
    """Convert one raw example into model inputs for fine-tuning.

    Args:
        batch: A single dataset example holding an "audio" entry (a mapping
            with "array" and "sampling_rate") and a "transcript" string.

    Returns:
        The same mapping with two keys added: "input_features" (features
        computed from the audio array) and "labels" (token ids of the
        transcript, unpadded and truncated to at most 448 tokens).
    """
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_features[0]

    # Encode the transcript WITHOUT padding. Padding every example to a fixed
    # length here would leave the data collator's tokenizer.pad() with nothing
    # to pad, so its attention mask would be all ones, no pad token would be
    # replaced by -100, and the padding would count toward the training loss.
    # Per-batch padding is the data collator's job.
    batch["labels"] = tokenizer(batch["transcript"], truncation=True, max_length=448).input_ids
    return batch

# Run prepare_dataset over every example of both splits with 4 worker
# processes, removing the columns listed for the train split so that only the
# newly added "input_features" and "labels" remain.
afrispeech2 = afrispeech2.map(prepare_dataset, remove_columns=afrispeech2.column_names["train"], num_proc=4)

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded training batch.

    Audio features and label ids are padded separately because they have
    different lengths and need different padding methods. Padded label
    positions are set to -100 so the loss ignores them.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Optional id of the token to strip from the front of the labels when every
    # sequence in the batch starts with it. When left as None the tokenizer's
    # `bos_token_id` is used, which is the original behaviour.
    # NOTE(review): for Whisper checkpoints the model's decoder start token may
    # differ from the tokenizer's bos token -- confirm and pass
    # `model.config.decoder_start_token_id` here if so.
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from a list of examples.

        Args:
            features: Examples, each with "input_features" and "labels".

        Returns:
            The padded feature batch with a "labels" tensor added, in which
            padding positions hold -100.
        """
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # pad the tokenized label sequences to the longest one in the batch
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If every sequence already begins with the start token, cut it here
        # because it is appended again later anyway.
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id
        # A tokenizer without a bos token yields None; there is nothing to strip then.
        if start_token_id is not None and bool((labels[:, 0] == start_token_id).all()):
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [None]:
# Collator instance handed to the trainer; it pads features and labels per batch.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
import evaluate

metric = evaluate.load("wer")


def compute_metrics(pred):
    """Score a prediction output with word error rate, as a percentage.

    Args:
        pred: Object exposing `predictions` (predicted token ids) and
            `label_ids` (reference token ids, with -100 at ignored positions).

    Returns:
        A dict with the single key "wer" holding 100 times the metric value.
    """
    predictions, references = pred.predictions, pred.label_ids

    # Swap the -100 sentinel for the pad token id before decoding. This
    # writes into the array held by `pred`.
    ignored = references == -100
    references[ignored] = tokenizer.pad_token_id

    # Decode both sides to plain text, leaving special tokens out.
    decoded_refs = tokenizer.batch_decode(references, skip_special_tokens=True)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    score = metric.compute(predictions=decoded_preds, references=decoded_refs)
    return {"wer": 100 * score}

### Model Training

In [None]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained checkpoint that will be fine-tuned.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# Language and task used when the model generates text during evaluation.
model.generation_config.language = "en"
model.generation_config.task = "transcribe"

# Clear any forced decoder ids stored in the generation config.
# NOTE(review): presumably they would otherwise compete with the
# language/task settings above -- confirm against the transformers docs.
model.generation_config.forced_decoder_ids = None

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters for the fine-tuning run. Training length is set in steps
# (max_steps), with evaluation every 25 steps and a checkpoint every 50.
training_args5 = Seq2SeqTrainingArguments(
    output_dir="/content/",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # no accumulation: one optimizer step per batch
    learning_rate=5e-6,
    warmup_steps=200,  # half of max_steps
    max_steps=400,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=4,
    predict_with_generate=True,  # evaluate on generated text so WER can be computed
    generation_max_length=225,
    save_steps=50,
    eval_steps=25,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,  # "best" is the lowest WER, per the next two settings
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

In [None]:
from transformers import Seq2SeqTrainer

# Wire the model, both dataset splits, the padding collator and the WER metric
# into the trainer.
trainer5 = Seq2SeqTrainer(
    args=training_args5,
    model=model,
    train_dataset=afrispeech2["train"],
    eval_dataset=afrispeech2["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): the feature extractor is passed through the `tokenizer`
    # argument -- presumably so it is saved alongside checkpoints; confirm.
    tokenizer=processor.feature_extractor,
)


In [None]:
# Run fine-tuning; evaluation (loss and WER) is reported at the interval set in training_args5.
trainer5.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
25,1.6649,0.482826,56.714286
50,0.3575,0.204349,55.428571
75,0.2379,0.161095,55.285714
100,0.1572,0.127665,53.285714
125,0.1137,0.113443,53.285714
150,0.0822,0.11029,54.714286
175,0.0541,0.110479,56.714286
200,0.0415,0.109747,54.428571
225,0.0278,0.108413,55.142857
250,0.0241,0.108483,54.714286


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618

TrainOutput(global_step=400, training_loss=0.17692752659320832, metrics={'train_runtime': 2564.6668, 'train_samples_per_second': 2.495, 'train_steps_per_second': 0.156, 'total_flos': 1.74825035513856e+18, 'train_loss': 0.17692752659320832, 'epoch': 57.14})